├── ASIS_Soil_SVM.R ├── GBM_talk_Austin_R_Users_20140724.R ├── README.md ├── bci_challenge ├── README.md ├── createMatFiles_fromR.R ├── primary_model.R └── use_tf_kernel.m ├── loan_default_loss_basics.R ├── modern_logistic_regression ├── README.md ├── fast_solution.py ├── fast_solution_v2.py └── fast_solution_v3.py ├── rain ├── README.md ├── prepareTrain.R └── rain_functions.R ├── rossmann-store-sales └── ACM-data-science-camp.R ├── see_click_predict_fix_basis.R └── will_it_rain └── histogram_starter_model.R /ASIS_Soil_SVM.R: -------------------------------------------------------------------------------- 1 | ## this code borrows the main structure from the forum post regarding beating the benchmark 2 | ## additions to it are mainly: using two layers, updating cost parameters, capping low/high values 3 | ## trimming the range for some targets, and using the [incorrectly calculated] derivatives. 4 | 5 | library(e1071) 6 | ## this is not the true derivative, but it took a while before I caught it, and didn't fix it 7 | ## it is the one-column difference 8 | deriv<-function(a){b<-c(0,a[1:length(a)-1]); a[1]<-0; return(a-b)} 9 | 10 | train <- read.csv("./training.csv",header=TRUE,stringsAsFactors=FALSE) 11 | test <- read.csv("./sorted_test.csv",header=TRUE,stringsAsFactors=FALSE) 12 | submission <- test[,1] 13 | header <- test[,1] 14 | train[,1]<-NULL 15 | test[,1]<-NULL 16 | labels <- train[,c("Ca","P","pH","SOC","Sand")] 17 | 18 | ## running these in loops is not good R programming, but doing it once and exporting to .Rdata, it was simple enough to move on 19 | dTrain<-train[,1:3578]*0 20 | dTest<-test[,1:3578]*0 21 | for(i in 1:nrow(dTrain)){dTrain[i,]<-deriv(t(train[i,1:3578])); if(i%%30==1){print(i)}} 22 | for(i in 1:nrow(dTest)){dTest[i,]<-deriv(t(test[i,1:3578])); if(i%%30==1){print(i)}} 23 | d2Train<-train[,1:3578]*0 24 | d2Test<-test[,1:3578]*0 25 | for(i in 1:nrow(d2Train)){d2Train[i,]<-deriv(t(dTrain[i,1:3578])); if(i%%30==1){print(i)}} 26 | for(i in 1:nrow(d2Test)){d2Test[i,]<-deriv(t(dTest[i,1:3578])); if(i%%30==1){print(i)}} 27 | 28 | ## 10-fold cross-validation, but applied in continuous chunks to help keep grid sections together 29 | ## intro model applied just to infrared columns in the last ~35% 30 | cuts<-rep(0,11) 31 | for(i in 1:10){cuts[i+1]<-round(nrow(train)*i*0.1,0)} 32 | getCaHoldout<-function(col){ 33 | for(fold in 1:10){ 34 | idx<-rep(TRUE,nrow(train)) 35 | idx[(cuts[fold]+1):cuts[fold+1]]<-FALSE 36 | trainLocal<-cbind(train[idx,2200:3578],dTrain[idx,2200:3578]) 37 | testLocal<-cbind(train[idx==FALSE,2200:3578],dTrain[idx==FALSE,2200:3578]) 38 | #train$CaEstimate<-round(labels$Ca[idx],1) 39 | #test$CaEstimate<-round(labels$Ca[idx==FALSE],1) 40 | 41 | svmCa <- svm(trainLocal,labels[idx,col],cost=10000,scale=FALSE) 42 | if(fold==1){p <- predict(svmCa,newdata=testLocal); h <- predict(svmCa,newdata=cbind(test[,2200:3578],dTest[,2200:3578]))} 43 | if(fold>1){p<-c(p,predict(svmCa,newdata=testLocal)); h<-cbind(h,predict(svmCa,newdata=cbind(test[,2200:3578],dTest[,2200:3578])))} 44 | } 45 | return(c(p,rowMeans(h))) 46 | } 47 | reduce1<-getCaHoldout(1) 48 | reduce2<-getCaHoldout(2) 49 | reduce3<-getCaHoldout(3) 50 | reduce4<-getCaHoldout(4) 51 | reduce5<-getCaHoldout(5) 52 | 53 | ##create separate sets for each subproblem 54 | ## Ca: derivatives appear not to help; having more data appears to help, despite significant part of spectrum 55 | ## P: use as much as possible, lenght+derivatives; likely should use holdout predictions from the other elements as well 56 | ## pH: 
tough to improve; do not add derivatives 57 | ## SOC: no derivatives; local points help vs. all 58 | ## Sand: derivatives help; local points help 59 | 60 | #make column usage specific to each target 61 | #align with ordering of labels/targets: labels <- train[,c("Ca","P","pH","SOC","Sand")] 62 | starts<-c(1,1,2200,2200,2300) 63 | ends<-rep(3578,5) 64 | derivStarts<-c(2,2200,2,2,2300) 65 | derivEnds<-c(3,3578,3,3,3578) 66 | #deriv2Starts<-c(2,2,2,2,2900) 67 | #deriv2Ends<-c(3,3,3,3,3250) 68 | 69 | ##add back in the initial predictions from all five models, to all five models; e.g. Ca can see P, pH, SOC, and Sand predictions 70 | i<-1 71 | svmTrains<-list(cbind(train[,starts[i]:ends[i]],dTrain[,derivStarts[i]:derivEnds[i]],reduce1[1:nrow(train)] 72 | ,reduce2[1:nrow(train)],reduce3[1:nrow(train)],reduce4[1:nrow(train)],reduce5[1:nrow(train)])) 73 | svmTests<-list(cbind(test[,starts[i]:ends[i]],dTest[,derivStarts[i]:derivEnds[i]],reduce1[(nrow(train)+1):length(reduce1)] 74 | ,reduce2[(nrow(train)+1):length(reduce2)],reduce3[(nrow(train)+1):length(reduce3)] 75 | ,reduce4[(nrow(train)+1):length(reduce4)],reduce5[(nrow(train)+1):length(reduce5)] 76 | )) 77 | colnames(svmTrains[[i]])[(ncol(svmTrains[[i]])-4):ncol(svmTrains[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 78 | colnames(svmTests[[i]])[(ncol(svmTests[[i]])-4):ncol(svmTests[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 79 | for(i in 2:5){ 80 | svmTrains<-c(svmTrains,list(cbind(train[,starts[i]:ends[i]],dTrain[,derivStarts[i]:derivEnds[i]],reduce1[1:nrow(train)] 81 | ,reduce2[1:nrow(train)],reduce3[1:nrow(train)],reduce4[1:nrow(train)],reduce5[1:nrow(train)]))) 82 | svmTests<-c(svmTests,list(cbind(test[,starts[i]:ends[i]],dTest[,derivStarts[i]:derivEnds[i]],reduce1[(nrow(train)+1):length(reduce1)] 83 | ,reduce2[(nrow(train)+1):length(reduce2)],reduce3[(nrow(train)+1):length(reduce3)] 84 | ,reduce4[(nrow(train)+1):length(reduce4)],reduce5[(nrow(train)+1):length(reduce5)] 85 | ))) 86 | colnames(svmTrains[[i]])[(ncol(svmTrains[[i]])-4):ncol(svmTrains[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 87 | colnames(svmTests[[i]])[(ncol(svmTests[[i]])-4):ncol(svmTests[[i]])]<-c("CaModel","PModel","pHModel","SOCModel","SandModel") 88 | } 89 | 90 | ##last-second add of Depth as binary column 91 | isTopTrain<-as.data.frame(ifelse(train$Depth=="Topsoil",1,0)) 92 | isTopTest<-as.data.frame(ifelse(test$Depth=="Topsoil",1,0)) 93 | colnames(isTopTrain)<-"isTopsoil" 94 | colnames(isTopTest)<-"isTopsoil" 95 | ## separate costs per SVM; these were tuned using cross-validation and roughly grid search 96 | ## adjusting gamma was also suggested by the tuning, but it was incorrectly applied when I used it and I scaled back to just cost 97 | costs<-c(15000,15000,15000,10000,20000) 98 | 99 | AllSvms <- lapply(1:ncol(labels),function(i){svm(cbind(svmTrains[[i]],isTopTrain),labels[,i],cost=costs[i],scale=FALSE)}) 100 | predictions1 <- sapply(1:ncol(labels),function(i){predict(AllSvms[[i]],newdata=cbind(svmTests[[i]],isTopTest))}) 101 | 102 | predictions<-as.data.frame(unlist(predictions1)) 103 | colnames(predictions) <- c("Ca","P","pH","SOC","Sand") 104 | 105 | ## truncate final predictions to be within range of training data; this helped considerably in early modeling 106 | predictions[,1]<-pmax(pmin(predictions[,1],max(labels$Ca)),min(labels$Ca)) 107 | predictions[,2]<-pmax(pmin(predictions[,2],1),min(labels$P)) 108 | predictions[,3]<-pmax(pmin(predictions[,3],max(labels$pH)),min(labels$pH)) 109 | 
predictions[,4]<-pmax(pmin(predictions[,4],max(labels$SOC)),min(labels$SOC)) 110 | predictions[,5]<-pmax(pmin(predictions[,5],max(labels$Sand)),min(labels$Sand)) 111 | 112 | submission <- cbind(PIDN=header,predictions) 113 | write.csv(submission,"submission_20141021a.csv",row.names=FALSE,quote=FALSE) 114 | -------------------------------------------------------------------------------- /GBM_talk_Austin_R_Users_20140724.R: -------------------------------------------------------------------------------- 1 | library(Metrics) ##load evaluation package 2 | setwd("C:/Users/Mark_Landry/Documents/K/dozer/") 3 | ##Done in advance to speed up loading of data set 4 | train<-read.csv("Train.csv") 5 | train$saleTransform<-strptime(train$saledate,"%m/%d/%Y %H:%M") 6 | train<-train[order(train$saleTransform),] 7 | save(train,file="rTrain.Rdata") 8 | 9 | 10 | load("rTrain.Rdata") 11 | xTrain<-train[(nrow(train)-149999):(nrow(train)-50000),5:ncol(train)] 12 | xTest<-train[(nrow(train)-49999):nrow(train),5:ncol(train)] 13 | yTrain<-train[(nrow(train)-149999):(nrow(train)-50000),2] 14 | yTest<-train[(nrow(train)-49999):nrow(train),2] 15 | 16 | dim(xTrain); dim(xTest) 17 | sapply(xTrain,function(x) length(levels(x))) 18 | ## check levels; gbm is robust, but still has a limit of 1024 per factor; for initial model, remove 19 | ## after iterating through model, would want to go back and compress these factors to investigate 20 | ## their usefulness (or other information analysis) 21 | xTrain$saledate<-NULL; xTest$saledate<-NULL 22 | xTrain$fiModelDesc<-NULL; xTest$fiModelDesc<-NULL 23 | xTrain$fiBaseModel<-NULL; xTest$fiBaseModel<-NULL 24 | xTrain$saleTransform<-NULL; xTest$saleTransform<-NULL 25 | 26 | library(gbm) 27 | ## Set up parameters to pass in; there are many more hyper-parameters available, but these are the most common to control 28 | GBM_NTREES = 400 29 | ## 400 trees in the model; can scale back later for predictions, if desired or overfitting is suspected 30 | GBM_SHRINKAGE = 0.05 31 | ## shrinkage is a regularization parameter dictating how fast/aggressive the algorithm moves across the loss gradient 32 | ## 0.05 is somewhat aggressive; default is 0.001, values below 0.1 tend to produce good results 33 | ## decreasing shrinkage generally improves results, but requires more trees, so the two should be adjusted in tandem 34 | GBM_DEPTH = 4 35 | ## depth 4 means each tree will evaluate four decisions; 36 | ## will always yield [3*depth + 1] nodes and [2*depth + 1] terminal nodes (depth 4 = 9) 37 | ## because each decision yields 3 nodes, but each decision will come from a prior node 38 | GBM_MINOBS = 30 39 | ## regularization parameter to dictate how many observations must be present to yield a terminal node 40 | ## higher number means more conservative fit; 30 is fairly high, but good for exploratory fits; default is 10 41 | 42 | ## Fit model 43 | g<-gbm.fit(x=xTrain,y=yTrain,distribution = "gaussian",n.trees = GBM_NTREES,shrinkage = GBM_SHRINKAGE, 44 | interaction.depth = GBM_DEPTH,n.minobsinnode = GBM_MINOBS) 45 | ## gbm fit; provide all remaining independent variables in xTrain; provide targets as yTrain; 46 | ## gaussian distribution will optimize squared loss; 47 | 48 | ## get predictions; first on train set, then on unseen test data 49 | tP1 <- predict.gbm(object = g,newdata = xTrain,GBM_NTREES) 50 | hP1 <- predict.gbm(object = g,newdata = xTest,GBM_NTREES) 51 | 52 | ## compare model performance to default (overall mean) 53 | rmse(yTrain,tP1) ## 9452.742 on data used for training 54 | 
rmse(yTest,hP1) ## 9740.559, ~3% worse on unseen data; the model does not appear overfit
55 | rmse(yTest,mean(yTrain)) ## 24481.08 for the overall mean; the GBM cuts that error (relative to a perfect model) by roughly 60%
56 | 
57 | ## look at variables
58 | summary(g) ## summary plots and prints the relative influence of each variable across the entire GBM model (all trees)
59 | 
60 | ## test dominant variable mean
61 | library(sqldf)
62 | trainProdClass<-as.data.frame(cbind(as.character(xTrain$fiProductClassDesc),yTrain))
63 | testProdClass<-as.data.frame(cbind(as.character(xTest$fiProductClassDesc),yTest))
64 | colnames(trainProdClass)<-c("fiProductClassDesc","y"); colnames(testProdClass)<-c("fiProductClassDesc","y")
65 | ProdClassMeans<-sqldf("SELECT fiProductClassDesc,avg(y) avg, COUNT(*) n FROM trainProdClass GROUP BY fiProductClassDesc")
66 | ProdClassPredictions<-sqldf("SELECT case when n > 30 then avg ELSE 31348.63 end avg
67 | FROM ProdClassMeans P LEFT JOIN testProdClass t ON t.fiProductClassDesc = P.fiProductClassDesc")
68 | rmse(yTest,ProdClassPredictions$avg) ## 29082.64: a peculiar result for the fiProductClassDesc means, which had seemed fairly stable and useful
69 | ## suggests the primary factor alone is not helpful; the full tree ensemble is needed
70 | 
71 | 
72 | ## Investigate actual GBM model
73 | pretty.gbm.tree(g,1) ## show underlying model for the first decision tree
74 | summary(xTrain[,10]) ## the underlying model showed variable 9 as the first split point (0-indexed 9 = 10th column)
75 | g$initF ## view what is effectively the "y intercept"
76 | mean(yTrain) ## equivalence shows the gaussian y intercept is the mean
77 | t(g$c.splits[1][[1]]) ## show whether each factor level should go left or right
78 | plot(g,10) ## plot fiProductClassDesc, the variable with the highest rel.inf
79 | plot(g,3) ## plot YearMade, continuous variable with 2nd highest rel.inf
80 | interact.gbm(g,xTrain,c(10,3))
81 | ## compute Friedman's H statistic to show interaction strength; it compares the joint partial dependence of the two variables with the sum of their individual effects
82 | interact.gbm(g,xTrain,c(10,3))
83 | ## example of an uninteresting interaction
84 | 
85 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Public Kaggle Code and Info
2 | ======
3 | 
4 | I will try to post code when I wind up close enough to the top that my models might be slightly interesting.
5 | 
6 | Unfortunately, most of my development happens in fragments, often split between SQL and R, so I'll do my best to post the key functions along with enough driver code to get the point across.
7 | 
--------------------------------------------------------------------------------
/bci_challenge/README.md:
--------------------------------------------------------------------------------
1 | Upcoming notes regarding the Kaggle BCI Challenge.
2 | -------------------------------------------------------------------------------- /bci_challenge/createMatFiles_fromR.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(R.matlab) 3 | personList<-c("S02","S06","S07","S11","S12","S13","S14","S16","S17","S18","S20","S21","S22","S23","S24","S26") 4 | sessionList<-c("01","02","03","04","05") 5 | for(i in 1:length(personList)){ 6 | for(j in 1:length(sessionList)){ 7 | fileName<-paste0("train/Data_",personList[i],"_Sess",sessionList[j]) 8 | x<-as.matrix(fread(paste0(fileName,".csv"))) 9 | writeMat(paste0(fileName,".mat"),x=x) 10 | } 11 | } 12 | 13 | library(data.table) 14 | library(R.matlab) 15 | personList<-c("S01","S03","S04","S05","S08","S09","S10","S15","S19","S25") 16 | sessionList<-c("01","02","03","04","05") 17 | for(i in 1:length(personList)){ 18 | for(j in 1:length(sessionList)){ 19 | fileName<-paste0("train/Data_",personList[i],"_Sess",sessionList[j]) 20 | x<-as.matrix(fread(paste0(fileName,".csv"))) 21 | writeMat(paste0(fileName,".mat"),x=x) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /bci_challenge/primary_model.R: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | ## Function Definitions 3 | ## these are very hard-coded to work with this data set 4 | ## simply here to save typing, not much for modularity/flexibility 5 | ################################################################### 6 | 7 | getOneSec<-function(person,session){ 8 | ##fread a given person ID and session ID (character IDs, not integers) 9 | ##find all feedback events, get 1.000 seconds that follow 10 | ##retain the Cz and EOG channels 11 | ##returns data.frame with session, person, feedback id, Cz and EOG channel data 12 | library(data.table) 13 | fileName<-paste0("train/Data_",person,"_Sess",session,".csv") 14 | a<-fread(fileName) 15 | a[,rowId:=seq(1:nrow(a))] 16 | feedBacks<-a[FeedBackEvent==1,rowId] 17 | feedBackNum<-1 18 | returnSet<-a[feedBacks[feedBackNum]:(feedBacks[feedBackNum]+200),list(Cz,EOG,feedBackNum)] 19 | for(feedBackNum in 2:length(feedBacks)){returnSet<-rbind(returnSet,a[feedBacks[feedBackNum]:(feedBacks[feedBackNum]+200),list(Cz,EOG,feedBackNum)])} 20 | returnSet[,Person:=person] 21 | returnSet[,Session:=session] 22 | returnSet[,IdFeedBack:=paste0(Person,"_Sess",Session,"_FB",ifelse(feedBackNum<10,"00",ifelse(feedBackNum<100,"0","")),feedBackNum)] 23 | return(as.data.frame(returnSet)) 24 | } 25 | 26 | getMovingAvgs<-function(vec){ 27 | ## originally picked specific vectors, but after comparing; always used p15 28 | globalMin<-min(vec) 29 | globalMax<-max(vec) 30 | scaled<-(globalMax-vec)/(globalMax-globalMin) 31 | p5<-rep(0,186) 32 | p15<-rep(0,186) 33 | for(i in 1:186){ 34 | p5[i]<-mean(scaled[(i+10):(i+14)]) 35 | p15[i]<-mean(scaled[i:(i+14)]) 36 | } 37 | return(as.data.frame(cbind(p5,p15))) 38 | } 39 | 40 | fitGbm<-function(x,y,outType="auc",trees=100,shrink=0.05,no=10,depth=4,cvInt=1){ 41 | ##runs a gbm and passes back either the score, the fit object, or a vector of predictions 42 | ## also performs very specifically leave-one-out CV 43 | library(gbm) 44 | library(Metrics) 45 | if(cvInt>=0){ 46 | idx<-rep(FALSE,nrow(x)) 47 | idx[(1+(cvInt-1)*340):(cvInt*340)]<-TRUE 48 | x1<-x[idx==FALSE,] 49 | y1<-y[idx==FALSE] 50 | x2<-x[idx,] 51 | y2<-y[idx] 52 | } 53 | if(cvInt<0){x1<-x; x2<-x; y1<-y; y2<-y} 54 | 
fit<-gbm.fit(x=x1,y=y1,distribution="adaboost",n.trees=trees,shrinkage=shrink,n.minobsinnode=no,interaction.depth=depth) 55 | p<-predict(fit,newdata=x2,n.trees=trees,type="response") 56 | if(outType=="auc"){print(summary(fit,plotit=FALSE)); return(auc(y2,p))} 57 | if(outType=="predictions"){return(p)} 58 | if(outType=="model"){return(fit)} 59 | } 60 | 61 | prepareFeatures<-function(personSession,depth=60,vec=1){ 62 | ## given session data, calculate some hand-crafted features that seemed 63 | ## interesting when looking at plots 64 | ## stated AUCs are against the training set, in its entirety, I believe; so probably higher than what would translate to new data 65 | for(i in 1:depth){ 66 | b<-getMovingAvgs(personSession[(1+((i-1)*201)):(i*201),vec]) 67 | tmp<-cbind( 68 | sum(b[1:92,2])-sum(b[93:184,2]) ##auc 0.625 69 | ,findMin(b[,2],20,70) ##auc 0.604 70 | ,findDepthToMinDrop(b[,2],20,70) ##auc 0.5809985 71 | ,findDepthAfterMinDrop(b[,2],20,70) ##auc 0.5875286 72 | ,findDepthAfterMinDrop(b[,2],90,170) ##auc 0.5737571 73 | ,b[nrow(b),2]-b[1,2] ##auc 0.5654925 74 | ,sd(b[,2]) ##auc 0.55485 (1-auc) 75 | ,getVolatility(b[,2],110,160) ##auc 0.5299781 76 | ,sd(b[20:70,2])/getVolatility(b[,2],20,70) ##auc 0.5608 77 | ,personSession[(1+((i-1)*201)),3] 78 | ,personSession[(1+((i-1)*201)),5] 79 | ,t(b[1:184,2]) 80 | ,min(personSession[(1+((i-1)*201)),vec]) 81 | ,max(personSession[(1+((i-1)*201)),vec]) 82 | ,max(personSession[(1+((i-1)*201)),vec])-min(personSession[(1+((i-1)*201)),vec]) 83 | ) 84 | if(i==1){x<-tmp} 85 | if(i>1){x<-rbind(x,tmp)} 86 | } 87 | return(x) 88 | } 89 | 90 | findMin<-function(vec,cutMin=1,cutMax=10000){ 91 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 92 | id<-seq(1:length(vec)) 93 | return(id[vec==min(vec)]) 94 | } 95 | 96 | findDepthToMinDrop<-function(vec,cutMin=1,cutMax=10000){ 97 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 98 | id<-seq(1:length(vec)) 99 | end<-id[vec==min(vec)] 100 | return(max(vec[pmax(1,end-30):end])-vec[end]) 101 | } 102 | 103 | findDepthAfterMinDrop<-function(vec,cutMin=1,cutMax=10000){ 104 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 105 | id<-seq(1:length(vec)) 106 | end<-id[vec==min(vec)] 107 | return(max(vec[pmin(length(vec),end+30):end])-vec[end]) 108 | } 109 | 110 | getVolatility<-function(vec,cutMin=1,cutMax=10000){ 111 | vec<-vec[cutMin:(pmin(length(vec),cutMax))] 112 | return(sd(vec[2:length(vec)]-vec[1:(length(vec)-1)])) 113 | } 114 | 115 | 116 | getFeedBacks<-function(person,session,cap=100){ 117 | ##fread a given person ID and session ID (character IDs, not integers) 118 | ##return offsets of the FeedBackEvents 119 | library(data.table) 120 | fileName<-paste0("train/Data_",person,"_Sess",session,".csv") 121 | a<-fread(fileName) 122 | a[,rowId:=seq(1:nrow(a))] 123 | feedBacks<-a[FeedBackEvent==1,rowId] 124 | return(feedBacks[1:pmin(cap,length(feedBacks))]) 125 | } 126 | 127 | createMagic<-function(v,cutoff){ 128 | ## take in a vector of feedback differences 129 | ## produce a vector where negative means likely wrong; positive is likely right 130 | ## and the values are scaled by the implied accuracy rate from the feedback timings 131 | magic<-ifelse(v>=70,0,ifelse(v>cutoff,-1,1)) 132 | net<-sum(magic) 133 | scalar<-ifelse(net>75,4, 134 | ifelse(net>56,3, 135 | ifelse(net>12,2,1))) 136 | magic<-magic*scalar 137 | return(magic) 138 | } 139 | 140 | 141 | 142 | ############################### 143 | ## Create Train Data Frame 144 | ############################### 145 | 146 | ##get main features: 11 hand-crafted and higher-level features 
(subject/session) plus ~180 15-period averaged min/max scaled Cz signals 147 | ##currently set to use vector 1 (Cz); later used 2 (EOG), but minimal improvement 148 | personList<-c("S02","S06","S07","S11","S12","S13","S14","S16","S17","S18","S20","S21","S22","S23","S24","S26") 149 | sessionList<-c("01","02","03","04","05") 150 | depthList<-c(60,60,60,60,100) 151 | for(i in 1:length(personList)){ 152 | for(j in 1:length(sessionList)){ 153 | if(i*j==1){xAll<-prepareFeatures(getOneSec(personList[i],sessionList[j]),depthList[j],1)} 154 | if(i*j>1){xAll<-rbind(xAll,prepareFeatures(getOneSec(personList[i],sessionList[j]),depthList[j],1))} 155 | print(dim(xAll)) 156 | } 157 | } 158 | 159 | 160 | ## use coefficients of PLS model associating EOG with features from Matlab time-frequency kernel 161 | for(i in 1:length(personList)){ 162 | for(j in 1:length(sessionList)){ 163 | fileBase<-paste0("pls/Data_",personList[i],"_Sess",sessionList[j],"_pls.csv") 164 | if(i*j==1){plsAll<-read.csv(fileBase,header=FALSE)} 165 | if(i*j>1){plsAll<-rbind(plsAll,read.csv(fileBase,header=FALSE))} 166 | } 167 | } 168 | 169 | ## get timings between subjects 170 | for(i in 1:length(personList)){ 171 | for(j in 1:length(sessionList)){ 172 | if(i*j==1){xFB<-getFeedBacks(personList[i],sessionList[j],depthList[j])} 173 | if(i*j>1){xFB<-c(xFB,getFeedBacks(personList[i],sessionList[j],depthList[j]))} 174 | } 175 | print(i) 176 | } 177 | xFBDiff<-pmax(0,c(0,xFB[2:length(xFB)]-xFB[1:(length(xFB)-1)])) 178 | xFBDiff[xFBDiff==0]<-4500 179 | xFBDiff<-round(xFBDiff/50,0) 180 | 181 | trainAll<-cbind(xAll,plsAll) 182 | trainAll$xFBDiff<-xFBDiff 183 | 184 | ##count up the zones for spelling 185 | #spellCheck<-c(rep(65,340),rep(93,340),rep(88,340),rep(62,340),rep(60,340),rep(50,340),rep(79,340),rep(63,340),rep(56,340),rep(70,340), 186 | # rep(62,340),rep(88,340),rep(94,340),rep(73,340),rep(68,340),rep(73,340)) 187 | trainAll$spellCheck<-rep(0,nrow(trainAll)) #spellCheck 188 | delay<-rep(0,nrow(trainAll)) 189 | ## attempt to encode not the straight numbers from spellCheck, but a great/good/average/bad speller type encoding 190 | ## the intent is to scale to unseen data (test has some really bad ones) 191 | for(i in 1:15){delay[(241+(i-1)*340):(i*340)]<-trainAll$spellCheck[(241+(i-1)*340):(i*340)]*10000+round(delay[(242+(i-1)*340):(1+i*340)]/4,0)} 192 | delay[5341:5439]<-trainAll$spellCheck[5341:5439]*10000+round(delay[5342:5440]/4,0) ##for i = 16 193 | delay<-as.factor(as.character(delay)) 194 | 195 | x2<-as.data.frame(unlist(sapply(trainAll,function(x) as.numeric(as.character(x))))) 196 | x2$delay<-delay 197 | 198 | trainSess5<-c(0,1600) 199 | for(i in 1:16){trainSess5[(1+(i-1)*100):(i*100)]<-xFBDiff[(241+(i-1)*340):(i*340)]} 200 | for(i in 1:length(trainSess5)){if(i%%5==1) trainSess5[i]<-trainSess5[i]-20} 201 | 202 | xMagic<-rep(0,nrow(x2)) 203 | xSingleMagic<-rep(0,nrow(x2)) 204 | adjFbDiff<-x2$xFBDiff 205 | 206 | trainCuts<-c( 37, 40,40, 42,42,44,44, 45,46,47, 47,47,48,51,52, 52) 207 | for(i in 1:16){ 208 | m1<-createMagic(trainSess5[(1+(i-1)*100):(i*100)],trainCuts[i]) 209 | singleMagic<-pmax(5,round(sum(pmax(0,m1)/max(m1))/10,0)) 210 | xMagic[(241+(i-1)*340):(i*340)]<-m1 211 | xSingleMagic[(1+(i-1)*340):(i*340)]<-singleMagic 212 | adjFbDiff[(241+(i-1)*340):(i*340)]<-0 213 | } 214 | 215 | x2$xMagic<-as.factor(xMagic) 216 | x2$xSingleMagic<-xSingleMagic 217 | x2$xxAdjFBDiff<-adjFbDiff 218 | 219 | ## fix alignment problem 220 | x2$xMagic[1:5439]<-x2$xMagic[2:5440] 221 | 222 | 223 | ## add EOG/Cz differences 224 | maxLoop<-nrow(xAll) 
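## (hedged sketch, not part of the original run: the same per-row min/max scaling and
##  EOG-Cz differences built by the two loops below can be vectorized; this assumes xAll
##  holds the EOG-based features and x2 the Cz-based features in columns 12:190, as above,
##  and the helper/object names here are illustrative only)
# rowScale <- function(m) t(apply(m, 1, function(v) (v - min(v)) / (max(v) - min(v))))
# eogS <- rowScale(as.matrix(xAll[, 12:190]))
# czS  <- rowScale(as.matrix(x2[, 12:190]))
# scalars <- 1 - apply(eogS - czS, 1, max) / 100   ## per-row damping factor, as in the loop
# diff2Alt <- czS - eogS * scalars                 ## should match diff2 built below
# straightDiffsAlt <- eogS - czS                   ## should match straightDiffs built below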
225 | for(i in 1:maxLoop){ 226 | a<-as.numeric(xAll[i,12:190]) ##eog 227 | b<-as.numeric(x2[i,12:190]) ##cz 228 | a<-(a-min(a))/(max(a)-min(a)) ##eog 229 | b<-(b-min(b))/(max(b)-min(b)) ##cz 230 | diffs<-a-b ##eog-cz 231 | scalar<-1-(max(diffs)/100) 232 | if(i==1){diff2<-t(as.data.frame(b-(a*scalar)))} 233 | if(i>1){diff2<-rbind(diff2,t(as.data.frame(b-(a*scalar))))} 234 | } 235 | 236 | maxLoop<-nrow(xAll) 237 | for(i in 1:maxLoop){ 238 | a<-as.numeric(xAll[i,12:190]) ##eog 239 | b<-as.numeric(x2[i,12:190]) ##cz 240 | a<-(a-min(a))/(max(a)-min(a)) ##eog 241 | b<-(b-min(b))/(max(b)-min(b)) ##cz 242 | straightDiff<-a-b 243 | if(i==1){straightDiffs<-t(as.data.frame(straightDiff))} 244 | if(i>1){straightDiffs<-rbind(straightDiffs,t(as.data.frame(straightDiff)))} 245 | } 246 | 247 | 248 | ## add new features 249 | x4<-cbind(x2,diff2) 250 | 251 | diffDf<-as.data.frame(straightDiffs); colnames(diffDf)<-paste0("diff",colnames(diffDf)); rownames(diffDf)<-NULL 252 | x6<-cbind(x4,diffDf) 253 | 254 | 255 | ############################### 256 | ## Create Test Data Frame 257 | ############################### 258 | 259 | testPersonList<-c("S01","S03","S04","S05","S08","S09","S10","S15","S19","S25") 260 | sessionList<-c("01","02","03","04","05") 261 | depthList<-c(60,60,60,60,100) 262 | for(i in 1:length(testPersonList)){ 263 | for(j in 1:length(sessionList)){ 264 | if(i*j==1){xAllTest<-prepareFeatures(getOneSec(testPersonList[i],sessionList[j]),depthList[j])} 265 | if(i*j>1){xAllTest<-rbind(xAllTest,prepareFeatures(getOneSec(testPersonList[i],sessionList[j]),depthList[j]))} 266 | print(dim(xAllTest)) 267 | } 268 | } 269 | 270 | 271 | for(i in 1:length(testPersonList)){ 272 | for(j in 1:length(sessionList)){ 273 | fileBase<-paste0("pls/Data_",testPersonList[i],"_Sess",sessionList[j],"_pls.csv") 274 | if(i*j==1){plsAllTest<-read.csv(fileBase,header=FALSE)} 275 | if(i*j>1){plsAllTest<-rbind(plsAllTest,read.csv(fileBase,header=FALSE))} 276 | } 277 | } 278 | 279 | 280 | for(i in 1:length(testPersonList)){ 281 | for(j in 1:length(sessionList)){ 282 | if(i*j==1){xFBTest<-getFeedBacks(testPersonList[i],sessionList[j],depthList[j])} 283 | if(i*j>1){xFBTest<-c(xFBTest,getFeedBacks(testPersonList[i],sessionList[j],depthList[j]))} 284 | } 285 | print(i) 286 | } 287 | xTestFBDiff<-pmax(0,c(0,xFBTest[2:length(xFBTest)]-xFBTest[1:(length(xFBTest)-1)])) 288 | xTestFBDiff[xTestFBDiff==0]<-4500 289 | xTestFBDiff<-round(xTestFBDiff/50,0) 290 | 291 | testAll<-cbind(xAllTest,plsAllTest) 292 | testAll$xTestFBDiff<-xTestFBDiff 293 | 294 | ## isolate all session5's for analysis 295 | testSess5<-c(0,1000) 296 | for(i in 1:10){testSess5[(1+(i-1)*100):(i*100)]<-xTestFBDiff[(241+(i-1)*340):(i*340)]} 297 | for(i in 1:length(testSess5)){if(i%%5==1) testSess5[i]<-testSess5[i]-20} 298 | 299 | testAll$spellCheck<-0 300 | x3<-as.data.frame(unlist(sapply(testAll,function(x) as.numeric(as.character(x))))) 301 | x3$delay<-x2$delay[1:nrow(x3)] 302 | colnames(x3)<-colnames(x2)[1:ncol(x3)] 303 | 304 | testCuts<-c( 36, 38,39,39, 40,41,42, 44, 47, 51 ) 305 | xMagicTest<-rep(0,nrow(x3)) 306 | xSingleMagicTest<-rep(0,nrow(x3)) 307 | adjFbDiffTest<-x3$xTestFBDiff 308 | 309 | for(i in 1:10){ 310 | m1<-createMagic(testSess5[(1+(i-1)*100):(i*100)],testCuts[i]) 311 | singleMagic<-pmax(5,round(sum(pmax(0,m1)/max(m1))/10,0)) 312 | xMagicTest[(241+(i-1)*340):(i*340)]<-m1 313 | xSingleMagicTest[(1+(i-1)*340):(i*340)]<-singleMagic 314 | adjFbDiffTest[(241+(i-1)*340):(i*340)]<-0 315 | } 316 | 317 | x3$xMagic<-as.factor(xMagicTest) 318 | 
x3$xSingleMagic<-xSingleMagicTest 319 | x3$xxAdjFBDiff<-adjFbDiffTest 320 | 321 | ## fix alignment problem 322 | x3$xMagic[1:3399]<-x3$xMagic[2:3400] 323 | 324 | maxLoop<-nrow(x3) 325 | for(i in 1:maxLoop){ 326 | a<-as.numeric(x3[i,12:190]) ##eog 327 | b<-as.numeric(x3[i,12:190]) ##cz 328 | a<-(a-min(a))/(max(a)-min(a)) ##eog 329 | b<-(b-min(b))/(max(b)-min(b)) ##cz 330 | diffs<-a-b ##eog-cz 331 | scalar<-1-(max(diffs)/100) 332 | if(i==1){diff2Test<-t(as.data.frame(b-(a*scalar)))} 333 | if(i>1){diff2Test<-rbind(diff2Test,t(as.data.frame(b-(a*scalar))))} 334 | } 335 | 336 | maxLoop<-nrow(x3) 337 | for(i in 1:maxLoop){ 338 | a<-as.numeric(x3[i,12:190]) ##eog 339 | b<-as.numeric(x3[i,12:190]) ##cz 340 | a<-(a-min(a))/(max(a)-min(a)) ##eog 341 | b<-(b-min(b))/(max(b)-min(b)) ##cz 342 | straightDiff<-a-b 343 | if(i==1){straightDiffsTest<-t(as.data.frame(straightDiff))} 344 | if(i>1){straightDiffsTest<-rbind(straightDiffsTest,t(as.data.frame(straightDiff)))} 345 | } 346 | 347 | x5<-cbind(x3,diff2Test) 348 | diffDfTest<-as.data.frame(straightDiffsTest); colnames(diffDfTest)<-paste0("diff",colnames(diffDfTest)); rownames(diffDfTest)<-NULL 349 | x7<-cbind(x5,diffDfTest) 350 | 351 | 352 | ############################################# 353 | ## Test Training by running 354 | ## leave-one-out (LOO) validation 355 | ## but waiting to check AUC against 356 | ## the overall set of points, as 357 | ## that will be different than averaging 358 | ## each subject's AUC 359 | ## Most testing was done on smaller GBMs (100/0.1 vs 1000/0.01) 360 | ############################################# 361 | 362 | l<-read.csv("TrainLabels.csv") 363 | y<-l[,2] 364 | 365 | cols<-c(1:11,77:88,132,125:128,64:66,120,165,199:205,402:404) 366 | 367 | s1<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,1) 368 | s2<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,2) 369 | s3<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,3) 370 | s4<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,4) 371 | s5<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,5) 372 | s6<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,6) 373 | s7<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,7) 374 | s8<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,8) 375 | s9<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,9) 376 | s10<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,10) 377 | s11<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,11) 378 | s12<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,12) 379 | s13<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,13) 380 | s14<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,14) 381 | s15<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,15) 382 | s16<-fitGbm(x6[,cols],y,"predictions",100,0.1,10,10,16) 383 | 384 | sAll1k<-c(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16) 385 | library(Metrics) 386 | auc(y,sAll1k) 387 | 388 | 389 | ################################## 390 | ## Generate submission 391 | ################################## 392 | fullFit<-fitGbm(x6[,cols],y,"model",1000,0.01,10,10,-1) 393 | summary(fullFit,plotit=FALSE)[1:30,] 394 | pFinal<-predict(fullFit,newdata=x7[,cols],n.trees=1000,type="response") 395 | plot(pFinal) 396 | z<-read.csv("SampleSubmission.csv") 397 | z[,2]<-pFinal 398 | 399 | ################################ 400 | ## Post Processing 401 | ## 402 | ## The encoding for subject accuracy 403 | ## is a conservative one, to ensure 404 | ## it would scale to new subjects. 405 | ## This model alone was good for 3rd 406 | ## place. 
But it jumped ~0.04 in AUC 407 | ## to bump up and down all points for 408 | ## a single subject, based on their 409 | ## rate of abnormal lenghts. 410 | 411 | z[(1+0*340):(1*340),2]<-z[(1+0*340):(1*340),2]-0 412 | z[(1+1*340):(2*340),2]<-z[(1+1*340):(2*340),2]-0.05 413 | z[(1+2*340):(3*340),2]<-z[(1+2*340):(3*340),2]-0 414 | z[(1+3*340):(4*340),2]<-z[(1+3*340):(4*340),2]-0.06 415 | z[(1+4*340):(5*340),2]<-z[(1+4*340):(5*340),2]-0 416 | z[(1+5*340):(6*340),2]<-z[(1+5*340):(6*340),2]+0.01 417 | z[(1+6*340):(7*340),2]<-z[(1+6*340):(7*340),2]+0.02 418 | z[(1+7*340):(8*340),2]<-z[(1+7*340):(8*340),2]+0.02 419 | z[(1+8*340):(9*340),2]<-z[(1+8*340):(9*340),2]-0 420 | z[(1+9*340):(10*340),2]<-z[(1+9*340):(10*340),2]-0.275 421 | write.csv(z,"bci_submission.csv",row.names=FALSE,quote=FALSE) 422 | 423 | 424 | ## Table of rate of normal lengths vs GBM mean 425 | ## 1 0.67 0.64530 426 | ## 2 0.41 0.64200 427 | ## 3 0.83 0.69560 428 | ## 4 0.31 0.65300 429 | ## 5 0.68 0.71900 430 | ## 6 0.79 0.66480 431 | ## 7 0.92 0.93950 432 | ## 8 0.91 0.94390 433 | ## 9 0.64 0.69590 434 | ## 10 0.54 0.65420 435 | -------------------------------------------------------------------------------- /bci_challenge/use_tf_kernel.m: -------------------------------------------------------------------------------- 1 | % The purpose of this script is as follows: 2 | % 1) Used for EEG Kaggle Competition 3 | % 2) Data is then processed into "feature" sets using the B-J 4 | % time-frequency kernel. 5 | % 3) Once the "feature" sets have been extracted, the "features" are 6 | % processed using a time-series model or decomposed into principal components. 7 | % 8 | % File Version History: 9 | % 01/27/2015 Original version written by Robert Chong 10 | % 02/11/2015 Starting to work with the team 11 | % 02/20/2015 adapting slightly to take in R's as.matrix .mat files 12 | 13 | % House keeping 14 | clear all; clc; close all 15 | 16 | 17 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 18 | % Set paths that contain m-files 19 | %addpath(genpath('/Users/rchong/Documents/MATLAB/')) % Path adds the time-frequency toolboxes 20 | %addpath(genpath('/Users/rchong/Downloads/BCI_Challenge/')) % Path adds the current script and data 21 | addpath(genpath('/home/mark/Documents/bci')) 22 | %addpath(genpath('/home/mark/Documents/bci/tfdn')) 23 | %addpath(genpath('/home/mark/Documents/bci/NaN')) 24 | %addpath(genpath('/home/mark/Documents/bci/statistics')) 25 | 26 | %% loop through each person and session; to speed this up, I broke this list into four pieces and 27 | %% had separate sessions working on each piece; if you do that you need to adjust the personList and k length 28 | personList = ["S01","S02","S03","S04","S05","S06","S07","S08","S09","S10","S11","S12","S13","S14","S15","S16","S17","S18","S19","S20","S21","S22","S23","S24","S25","S26"]; 29 | sessionList= ["01","02","03","04","05"]; 30 | 31 | for k = 1:26 32 | for j = 1:5 33 | fileBase = strcat('train/Data_',personList((1+(k-1)*3):k*3),'_Sess',sessionList((1+(j-1)*2):j*2)); 34 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 35 | % Load dataset & Assign Variable names 36 | load(strcat(fileBase,'.mat')); 37 | %tsFile = csvread('/Users/rchong/Downloads/BCI_Challenge/train/Data_S02_Sess01.csv') 38 | 39 | % Find all the index where FeedBackEvent equals 1 40 | idx = find(x(1:length(x),59)); 41 | 42 | for i = 1:length(idx) 43 | idxPos = idx(i); 44 | data = x(idxPos:(idxPos+260),30); 45 | 
%30 is the column for Cz; eventually loop through the columns of choice, if time 46 | 47 | fSample = 200; 48 | windowSize = 1000; 49 | alpha = 0.27; 50 | 51 | %tfr_AF3 = rgk(data,alpha); 52 | tfr_AF3 = bintfd(data,windowSize,'Hamming','Analytic',0,'PosOnly','NoMex'); 53 | tfrRGK = tfr_AF3; 54 | 55 | % Time 56 | TAtimeMargRGK = t_margin(tfrRGK); % tfdn-toolbox -- not divided [m x n] 57 | TAtimeMargRGK = TAtimeMargRGK'; 58 | 59 | % Marginals 60 | % Frequency 61 | TAfreqMargRGK = f_margin(tfrRGK); % tfdn-toolbox -- not divided [m x n] 62 | 63 | % Run PLS Modeling here 64 | y = zscore(x(idxPos:idxPos+260,58)); 65 | X = zscore(tfrRGK'); 66 | [XL,yl,XS,YS,beta,PCTVAR] = plsregress(X,y,140); 67 | 68 | % save the top 200 PLS components; continually append in the loop for later writing to CSV 69 | if(i==1) 70 | outVec = rot90(beta(1:200)); 71 | else 72 | outVec = vertcat(outVec,rot90(beta(1:200))); 73 | endif 74 | end 75 | % write out the matrix of all PLS components for use later (in R, or anywhere) 76 | csvwrite(strcat(fileBase,'_pls.csv'),outVec); 77 | strcat('Done with: ',fileBase) 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /loan_default_loss_basics.R: -------------------------------------------------------------------------------- 1 | ## Posting the basics of our loss model. 2 | ## R objects were often saved to speed the process up, so currently the pieces are being put in, 3 | ## without any cohesion to get the content in one place. 4 | 5 | gbmCv<-function(xTrain,yTrain,n=10,trees=500,shrink=0.05,depth=6,minobs=30,dist="laplace"){ 6 | require(gbm,Metrics) 7 | t<-seq(1:nrow(xTrain)) 8 | for(i in (1:n)){ 9 | xH<-xTrain[t%%n==(i-1),] 10 | xT<-xTrain[t%%n!=(i-1),] 11 | yH<-yTrain[t%%n==(i-1)] 12 | yT<-yTrain[t%%n!=(i-1)] 13 | GBM_model <- gbm.fit(x=xT,y=yT,distribution=dist,n.trees=trees,shrinkage=shrink,interaction.depth=depth,n.minobsinnode=minobs,verbose=FALSE) 14 | pT<-predict.gbm(object=GBM_model,newdata=xT,trees) 15 | pH<-predict.gbm(object=GBM_model,newdata=xH,trees) 16 | if(i==1){cvDf<-as.data.frame(cbind(i,mae(yT,pT),mae(yH,pH)))} 17 | else{cvDf<-rbind(cvDf,as.data.frame(cbind(i,mae(yT,pT),mae(yH,pH))))} 18 | } 19 | return(cvDf) 20 | } 21 | 22 | gbmDualCv<-function(xTrain,yTrain,n=10,trees=500,shrink=0.05,depth=6,minobs=30){ 23 | require(gbm,Metrics) 24 | t<-seq(1:nrow(final_x)) 25 | for(i in (1:n)){ 26 | xH<-final_x[t%%n==(i-1),] 27 | xT<-final_x[t%%n!=(i-1),] 28 | yH<-y[t%%n==(i-1)] 29 | yT<-y[t%%n!=(i-1)] 30 | GBM_laplace <- gbm.fit(x=xT,y=((yT)^0.5),distribution="laplace",n.trees=trees,shrinkage=0.05,interaction.depth=depth,n.minobsinnode=30,verbose=FALSE) 31 | GBM_gauss <- gbm.fit(x=xT,y=(yT^0.5),distribution="gaussian",n.trees=trees,shrinkage=0.05,interaction.depth=depth,n.minobsinnode=30,verbose=FALSE) 32 | pT<-predict.gbm(object=GBM_laplace,newdata=xT,trees) 33 | pH<-predict.gbm(object=GBM_laplace,newdata=xH,trees) 34 | pTg<-predict.gbm(object=GBM_gauss,newdata=xT,trees) 35 | pHg<-predict.gbm(object=GBM_gauss,newdata=xH,trees) 36 | 37 | if(i==1){cvDf<-as.data.frame(cbind(i, 38 | mae(yT,pmax(0,pT^2)), 39 | mae(yH,pmax(0,pH^2)), 40 | mae(yT,pmax(0,pTg^2)), 41 | mae(yH,pmax(0,pHg^2)), 42 | mae(yH,pmax(0,(pH^2+pHg^2)/2))))} 43 | else{cvDf<-rbind(cvDf,as.data.frame(cbind(i, 44 | mae(yT,pmax(0,pT^2)), 45 | mae(yH,pmax(0,pH^2)), 46 | mae(yT,pmax(0,pTg^2)), 47 | mae(yH,pmax(0,pHg^2)), 48 | mae(yH,pmax(0,(pH^2+pHg^2)/2))))} 49 | } 50 | return(cvDf) 51 | } 52 | 53 | ##Find initial features for GBM using those most highly 
correlated with loss
54 | ## computing the full correlation matrix is overkill just to find the variables correlated
55 | ## with the target, but much time was spent looking through the rest of the matrix,
56 | ## even though it ultimately went unused.
57 | topCorrelatedFeatures<-function(train,threshold=0.25){
58 |   require(Hmisc)
59 |   rc<-rcorr(as.matrix(train))
60 |   rcdf<-as.data.frame(rc$r)
61 |   r2<-as.data.frame(t(rcdf[1:2,]))
62 |   r3<-r2[abs(r2$loss)>threshold,]
63 |   attach(r3)
64 |   out<-r3[order(-loss),]
65 |   detach(r3)
66 |   return(out)
67 | }
68 | 
69 | ## Impute values for algorithms that require it, using caret's imputation method
70 | require(caret); require(Metrics)
71 | pp<-preProcess(final_x,method="medianImpute") ##very fast
72 | x2<-predict(pp,final_x)
73 | x3<-predict(pp,test[,2:ncol(test)])
74 | 
75 | 
76 | ## Main driver section
77 | dirs<-read.csv("project_locations.csv")
78 | setwd(as.character(dirs[1,2]))
79 | rm(dirs)
80 | 
81 | ## to do: replace this with code to create the data set
82 | load("final_x.Rdata")
83 | load("final_y.Rdata")
84 | load("test_final_with_id.Rdata") ##test
85 | 
86 | ## example CV test
87 | df<-gbmDualCv(final_x,y,10,1000,0.05,10,30); sapply(df,mean)
88 | 
89 | ## run final data (note: gbmFull is not defined in this excerpt)
90 | fp<-gbmFull(final_x,y,test,10,700,0.05,7,30,"laplace")
91 | write.table(fp,"gbm_20140227b.csv",append = FALSE, sep=',', row.names=FALSE, quote=TRUE, col.names=TRUE)
92 | 
--------------------------------------------------------------------------------
/modern_logistic_regression/README.md:
--------------------------------------------------------------------------------
1 | Modern Logistic Regression
2 | ======
3 | 
4 | This is a small collection of files that support the talk I gave to the Austin Machine Learning Meetup on 1/19/2015.
5 | 
6 | These scripts are taken directly from Kaggle forum posts (permitted by the amusing license) and provide a fantastic entry point for robust classification tasks.
7 | 
8 | * First version: easiest to start adapting
9 |   * link: http://www.kaggle.com/c/criteo-display-ad-challenge/forums/t/10322/beat-the-benchmark-with-less-then-200mb-of-memory
10 |   * techniques: online learning, logistic regression, hash trick, adaptive learning rate
11 | * Second version: same plus multiclass output handling
12 |   * link: http://www.kaggle.com/c/tradeshift-text-classification/forums/t/10537/beat-the-benchmark-with-less-than-400mb-of-memory
13 |   * techniques: online learning, logistic regression, hash trick, adaptive learning rate, multiclass output
14 | * Third version: FTRL-proximal
15 |   * link: http://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory
16 |   * techniques: online learning, logistic regression, hash trick, FTRL-proximal (stochastic gradient descent, L1 & L2 regularization)
17 | 
--------------------------------------------------------------------------------
/modern_logistic_regression/fast_solution.py:
--------------------------------------------------------------------------------
1 | '''
2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
3 | Version 2, December 2004
4 | 
5 | Copyright (C) 2004 Sam Hocevar
6 | 
7 | Everyone is permitted to copy and distribute verbatim or modified
8 | copies of this license document, and changing it is allowed as long
9 | as the name is changed.
10 | 
11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
13 | 
14 | 0. You just DO WHAT THE FUCK YOU WANT TO.
15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # parameters ################################################################# 24 | 25 | train = 'train.csv' # path to training file 26 | test = 'test.csv' # path to testing file 27 | 28 | D = 2 ** 20 # number of weights use for learning 29 | alpha = .1 # learning rate for sgd optimization 30 | 31 | 32 | # function definitions ####################################################### 33 | 34 | # A. Bounded logloss 35 | # INPUT: 36 | # p: our prediction 37 | # y: real answer 38 | # OUTPUT 39 | # logarithmic loss of p given y 40 | def logloss(p, y): 41 | p = max(min(p, 1. - 10e-12), 10e-12) 42 | return -log(p) if y == 1. else -log(1. - p) 43 | 44 | 45 | # B. Apply hash trick of the original csv row 46 | # for simplicity, we treat both integer and categorical features as categorical 47 | # INPUT: 48 | # csv_row: a csv dictionary, ex: {'Lable': '1', 'I1': '357', 'I2': '', ...} 49 | # D: the max index that we can hash to 50 | # OUTPUT: 51 | # x: a list of indices that its value is 1 52 | def get_x(csv_row, D): 53 | x = [0] # 0 is the index of the bias term 54 | for key, value in csv_row.items(): 55 | index = int(value + key[1:], 16) % D # weakest hash ever ;) 56 | x.append(index) 57 | return x # x contains indices of features that have a value of 1 58 | 59 | 60 | # C. Get probability estimation on x 61 | # INPUT: 62 | # x: features 63 | # w: weights 64 | # OUTPUT: 65 | # probability of p(y = 1 | x; w) 66 | def get_p(x, w): 67 | wTx = 0. 68 | for i in x: # do wTx 69 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 70 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 71 | 72 | 73 | # D. Update given model 74 | # INPUT: 75 | # w: weights 76 | # n: a counter that counts the number of times we encounter a feature 77 | # this is used for adaptive learning rate 78 | # x: feature 79 | # p: prediction of our model 80 | # y: answer 81 | # OUTPUT: 82 | # w: updated model 83 | # n: updated count 84 | def update_w(w, n, x, p, y): 85 | for i in x: 86 | # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic 87 | # (p - y) * x[i] is the current gradient 88 | # note that in our case, if i in x then x[i] = 1 89 | w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.) 90 | n[i] += 1. 91 | 92 | return w, n 93 | 94 | 95 | # training and testing ####################################################### 96 | 97 | # initialize our model 98 | w = [0.] * D # weights 99 | n = [0.] * D # number of times we've encountered a feature 100 | 101 | # start training a logistic regression model using on pass sgd 102 | loss = 0. 103 | for t, row in enumerate(DictReader(open(train))): 104 | y = 1. if row['Label'] == '1' else 0. 
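    # (online learning: each row is processed exactly once, in stream order --
    #  hash the features, score with the current weights, then update them;
    #  only the w and n arrays are ever held in memory)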
105 | 106 | del row['Label'] # can't let the model peek the answer 107 | del row['Id'] # we don't need the Id 108 | 109 | # main training procedure 110 | # step 1, get the hashed features 111 | x = get_x(row, D) 112 | 113 | # step 2, get prediction 114 | p = get_p(x, w) 115 | 116 | # for progress validation, useless for learning our model 117 | loss += logloss(p, y) 118 | if t % 1000000 == 0 and t > 1: 119 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 120 | datetime.now(), t, loss/t)) 121 | 122 | # step 3, update model with answer 123 | w, n = update_w(w, n, x, p, y) 124 | 125 | # testing (build kaggle's submission file) 126 | with open('submission1234.csv', 'w') as submission: 127 | submission.write('Id,Predicted\n') 128 | for t, row in enumerate(DictReader(open(test))): 129 | Id = row['Id'] 130 | del row['Id'] 131 | x = get_x(row, D) 132 | p = get_p(x, w) 133 | submission.write('%s,%f\n' % (Id, p)) 134 | -------------------------------------------------------------------------------- /modern_logistic_regression/fast_solution_v2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from math import log, exp, sqrt 20 | 21 | 22 | # TL; DR 23 | # the main learning process start at line 122 24 | 25 | 26 | # parameters ################################################################# 27 | 28 | train = 'train.csv' # path to training file 29 | label = 'trainLabels.csv' # path to label file of training data 30 | test = 'test.csv' # path to testing file 31 | 32 | D = 2 ** 18 # number of weights use for each model, we have 32 of them 33 | alpha = .1 # learning rate for sgd optimization 34 | 35 | 36 | # function, generator definitions ############################################ 37 | 38 | # A. x, y generator 39 | # INPUT: 40 | # path: path to train.csv or test.csv 41 | # label_path: (optional) path to trainLabels.csv 42 | # YIELDS: 43 | # ID: id of the instance (can also acts as instance count) 44 | # x: a list of indices that its value is 1 45 | # y: (if label_path is present) label value of y1 to y33 46 | def data(path, label_path=None): 47 | for t, line in enumerate(open(path)): 48 | # initialize our generator 49 | if t == 0: 50 | # create a static x, 51 | # so we don't have to construct a new x for every instance 52 | x = [0] * 146 53 | if label_path: 54 | label = open(label_path) 55 | label.readline() # we don't need the headers 56 | continue 57 | # parse x 58 | for m, feat in enumerate(line.rstrip().split(',')): 59 | if m == 0: 60 | ID = int(feat) 61 | else: 62 | # one-hot encode everything with hash trick 63 | # categorical: one-hotted 64 | # boolean: ONE-HOTTED 65 | # numerical: ONE-HOTTED! 
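                # (hashing str(m) + '_' + feat gives each (column, raw value) pair
                #  its own index in [0, D); distinct pairs can collide within the
                #  2**18 buckets -- the usual memory/accuracy trade-off of the hash trick)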
66 | # note, the build in hash(), although fast is not stable, 67 | # i.e., same value won't always have the same hash 68 | # on different machines 69 | x[m] = abs(hash(str(m) + '_' + feat)) % D 70 | # parse y, if provided 71 | if label_path: 72 | # use float() to prevent future type casting, [1:] to ignore id 73 | y = [float(y) for y in label.readline().split(',')[1:]] 74 | yield (ID, x, y) if label_path else (ID, x) 75 | 76 | 77 | # B. Bounded logloss 78 | # INPUT: 79 | # p: our prediction 80 | # y: real answer 81 | # OUTPUT 82 | # bounded logarithmic loss of p given y 83 | def logloss(p, y): 84 | p = max(min(p, 1. - 10e-15), 10e-15) 85 | return -log(p) if y == 1. else -log(1. - p) 86 | 87 | 88 | # C. Get probability estimation on x 89 | # INPUT: 90 | # x: features 91 | # w: weights 92 | # OUTPUT: 93 | # probability of p(y = 1 | x; w) 94 | def predict(x, w): 95 | wTx = 0. 96 | for i in x: # do wTx 97 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 98 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 99 | 100 | 101 | # D. Update given model 102 | # INPUT: 103 | # alpha: learning rate 104 | # w: weights 105 | # n: sum of previous absolute gradients for a given feature 106 | # this is used for adaptive learning rate 107 | # x: feature, a list of indices 108 | # p: prediction of our model 109 | # y: answer 110 | # MODIFIES: 111 | # w: weights 112 | # n: sum of past absolute gradients 113 | def update(alpha, w, n, x, p, y): 114 | for i in x: 115 | # alpha / sqrt(n) is the adaptive learning rate 116 | # (p - y) * x[i] is the current gradient 117 | # note that in our case, if i in x then x[i] = 1. 118 | n[i] += abs(p - y) 119 | w[i] -= (p - y) * 1. * alpha / sqrt(n[i]) 120 | 121 | 122 | # training and testing ####################################################### 123 | start = datetime.now() 124 | 125 | # a list for range(0, 33) - 13, no need to learn y14 since it is always 0 126 | K = [k for k in range(33) if k != 13] 127 | 128 | # initialize our model, all 32 of them, again ignoring y14 129 | w = [[0.] * D if k != 13 else None for k in range(33)] 130 | n = [[0.] * D if k != 13 else None for k in range(33)] 131 | 132 | loss = 0. 133 | loss_y14 = log(1. 
- 10**-15) 134 | 135 | for ID, x, y in data(train, label): 136 | 137 | # get predictions and train on all labels 138 | for k in K: 139 | p = predict(x, w[k]) 140 | update(alpha, w[k], n[k], x, p, y[k]) 141 | loss += logloss(p, y[k]) # for progressive validation 142 | loss += loss_y14 # the loss of y14, logloss is never zero 143 | 144 | # print out progress, so that we know everything is working 145 | if ID % 100000 == 0: 146 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 147 | datetime.now(), ID, (loss/33.)/ID)) 148 | 149 | with open('./submission1234.csv', 'w') as outfile: 150 | outfile.write('id_label,pred\n') 151 | for ID, x in data(test): 152 | for k in K: 153 | p = predict(x, w[k]) 154 | outfile.write('%s_y%d,%s\n' % (ID, k+1, str(p))) 155 | if k == 12: 156 | outfile.write('%s_y14,0.0\n' % ID) 157 | 158 | print('Done, elapsed time: %s' % str(datetime.now() - start)) 159 | -------------------------------------------------------------------------------- /modern_logistic_regression/fast_solution_v3.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # TL; DR, the main training process starts on line: 250, 24 | # you may want to start reading the code from there 25 | 26 | 27 | ############################################################################## 28 | # parameters ################################################################# 29 | ############################################################################## 30 | 31 | # A, paths 32 | train = 'train_rev2' # path to training file 33 | test = 'test_rev2' # path to testing file 34 | submission = 'submission1234.csv' # path of to be outputted submission file 35 | 36 | # B, model 37 | alpha = .1 # learning rate 38 | beta = 1. # smoothing parameter for adaptive learning rate 39 | L1 = 1. # L1 regularization, larger value means more regularized 40 | L2 = 1. 
# L2 regularization, larger value means more regularized 41 | 42 | # C, feature/hash trick 43 | D = 2 ** 20 # number of weights to use 44 | interaction = False # whether to enable poly2 feature interactions 45 | 46 | # D, training/validation 47 | epoch = 1 # learn training data for N passes 48 | holdafter = 9 # data after date N (exclusive) are used as validation 49 | holdout = None # use every N training instance for holdout validation 50 | 51 | 52 | ############################################################################## 53 | # class, function, generator definitions ##################################### 54 | ############################################################################## 55 | 56 | class ftrl_proximal(object): 57 | ''' Our main algorithm: Follow the regularized leader - proximal 58 | 59 | In short, 60 | this is an adaptive-learning-rate sparse logistic-regression with 61 | efficient L1-L2-regularization 62 | 63 | Reference: 64 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 65 | ''' 66 | 67 | def __init__(self, alpha, beta, L1, L2, D, interaction): 68 | # parameters 69 | self.alpha = alpha 70 | self.beta = beta 71 | self.L1 = L1 72 | self.L2 = L2 73 | 74 | # feature related parameters 75 | self.D = D 76 | self.interaction = interaction 77 | 78 | # model 79 | # n: squared sum of past gradients 80 | # z: weights 81 | # w: lazy weights 82 | self.n = [0.] * D 83 | self.z = [0.] * D 84 | self.w = {} 85 | 86 | def _indices(self, x): 87 | ''' A helper generator that yields the indices in x 88 | 89 | The purpose of this generator is to make the following 90 | code a bit cleaner when doing feature interaction. 91 | ''' 92 | 93 | # first yield index of the bias term 94 | yield 0 95 | 96 | # then yield the normal indices 97 | for index in x: 98 | yield index 99 | 100 | # now yield interactions (if applicable) 101 | if self.interaction: 102 | D = self.D 103 | L = len(x) 104 | 105 | x = sorted(x) 106 | for i in xrange(L): 107 | for j in xrange(i+1, L): 108 | # one-hot encode interactions with hash trick 109 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 110 | 111 | def predict(self, x): 112 | ''' Get probability estimation on x 113 | 114 | INPUT: 115 | x: features 116 | 117 | OUTPUT: 118 | probability of p(y = 1 | x; w) 119 | ''' 120 | 121 | # parameters 122 | alpha = self.alpha 123 | beta = self.beta 124 | L1 = self.L1 125 | L2 = self.L2 126 | 127 | # model 128 | n = self.n 129 | z = self.z 130 | w = {} 131 | 132 | # wTx is the inner product of w and x 133 | wTx = 0. 134 | for i in self._indices(x): 135 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 136 | 137 | # build w on the fly using z and n, hence the name - lazy weights 138 | # we are doing this at prediction instead of update time is because 139 | # this allows us for not storing the complete w 140 | if sign * z[i] <= L1: 141 | # w[i] vanishes due to L1 regularization 142 | w[i] = 0. 143 | else: 144 | # apply prediction time L1, L2 regularization to z and get w 145 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 146 | 147 | wTx += w[i] 148 | 149 | # cache the current w for update stage 150 | self.w = w 151 | 152 | # bounded sigmoid function, this is the probability estimation 153 | return 1. / (1. 
+ exp(-max(min(wTx, 35.), -35.))) 154 | 155 | def update(self, x, p, y): 156 | ''' Update model using x, p, y 157 | 158 | INPUT: 159 | x: feature, a list of indices 160 | p: click probability prediction of our model 161 | y: answer 162 | 163 | MODIFIES: 164 | self.n: increase by squared gradient 165 | self.z: weights 166 | ''' 167 | 168 | # parameter 169 | alpha = self.alpha 170 | 171 | # model 172 | n = self.n 173 | z = self.z 174 | w = self.w 175 | 176 | # gradient under logloss 177 | g = p - y 178 | 179 | # update z and n 180 | for i in self._indices(x): 181 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 182 | z[i] += g - sigma * w[i] 183 | n[i] += g * g 184 | 185 | 186 | def logloss(p, y): 187 | ''' FUNCTION: Bounded logloss 188 | 189 | INPUT: 190 | p: our prediction 191 | y: real answer 192 | 193 | OUTPUT: 194 | logarithmic loss of p given y 195 | ''' 196 | 197 | p = max(min(p, 1. - 10e-15), 10e-15) 198 | return -log(p) if y == 1. else -log(1. - p) 199 | 200 | 201 | def data(path, D): 202 | ''' GENERATOR: Apply hash-trick to the original csv row 203 | and for simplicity, we one-hot-encode everything 204 | 205 | INPUT: 206 | path: path to training or testing file 207 | D: the max index that we can hash to 208 | 209 | YIELDS: 210 | ID: id of the instance, mainly useless 211 | x: a list of hashed and one-hot-encoded 'indices' 212 | we only need the index since all values are either 0 or 1 213 | y: y = 1 if we have a click, else we have y = 0 214 | ''' 215 | 216 | for t, row in enumerate(DictReader(open(path))): 217 | # process id 218 | ID = row['id'] 219 | del row['id'] 220 | 221 | # process clicks 222 | y = 0. 223 | if 'click' in row: 224 | if row['click'] == '1': 225 | y = 1. 226 | del row['click'] 227 | 228 | # extract date 229 | date = int(row['hour'][4:6]) 230 | 231 | # turn hour really into hour, it was originally YYMMDDHH 232 | row['hour'] = row['hour'][6:] 233 | 234 | # build x 235 | x = [] 236 | for key in row: 237 | value = row[key] 238 | 239 | # one-hot encode everything with hash trick 240 | index = abs(hash(key + '_' + value)) % D 241 | x.append(index) 242 | 243 | yield t, date, ID, x, y 244 | 245 | 246 | ############################################################################## 247 | # start training ############################################################# 248 | ############################################################################## 249 | 250 | start = datetime.now() 251 | 252 | # initialize ourselves a learner 253 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 254 | 255 | # start training 256 | for e in xrange(epoch): 257 | loss = 0. 
258 | count = 0 259 | 260 | for t, date, ID, x, y in data(train, D): # data is a generator 261 | # t: just a instance counter 262 | # date: you know what this is 263 | # ID: id provided in original data 264 | # x: features 265 | # y: label (click) 266 | 267 | # step 1, get prediction from learner 268 | p = learner.predict(x) 269 | 270 | if (holdafter and date > holdafter) or (holdout and t % holdout == 0): 271 | # step 2-1, calculate validation loss 272 | # we do not train with the validation data so that our 273 | # validation loss is an accurate estimation 274 | # 275 | # holdafter: train instances from day 1 to day N 276 | # validate with instances from day N + 1 and after 277 | # 278 | # holdout: validate with every N instance, train with others 279 | loss += logloss(p, y) 280 | count += 1 281 | else: 282 | # step 2-2, update learner with label (click) information 283 | learner.update(x, p, y) 284 | 285 | print('Epoch %d finished, validation logloss: %f, elapsed time: %s' % ( 286 | e, loss/count, str(datetime.now() - start))) 287 | 288 | 289 | ############################################################################## 290 | # start testing, and build Kaggle's submission file ########################## 291 | ############################################################################## 292 | 293 | with open(submission, 'w') as outfile: 294 | outfile.write('id,click\n') 295 | for t, date, ID, x, y in data(test, D): 296 | p = learner.predict(x) 297 | outfile.write('%s,%s\n' % (ID, str(p))) 298 | -------------------------------------------------------------------------------- /rain/README.md: -------------------------------------------------------------------------------- 1 | Code for Kaggle Will it Rain competition 2 | -------------------------------------------------------------------------------- /rain/prepareTrain.R: -------------------------------------------------------------------------------- 1 | ############################## 2 | ### Large script for moving through aggregations by applying a similar methodology to each 3 | ### Appears as a large script because that's how it really was worked on; each pattern was 4 | ### repeated on new features 5 | ### 6 | ### Makes a lot of assumptions about having data in place. Working to streamline this all 7 | ### to look like a production process. But getting "something" out now. 
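### (sketch of the assumed inputs) the train.* list-columns referenced below are
### expected to exist already, one element per hourly record, parsed from the raw
### space-delimited text columns using the same pattern as the commented-out lines
### further down, e.g.:
###   train.RadarQualityIndex<-lapply(as.character(train[,RadarQualityIndex]),
###                                   function(x) as.numeric(strsplit(x," ")[[1]]))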
8 | ############################## 9 | 10 | source("rain_functions.R") 11 | 12 | q<-lapply(train.RadarQualityIndex, function(x) as.numeric(removeInvalids(x))) 13 | 14 | HybridScan<-lapply(train.HybridScan, function(x) as.numeric(removeInvalids(x))) 15 | wtHybridScan<-mapply(function(x,y) getAggregates(x,"weightedMean",y),x=HybridScan,y=q) 16 | wtHybridScan[is.na(wtHybridScan)]<-NA 17 | meanHybridScan<-mapply(function(x,y) getAggregates(x,"mean",y),x=HybridScan,y=q) 18 | meanHybridScan[is.na(meanHybridScan)]<-NA 19 | wtSquaredHybridScan<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=HybridScan,y=q) 20 | wtSquaredHybridScan[is.na(wtSquaredHybridScan)]<-NA 21 | rm(HybridScan) 22 | 23 | 24 | minDistanceToRadar<-unlist(lapply(train.DistanceToRadar,min)) 25 | 26 | Reflectivity<-lapply(train.Reflectivity, function(x) as.numeric(removeInvalids(x))) 27 | wtReflectivity<-mapply(function(x,y) getAggregates(x,"weightedMean",y),x=Reflectivity,y=q) 28 | wtReflectivity[is.na(wtReflectivity)]<-NA 29 | meanReflectivity<-mapply(function(x,y) getAggregates(x,"mean",y),x=Reflectivity,y=q) 30 | meanReflectivity[is.na(meanReflectivity)]<-NA 31 | rm(Reflectivity); gc() 32 | 33 | ReflectivityQC<-lapply(train.ReflectivityQC, function(x) as.numeric(removeInvalids(x))) 34 | wtReflectivityQC<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=ReflectivityQC,y=q) 35 | wtReflectivityQC[is.na(wtReflectivityQC)]<-NA 36 | meanReflectivityQC<-mapply(function(x,y) getAggregates(x,"mean",y),x=ReflectivityQC,y=q) 37 | meanReflectivityQC[is.na(meanReflectivityQC)]<-NA 38 | rm(ReflectivityQC); gc() 39 | 40 | Zdr<-lapply(train.Zdr, function(x) as.numeric(removeInvalids(x))) 41 | wtZdr<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Zdr,y=q) 42 | wtZdr[is.na(wtZdr)]<-NA 43 | meanZdr<-mapply(function(x,y) getAggregates(x,"mean",y),x=Zdr,y=q) 44 | meanZdr[is.na(meanZdr)]<-NA 45 | rm(Zdr); gc() 46 | 47 | LogWaterVolume<-lapply(train.LogWaterVolume, function(x) as.numeric(removeInvalids(x))) 48 | wtLogWaterVolume<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=LogWaterVolume,y=q) 49 | wtLogWaterVolume[is.na(wtLogWaterVolume)]<-NA 50 | meanLogWaterVolume<-mapply(function(x,y) getAggregates(x,"mean",y),x=LogWaterVolume,y=q) 51 | meanLogWaterVolume[is.na(meanLogWaterVolume)]<-NA 52 | rm(LogWaterVolume); gc() 53 | 54 | Composite<-lapply(train.Composite, function(x) as.numeric(removeInvalids(x))) 55 | wtComposite<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Composite,y=q) 56 | wtComposite[is.na(wtComposite)]<-NA 57 | meanComposite<-mapply(function(x,y) getAggregates(x,"mean",y),x=Composite,y=q) 58 | meanComposite[is.na(meanComposite)]<-NA 59 | rm(Composite); gc() 60 | 61 | RR1<-lapply(train.RR1, function(x) as.numeric(removeInvalids(x))) 62 | wtRR1<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR1,y=q) 63 | wtRR1[is.na(wtRR1)]<-NA 64 | meanRR1<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR1,y=q) 65 | meanRR1[is.na(meanRR1)]<-NA 66 | rm(RR1); gc() 67 | 68 | RR2<-lapply(train.RR2, function(x) as.numeric(removeInvalids(x))) 69 | wtRR2<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR2,y=q) 70 | wtRR2[is.na(wtRR2)]<-NA 71 | meanRR2<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR2,y=q) 72 | meanRR2[is.na(meanRR2)]<-NA 73 | rm(RR2); gc() 74 | 75 | RR3<-lapply(train.RR3, function(x) as.numeric(removeInvalids(x))) 76 | wtRR3<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RR3,y=q) 77 | wtRR3[is.na(wtRR3)]<-NA 78 | 
meanRR3<-mapply(function(x,y) getAggregates(x,"mean",y),x=RR3,y=q) 79 | meanRR3[is.na(meanRR3)]<-NA 80 | rm(RR3); gc() 81 | 82 | RhoHV<-lapply(train.RhoHV, function(x) as.numeric(removeInvalids(x))) 83 | wtRhoHV<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=RhoHV,y=q) 84 | wtRhoHV[is.na(wtRhoHV)]<-NA 85 | meanRhoHV<-mapply(function(x,y) getAggregates(x,"mean",y),x=RhoHV,y=q) 86 | meanRhoHV[is.na(meanRhoHV)]<-NA 87 | rm(RhoHV); gc() 88 | 89 | Velocity<-lapply(train.Velocity, function(x) as.numeric(removeInvalids(x))) 90 | wtVelocity<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=Velocity,y=q) 91 | wtVelocity[is.na(wtVelocity)]<-NA 92 | meanVelocity<-mapply(function(x,y) getAggregates(x,"mean",y),x=Velocity,y=q) 93 | meanVelocity[is.na(meanVelocity)]<-NA 94 | rm(Velocity); gc() 95 | 96 | modRain<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(1,2),1,0)))) 97 | heavyRain<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==3,1,0)))) 98 | rainHail<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==4,1,0)))) 99 | bigDrops<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x)==5,1,0)))) 100 | snow<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(10,11),1,0)))) 101 | graupel<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(13,14),1,0)))) 102 | validReadings<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) >0,1,0)))) 103 | allPrecipitation<-unlist(lapply(train.HydrometeorType,function(x) sum(ifelse(as.numeric(x) %in% c(1,2,3,4,5,10,11,12,13,14),1,0)))) 104 | 105 | #train.MassWeightedMean<-lapply(as.character(train[,MassWeightedMean]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,MassWeightedMean:=NULL]; gc() 106 | #train.MassWeightedSD<-lapply(as.character(train[,MassWeightedSD]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,MassWeightedSD:=NULL]; gc() 107 | #train.HydrometeorType<-lapply(as.character(train[,HydrometeorType]),function(x) as.numeric(strsplit(x," ")[[1]])); train[,HydrometeorType:=NULL]; gc() 108 | 109 | maxRadarQualityIndex<-mapply(function(x,y) getAggregates(x,"max",y),x=q,y=q) 110 | maxRadarQualityIndex[is.na(maxRadarQualityIndex)]<-NA 111 | avgRadarQualityIndex<-mapply(function(x,y) getAggregates(x,"mean",y),x=q,y=q) 112 | avgRadarQualityIndex[is.na(avgRadarQualityIndex)]<-NA 113 | maxTimeToEnd<-mapply(function(x,y) getAggregates(x,"max",y),x=train.TimeToEnd,y=q) 114 | maxTimeToEnd[is.na(maxTimeToEnd)]<-NA 115 | minTimeToEnd<-mapply(function(x,y) getAggregates(x,"min",y),x=train.TimeToEnd,y=q) 116 | minTimeToEnd[is.na(minTimeToEnd)]<-NA 117 | diffTimeToEnd<-maxTimeToEnd-minTimeToEnd 118 | diffTimeToEnd[is.na(diffTimeToEnd)]<-NA 119 | 120 | t5<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>49 & x<61,1,0)))) 121 | t4<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>39 & x<51,1,0)))) 122 | t3<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>29 & x<41,1,0)))) 123 | t2<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>19 & x<31,1,0)))) 124 | t1<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x>9 & x<21,1,0)))) 125 | t0<-unlist(lapply(train.TimeToEnd,function(x) max(ifelse(x<11,1,0)))) 126 | time10s<-t5+t4+t3+t2+t1+t0 127 | 128 | load("RdataFiles/train.Reflectivity.Rdata") 129 | load("RdataFiles/train.TimeToEnd.Rdata") 130 | Ref<-lapply(train.Reflectivity, function(x) as.numeric(removeInvalids(x))) 131 | 
r5<-unlist(mapply(function(x,y) max(ifelse(x>49 & x<61,y,0)),x=train.TimeToEnd,y=Ref)) 132 | r4<-unlist(mapply(function(x,y) max(ifelse(x>39 & x<50,y,0)),x=train.TimeToEnd,y=Ref)) 133 | r3<-unlist(mapply(function(x,y) max(ifelse(x>29 & x<40,y,0)),x=train.TimeToEnd,y=Ref)) 134 | r2<-unlist(mapply(function(x,y) max(ifelse(x>19 & x<30,y,0)),x=train.TimeToEnd,y=Ref)) 135 | r1<-unlist(mapply(function(x,y) max(ifelse(x>9 & x<20,y,0)),x=train.TimeToEnd,y=Ref)) 136 | r0<-unlist(mapply(function(x,y) max(ifelse(x<10,y,0)),x=train.TimeToEnd,y=Ref)) 137 | refMax10s<-r5+r4+r3+r2+r1+r0 138 | refMaxSql10s<-r5^2+r4^2+r3^2+r2^2+r1^2+r0^2 139 | save(refMax10s,file="RdataFiles/refMax10s.Rdata") 140 | save(refMaxSql10s,file="RdataFiles/refMaxSql10s.Rdata") 141 | rm(r5,r4,r3,r2,r1,r0,Ref,train.Reflectivity) 142 | 143 | 144 | load("RdataFiles/train.RhoHV.Rdata") 145 | Rho<-train.RhoHV 146 | Rho5<-unlist(mapply(function(x,y) max(ifelse(x>49 & x<61,y,0)),x=train.TimeToEnd,y=Rho)) 147 | Rho4<-unlist(mapply(function(x,y) max(ifelse(x>39 & x<50,y,0)),x=train.TimeToEnd,y=Rho)) 148 | Rho3<-unlist(mapply(function(x,y) max(ifelse(x>29 & x<40,y,0)),x=train.TimeToEnd,y=Rho)) 149 | Rho2<-unlist(mapply(function(x,y) max(ifelse(x>19 & x<30,y,0)),x=train.TimeToEnd,y=Rho)) 150 | Rho1<-unlist(mapply(function(x,y) max(ifelse(x>9 & x<20,y,0)),x=train.TimeToEnd,y=Rho)) 151 | Rho0<-unlist(mapply(function(x,y) max(ifelse(x<10,y,0)),x=train.TimeToEnd,y=Rho)) 152 | RhoMax10s<-Rho5+Rho4+Rho3+Rho2+Rho1+Rho0 153 | RhoMaxSq10s<-Rho5^2+Rho4^2+Rho3^2+Rho2^2+Rho1^2+Rho0^2 154 | save(RhoMax10s,file="RdataFiles/RhoMax10s.Rdata") 155 | save(RhoMaxSq10s,file="RdataFiles/RhoMaxSq10s.Rdata") 156 | rm(Rho5,Rho4,Rho3,Rho2,Rho1,Rho0,Rho,train.RhoHV) 157 | 158 | 159 | load("RdataFiles/train.Reflectivity.Rdata") 160 | load("RdataFiles/train.DistanceToRadar.Rdata") 161 | Sys.time(); vec<-mapply(function(x,y) getBestRadarScalar(cbind(x,y)),x=train.DistanceToRadar,y=train.Reflectivity); Sys.time() ## 14 minutes 162 | Sys.time(); siteRef<-mapply(function(x,y) getAggregates(x,"weightedMean",y,2),x=train.Reflectivity,y=vec); Sys.time() ## 3 minutes 163 | 164 | yBinary<-ifelse(Expected<=0,1,0) 165 | colnames(yBinary)<-"yBinary" 166 | ##for H2O; for R, remove first two columns 167 | x<-as.data.frame(cbind(Expected,yBinary,wtHybridScan,meanHybridScan,wtSquaredHybridScan,minDistanceToRadar,wtReflectivity,meanReflectivity, 168 | wtReflectivityQC,meanReflectivityQC,meanLogWaterVolume,wtLogWaterVolume,meanZdr,wtZdr, 169 | wtComposite,meanComposite,wtRR1,meanRR1,wtRR2,meanRR2,wtRR3,meanRR3,wtRhoHV,meanRhoHV,wtVelocity,meanVelocity, 170 | modRain,heavyRain,rainHail,bigDrops,snow,graupel,validReadings,allPrecipitation, 171 | avgRadarQualityIndex,maxRadarQualityIndex,maxTimeToEnd,minTimeToEnd,diffTimeToEnd,time10s 172 | ,refMax10s,refMaxSql10s,RhoMax10s,RhoMaxSq10s,siteRef 173 | )) 174 | x1<-x[1:(nrow(x)*0.9),] 175 | x2<-x[(1+nrow(x1)):nrow(x),] 176 | 177 | #if testing, this makes things easier: 178 | x0<-x2 179 | -------------------------------------------------------------------------------- /rain/rain_functions.R: -------------------------------------------------------------------------------- 1 | trimSpaces <- function(x) return(gsub("^ *|(?<= ) | *$", "", x, perl=T)) 2 | 3 | removeInvalids<-function(x,xNew="") return(gsub("999",xNew, 4 | gsub("-99901",xNew, 5 | gsub("-99902",xNew, 6 | gsub("-99903",xNew, 7 | gsub("-99900",xNew, 8 | gsub("999.0",xNew, 9 | gsub("-99901.0",xNew, 10 | gsub("-99902.0",xNew, 11 | gsub("-99903.0",xNew, 12 | gsub("-99900.0",xNew, 13 | 
gsub("nan",xNew, 14 | x)))))))))))) 15 | 16 | getAggregates<-function(x,type="weightedMean",wts=NA,exponent=1){ 17 | ## pass in a vector of values and vector of weights 18 | ## get back a single number, according to the specified type 19 | ## type: mean, median, sd, max, min, meanSquared, sdSquared 20 | ## handles NAs, including fields with all NAs (returns NAs) 21 | 22 | if(type=="min") return(min(x^exponent,na.rm=TRUE)) 23 | if(type=="max") return(max(x^exponent,na.rm=TRUE)) 24 | if(type=="median") return(median(x^exponent,na.rm=TRUE)) 25 | if(type=="sd") return(sd(x^exponent,na.rm=TRUE)) 26 | if(type=="mean") return(mean(x^exponent,na.rm=TRUE)) 27 | 28 | if(length(x)==sum(is.na(wts))) return(NA) 29 | wts[is.na(wts)]<-0 30 | x[is.na(x)]<-0 31 | if(type=="weightedMean") return(sum(x^exponent*wts)/sum(wts)) 32 | 33 | return(NA) 34 | } 35 | 36 | getGlmByMM<-function(x1,x2,x3,raw1,raw2,threshold){ 37 | library(glmnet) 38 | library(Metrics) 39 | library(Matrix) 40 | y1<-ifelse(raw1<=threshold,1,0) 41 | y2<-ifelse(raw2<=threshold,1,0) 42 | x1[is.na(x1)] <- 0 43 | x2[is.na(x2)] <- 0 44 | x3[is.na(x3)] <- 0 45 | g<-glmnet(as.matrix(x1),as.factor(y1),family=c("binomial")) 46 | p2<-pmin(1.0,pmax(0.0,predict(g,as.matrix(x2),s=g$lambda[length(g$lambda)],type="response"))) 47 | p3<-pmin(1.0,pmax(0.0,predict(g,as.matrix(x3),s=g$lambda[length(g$lambda)],type="response"))) 48 | print(mse(y2,p2)) 49 | return(p3) 50 | } 51 | 52 | ## pretty bulky this way, but it's very simple to get things up and running. 53 | getGbmByMM<-function(x1,x2,x3,raw1,raw2,threshold,trees=100,depth=5,minObs=30,shrink=0.1){ 54 | ## x1: x values for the full training set 55 | ## x2: x values for the holdout set (for which we have answers) 56 | ## x3: x values for the prediction set (generally that for which we do not have answers) 57 | ## raw1: target for full training: rain, in mm; called raw because this will be converted to binary 58 | ## raw2: target for the holdout set: also rain, in mm 59 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 60 | ## trees, depth, minObs,shrink: passthrough GBM parameters 61 | ## 62 | ## returns: prediction vector, bound between 0.0 and 1.0 63 | ## 64 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 65 | 66 | library(gbm) 67 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 68 | y1<-ifelse(raw1<=threshold,1,0) 69 | y2<-ifelse(raw2<=threshold,1,0) 70 | 71 | G<-gbm.fit(x1,y1,distribution="bernoulli",n.trees=trees, interaction.depth=depth,n.minobsinnode=minObs,shrinkage=shrink) 72 | p2<-pmin(1.0,pmax(0.0,predict(G,x2,n.trees=trees,type="response"))) 73 | p3<-pmin(1.0,pmax(0.0,predict(G,x3,n.trees=trees,type="response"))) 74 | print(summary(G,plotit=FALSE)) 75 | print(mse(y2,p2)) 76 | return(p3) 77 | } 78 | 79 | getH2OGbmByMM<-function(h2oInst,x1,x2,x3,xCols,threshold,trees=100,depth=5,minObs=20,shrink=0.1,printImportance=0,printGbm=TRUE){ 80 | ## x1: parsed H2O object for the full training set 81 | ## x2: parsed H2O object for the holdout set (for which we have answers) 82 | ## x3: parsed H2O object for the prediction set (generally that for which we do not have answers) 83 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 
84 | ## trees, depth,shrink: passthrough GBM parameters 85 | ## 86 | ## returns: prediction vector, bound between 0.0 and 1.0 87 | ## 88 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 & importance (if desired) 89 | 90 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 91 | 92 | ## convert the yBinary column into the target, specific to the threshold value passed in 93 | train$yBinary<-ifelse(train$Expected<=threshold,1,0) 94 | hold$yBinary<-ifelse(hold$Expected<=threshold,1,0) 95 | 96 | ## train a gbm model, then use it to predict the passed in holdout (x2) and prediction sets (x3) 97 | my.gbm <- h2o.gbm(x=xCols,y="yBinary",distribution="bernoulli",data=x1,key="removeWhenDone.hex",n.trees=trees,interaction.depth=depth,shrinkage=shrink,n.minobsinnode=minObs,importance=T) 98 | holdVals <- h2o.predict(my.gbm,x2) ## make predictions 99 | returnObj<-h2o.predict(my.gbm,x3) 100 | returnVals<-returnObj[,3] 101 | 102 | ## print mse and importance values if requested 103 | print(mse(hold$yBinary,holdVals[,3])) 104 | if(printImportance<0){print(my.gbm@model$varimp)} 105 | if(printImportance>0){print(my.gbm@model$varimp[printImportance,])} 106 | 107 | ## clean up unneeded cache objects 108 | h2o.rm(h2oInst,"removeWhenDone.hex") 109 | return(returnVals) 110 | } 111 | 112 | getH2ODeepLearningByMM<-function(h2oInst,x1,x2,x3,xCols,threshold, 113 | hidden=c(200,200),epochs=1,activation="RectifierWithDropout", classification=TRUE, ...){ 114 | ## x1: parsed H2O object for the full training set 115 | ## x2: parsed H2O object for the holdout set (for which we have answers) 116 | ## x3: parsed H2O object for the prediction set (generally that for which we do not have answers) 117 | ## threshold: which rain "bucket" we are predicted: 0 = p(rain<=0mm); 1 = p(rain<=1mm); etc. 118 | ## remaining: passthrough to h2o.deeplearning 119 | ## 120 | ## returns: prediction vector, bound between 0.0 and 1.0 121 | ## 122 | ## additional feedback: prints the holdout score, mse using raw2 and predictions given x2 & importance (if desired) 123 | 124 | library(Metrics) ##can easily remove this dependency by using the simple MSE equation 125 | slotNum<-ifelse(classification==TRUE,3,1) 126 | 127 | ## convert the yBinary column into the target, specific to the threshold value passed in 128 | train$yBinary<-ifelse(train$Expected<=threshold,1,0) 129 | hold$yBinary<-ifelse(hold$Expected<=threshold,1,0) 130 | 131 | ## train a gbm model, then use it to predict the passed in holdout (x2) and prediction sets (x3) 132 | my.deepL <- h2o.deeplearning(x=xCols,y="yBinary",data=x1,key="removeWhenDone.hex", 133 | activation=activation,hidden=hidden,epochs=epochs, classification=classification, ...) 
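## the two h2o.predict calls below assume the classification output carries the
## positive-class probability in column 3 (hence slotNum=3 above) and the plain
## regression prediction in column 1 (slotNum=1)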
134 | holdVals <- h2o.predict(my.deepL,x2) ## make predictions 135 | returnObj<-h2o.predict(my.deepL,x3) 136 | returnVals<-returnObj[,slotNum] 137 | 138 | ## print mse and importance values if requested 139 | print(mse(hold$yBinary,holdVals[,slotNum])) 140 | 141 | #if(printImportance<0){print(my.gbm@model$varimp)} 142 | #if(printImportance>0){print(my.gbm@model$varimp[printImportance,])} 143 | 144 | ## clean up unneeded cache objects 145 | h2o.rm(h2oInst,"removeWhenDone.hex") 146 | return(returnVals) 147 | } 148 | 149 | clearh2o<-function(h2oInst){ 150 | h2o.rm(h2oInst, grep(pattern = "GBMPredict", x = h2o.ls(h2oInst)$Key, value = TRUE)) 151 | h2o.rm(h2oInst, grep(pattern = "Last.value", x = h2o.ls(h2oInst)$Key, value = TRUE)) 152 | } 153 | 154 | ### Utility functions to help look at the data 155 | roundTexttoText<-function(x,decimals=1){ 156 | if(as.character(x) %in% c("-",""," ","NaN","nan", "-99901","-99902","-99903","999","-99901.0","-99902.0","-99903.0","-99900","-99900.0")){return("")} 157 | else{return(round(as.numeric(as.character(x)),decimals))} 158 | } 159 | 160 | getDfFromRaw<-function(i,train){ 161 | ### This will show the raw data for a particular hourly reading in an easy to read format. 162 | ### It merges the vectors, discards garbage readings, and formats the output to try and make it concise 163 | ### Currently it requires the vectors to exist, but it would be easy to make this work from the raw data 164 | ### 165 | ### Usage: getDfFromRaw(8,train), which will show you the 13 radar readings for the 8th record 166 | ### it can take a second or two to go through all the parsing 167 | df<-as.data.frame(cbind(rep(train[i,Expected],length(train$TimeToEnd[[i]])), 168 | unlist(lapply(lapply(as.character(train$TimeToEnd),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 169 | unlist(lapply(lapply(as.character(train$DistanceToRadar),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 170 | unlist(lapply(lapply(as.character(train$RadarQualityIndex),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 171 | unlist(lapply(lapply(as.character(train$HydrometeorType),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 172 | unlist(lapply(lapply(as.character(train$HybridScan),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 173 | unlist(lapply(lapply(as.character(train$Reflectivity),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 174 | unlist(lapply(lapply(as.character(train$ReflectivityQC),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 175 | unlist(lapply(lapply(as.character(train$RR1),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 176 | unlist(lapply(lapply(as.character(train$RR2),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 177 | unlist(lapply(lapply(as.character(train$RR3),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 178 | unlist(lapply(lapply(as.character(train$Zdr),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 179 | unlist(lapply(lapply(as.character(train$MassWeightedMean),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 180 | unlist(lapply(lapply(as.character(train$MassWeightedSD),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) 
roundTexttoText(x))), 181 | unlist(lapply(lapply(as.character(train$Composite),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 182 | unlist(lapply(lapply(as.character(train$LogWaterVolume),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))), 183 | unlist(lapply(lapply(as.character(train$Velocity),function(x) as.numeric(strsplit(x," ")[[1]]))[[i]],function(x) roundTexttoText(x))) 184 | )) 185 | colnames(df)<-c("y","Tm","Dist","Qual","Type","HybSc","Refl","RefQc","RR1","RR2","RR3","Zdr","MWM","MWsd","Cmp","LWV","V") 186 | return(df) 187 | } 188 | 189 | 190 | getDfFromVectors<-function(i){ 191 | ### This will show the raw data for a particular hourly reading. 192 | ### It merges the vectors, discards garbage readings, and formats the output to try and make it concise 193 | ### This version is faster to use if you have the vectors pre-processed in the names provided by our pre-processing code 194 | ### 195 | ### Usage: getDfFromVectors(8), which will show you the 13 radar readings for the 8th record 196 | df<-as.data.frame(cbind(rep(train[i,Expected],length(train.TimeToEnd[[i]])), 197 | train.TimeToEnd[[i]], 198 | unlist(lapply(train.DistanceToRadar[[i]],function(x) roundTexttoText(x))), 199 | unlist(lapply(train.RadarQualityIndex[[i]],function(x) roundTexttoText(x))), 200 | unlist(lapply(train.HydrometeorType[[i]],function(x) roundTexttoText(x))), 201 | unlist(lapply(train.HybridScan[[i]],function(x) roundTexttoText(x))), 202 | unlist(lapply(train.Reflectivity[[i]],function(x) roundTexttoText(x))), 203 | unlist(lapply(train.ReflectivityQC[[i]],function(x) roundTexttoText(x))), 204 | unlist(lapply(train.RR1[[i]],function(x) roundTexttoText(x))), 205 | unlist(lapply(train.RR2[[i]],function(x) roundTexttoText(x))), 206 | unlist(lapply(train.RR3[[i]],function(x) roundTexttoText(x))), 207 | unlist(lapply(train.Zdr[[i]],function(x) roundTexttoText(x))), 208 | unlist(lapply(train.MassWeightedMean[[i]],function(x) roundTexttoText(x))), 209 | unlist(lapply(train.MassWeightedSD[[i]],function(x) roundTexttoText(x))), 210 | unlist(lapply(train.Composite[[i]],function(x) roundTexttoText(x))), 211 | unlist(lapply(train.LogWaterVolume[[i]],function(x) roundTexttoText(x))), 212 | unlist(lapply(train.Velocity[[i]],function(x) roundTexttoText(x))) 213 | )) 214 | colnames(df)<-c("y","Tm","Dist","Qual","Type","HybSc","Refl","RefQc","RR1","RR2","RR3","Zdr","MWM","MWsd","Cmp","LWV","V") 215 | return(df) 216 | } 217 | -------------------------------------------------------------------------------- /rossmann-store-sales/ACM-data-science-camp.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(h2o) 3 | 4 | cat("reading the train and test data (with data.table) \n") 5 | train <- fread("../input/train.csv",stringsAsFactors = T) 6 | store <- fread("../input/store.csv",stringsAsFactors = T) 7 | 8 | dim(train) ## look at size 9 | train ## take a look at some example rows 10 | store ## examples from the store data 11 | 12 | ## Merge (i.e. 
join) the train set with the store data 13 | train <- merge(train,store,by="Store") 14 | train ## glance at merged data 15 | 16 | ## Convert date field into a proper date format, then extract month and year 17 | train[,Date:=as.Date(Date)] 18 | train[,month:=as.integer(format(Date, "%m"))] 19 | train[,year:=as.integer(format(Date, "%y"))] 20 | train ## make sure the calculations came out as expected 21 | 22 | ## Use H2O's random forest 23 | ## Start cluster with all available threads 24 | h2o.init(nthreads=-1,max_mem_size='6G') 25 | ## Load data into cluster from R 26 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 27 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 28 | dim(trainHex); dim(validHex) 29 | 30 | ## Set up variable to use all features other than those specified here 31 | features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers"))] 32 | ## Train a random forest using all default parameters 33 | rfHex <- h2o.randomForest(x=features, 34 | y="Sales", 35 | training_frame=trainHex, 36 | validation_frame=validHex, 37 | model_id="introRF") 38 | 39 | ## Now let's see how it performed 40 | 41 | ## The specific competition error metric is not defined, so we will define it 42 | rmspe<-function(actuals,predictions){return(mean(((actuals[actuals>0]-predictions[actuals>0])/actuals[actuals>0])^2)^0.5)} 43 | ## Create a data frame with the actuals and H2O predictions 44 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 45 | 46 | ## Score the predictions 47 | rmspe(judge[,1],judge[,2]) 48 | 49 | ## And now take a look at the model summary 50 | summary(rfHex) 51 | 52 | ## How good is that model? 53 | theMean<-mean(trainHex$Sales) 54 | theMedian<-median(as.data.frame(trainHex$Sales)[,1]) 55 | theLogMean<-expm1(mean(log1p(as.data.frame(trainHex$Sales)[,1]))) 56 | rmspe(judge[,1],rep(theMean,nrow(judge))) 57 | rmspe(judge[,1],rep(theMedian,nrow(judge))) 58 | rmspe(judge[,1],rep(theLogMean,nrow(judge))) 59 | 60 | ## Seems like it's great. But is it? 61 | 62 | train[Store==1,.(Sales,Date)][order(Date),plot(Sales)] 63 | 64 | ## A lot of 0s 65 | train[,.(.N,ZeroRate=sum(Sales==0)/.N,AvgSales=mean(Sales),AvgSalesNoZero=mean(ifelse(Sales==0,NA,Sales),na.rm=T))] 66 | 67 | ## Look at example records with 0 Sales 68 | train[Sales==0,] 69 | 70 | ## Summarize by Open status 71 | train[,.(.N,mean(Sales)),Open] ## data table syntax, SQL style: [i,j,by] = [WHERE,SELECT,GROUP BY] 72 | 73 | ## So the model is learning the Open status well, as we saw from the summary showing it as the most important column 74 | ## In practice, this is fine, as it's an easy and obvious way to separate the data. 75 | ## Nobody will be impressed by our ability to separate Open/Closed stores really well. 76 | ## But there are two considerations, still. One is that it is so specific, it might be better to separate the data anyway 77 | ## via a rule-based system. The rationale is that with anything that samples columns, it might try and fit a tree without 78 | ## vital information. And with 17k records at exactly 0 sales, plus the intuitive concept that Closed = 0 sales, there 79 | ## should be no advantage leaving that data in. 80 | ## So let's try removing it. 81 | ## Additionally, we are not scored on results where the actuals are 0. That is for math simplicity on Kaggle's side. 82 | ## In practice, you do not want to be so leniant. 
If your model ignores non-zero predictions when the actual was zero 83 | ## you will have a problem with whatever action is connected to your predictions (e.g. setting production volume) 84 | ## So for both reasons, we will remove the 0 sales and conduct all future modeling without these in the training set 85 | 86 | train <- train[Sales > 0,] ## We are not judged on 0 sales records in test set 87 | 88 | ## Now let's try to retrain the model and see if there is a difference. 89 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 90 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 91 | dim(trainHex); dim(validHex) 92 | rfHex <- h2o.randomForest(x=features,y="Sales",training_frame=trainHex,validation_frame=validHex,model_id="introRF") 93 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 94 | rmspe(judge[,1],judge[,2]) 95 | 96 | ## Great. The model is better. What next? 97 | summary(rfHex) 98 | 99 | ## Odd that Store is less important than CompetitionDistance. 100 | ## We have chosen to solve a time series problem with decision trees, which should require heavy use 101 | ## of the main two dimensions (store & time) 102 | class(train$Store) 103 | 104 | ## Change integer encoding to a factor: should be no logical reason why an ID number encodes any value in the numbering scheme 105 | train[,Store:=as.factor(as.numeric(Store))] 106 | 107 | ## Now let's try to retrain the model and see if there is a difference. 108 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 109 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 110 | dim(trainHex); dim(validHex) 111 | rfHex <- h2o.randomForest(x=features,y="Sales",training_frame=trainHex,validation_frame=validHex,model_id="introRF", 112 | nbins_cats=1115) 113 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 114 | rmspe(judge[,1],judge[,2]) 115 | 116 | ## More improvement. What next? 117 | 118 | ## log transformation to not be as sensitive to high sales 119 | ## decent rule of thumb: 120 | ## if the data spans an order of magnitude, consider a log transform 121 | train[,logSales:=log1p(Sales)] 122 | features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers","logSales"))] 123 | 124 | ## Run through experimentation cycle again 125 | trainHex<-as.h2o(train[year <15 | month <6,],destination_frame = "trainHex") 126 | validHex<-as.h2o(train[year == 15 & month >= 6,],destination_frame = "validHex") 127 | dim(trainHex); dim(validHex) 128 | rfHex <- h2o.randomForest(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introRF", 129 | nbins_cats=1115) 130 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(rfHex,validHex))[,1])) 131 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 132 | 133 | ## There is a lot more experimentation that can be done with features. 134 | ## But for the moment, we'll try some other modeling choices at this point. 
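## (one sketch of that, not run here) further date features in the same data.table
## style, e.g. day-of-week and day-of-month, then recompute `features` so the new
## columns are picked up:
## train[,dayOfWeek:=as.integer(format(Date, "%u"))]
## train[,dayOfMonth:=as.integer(format(Date, "%d"))]
## features<-colnames(train)[!(colnames(train) %in% c("Id","Date","Sales","Customers","logSales"))]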
135 | ## First, let's try a GBM 136 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 137 | nbins_cats=1115) 138 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 139 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 140 | 141 | ## It ran faster, but is less accurate. Let's add a stochastic element to it 142 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 143 | nbins_cats=1115,sample_rate = 0.7,col_sample_rate = 0.7,max_depth = 10,learn_rate=0.05) 144 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 145 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 146 | 147 | ## Still not matching RF accuracy 148 | gbmHex <- h2o.gbm(x=features,y="logSales",training_frame=trainHex,validation_frame=validHex,model_id="introGBM", 149 | nbins_cats=1115,sample_rate = 0.5,col_sample_rate = 0.5,max_depth = 15,learn_rate=0.05,ntrees = 50) 150 | judge<-as.data.frame(cbind(as.data.frame(validHex$Sales),as.data.frame(h2o.predict(gbmHex,validHex))[,1])) 151 | rmspe(judge[,1],expm1(judge[,2])) ## added expm1 term to undo the log transformation for real results 152 | 153 | ## Closer. To really compare the two, we'd likely want to do a grid search here. 154 | 155 | 156 | ## When we like our model, predict on test to see if our out of sample translates reasonably well 157 | ## Note: this wasn't run at the Data Science Camp 158 | test <- fread("../input/test.csv",stringsAsFactors = T) 159 | test <- merge(test,store,by="Store") 160 | test[,Date:=as.Date(Date)] 161 | test[,month:=as.integer(format(Date, "%m"))] 162 | test[,year:=as.integer(format(Date, "%y"))] 163 | test[,Store:=as.factor(as.numeric(Store))] 164 | summary(test) 165 | 166 | testHex<-as.h2o(test) 167 | testPredictions<-expm1(as.data.frame(h2o.predict(rfHex,testHex))[,1]) 168 | summary(testPredictions) 169 | -------------------------------------------------------------------------------- /see_click_predict_fix_basis.R: -------------------------------------------------------------------------------- 1 | train<-read.csv("train.csv") 2 | train<-train[train$num_votes!=327,] ##remove large outlier 3 | train<-train[train$num_views<1500,] ##remove large outlier 4 | train$created_time<-strptime(train$created_time,"%Y-%m-%d %H:%M:%S") 5 | #train<-train[as.Date(train$created_time)>as.Date("2012-10-14"),] 6 | train<-train[as.Date(train$created_time)>as.Date("2013-01-31"),] 7 | 8 | test<-read.csv("test.csv") 9 | test$created_time<-strptime(test$created_time,"%Y-%m-%d %H:%M:%S") 10 | 11 | ######################### 12 | ##### Functions ######### 13 | triRmsle<-function(p1,a1,p2,a2,p3,a3){ 14 | require(Metrics) 15 | a<-rmsle(rbind(p1,p2,p3),rbind(a1,a2,a3)) 16 | return(a) 17 | } 18 | quickScalarRmsle<-function(df,scalars){return(triRmsle(df[,1]*scalars[,1],df[,1],df[,2]*scalars[,1],df[,2],df[,3]*scalars[,1],df[,3]))} 19 | ##usage: quickScalarRmsle(targets,cbind(targets[,1]*0,targets[,2]*0,targets[,3]*0)) ##0.7330985 20 | 21 | getGbmFit<-function(x,y,t){require(gbm); GBM_NTREES = t; GBM_SHRINKAGE = 0.05; GBM_DEPTH = 4; GBM_MINOBS = 30; 22 | return(gbm.fit(x = x,y = y,distribution = "gaussian",n.trees = GBM_NTREES,shrinkage = GBM_SHRINKAGE,interaction.depth = GBM_DEPTH,n.minobsinnode = GBM_MINOBS,verbose = FALSE))} 23 | 
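##getGbmFit: x = feature data.frame, y = target (log-scaled where used below), t = number of trees;
##shrinkage/depth/min-obs are fixed inside; usage below: getGbmFit(train[,9:ncol(train)], log(1+train[,2]), trees)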
24 | getDistributionTable<-function(singleColumnDF,n){ 25 | require(sqldf) 26 | colnames(singleColumnDF)<-"x" 27 | out<-as.data.frame(sqldf(paste("SELECT x FROM singleColumnDF GROUP BY x ORDER BY COUNT(*) DESC LIMIT",n))) 28 | return(out) 29 | } 30 | 31 | binarizeColumn <- function(data,valListDF,colNum,remove=TRUE){ 32 | for(i in 1:nrow(valListDF)){ 33 | newCol<-ifelse(data[,colNum]==valListDF[i,1],1,0) 34 | data<-as.data.frame(cbind(data,newCol)) 35 | colnames(data)[ncol(data)]<-as.character(valListDF[i,1]) 36 | } 37 | if(remove) {data[,colNum]<-NULL} 38 | return(data) 39 | } 40 | 41 | ###### End of functions ####### 42 | ############################### 43 | 44 | 45 | x<-test[,c(1,1,1,1:8)] 46 | colnames(x)<-colnames(train)[c(6,7,8,1:5,9:11)] 47 | x<-as.data.frame(rbind(train[,c(6,7,8,1:5,9:11)],x)) 48 | x<-as.data.frame(cbind(seq(1:nrow(x)),x)) 49 | colnames(x)[1]<-"sortNum" 50 | 51 | a1<-1; a2<-nrow(train); b1<-nrow(train)+1; b2<-nrow(x) 52 | a1;a2;b1;b2 53 | 54 | city<-as.factor(ifelse(x$longitude<(-100),"Oakland",ifelse(x$longitude<(-82),"Chicago",ifelse(x$longitude<(-75),"Richmond","New Haven")))) 55 | descLength<-round(log(nchar(as.character(x$description))+1),0) 56 | descSummary<-round(log(nchar(as.character(x$summary))+1),0) 57 | latlong2_str<-as.factor(paste(round(x$latitude/2,2)*2,round(x$longitude/2,2)*2,sep='_')) 58 | hrOfDay<-as.factor(substr(x$created_time,12,13)) 59 | dayOfWeek <- as.factor(weekdays(as.Date(x$created_time))) 60 | l10sum<-as.factor(tolower(substr(gsub(" ","",gsub("[[:punct:]]","",x$summary)),1,10))) 61 | ##left_10_desc<-as.factor(tolower(substr(gsub(" ","",gsub("[[:punct:]]","",x$description)),1,10))) 62 | x$latitude<-NULL; x$longitude<-NULL 63 | 64 | x<-as.data.frame(cbind(x,city,descLength,descSummary,latlong2_str,hrOfDay,dayOfWeek,l10sum)) 65 | 66 | levels(x$source)<-c(levels(x$source),"n/a"); x$source[is.na(x$source)]<-"n/a" 67 | levels(x$tag_type)<-c(levels(x$tag_type),"n/a"); x$tag_type[is.na(x$tag_type)]<-"n/a" 68 | 69 | ## binarization 70 | distL10Summ<-getDistributionTable(as.data.frame(x$l10sum),50) 71 | x<-binarizeColumn(x,distL10Summ,17) 72 | 73 | distlatlong2_str<-getDistributionTable(as.data.frame(x$latlong2_str),50) 74 | x<-binarizeColumn(x,distlatlong2_str,14) 75 | 76 | distdayOfWeek<-getDistributionTable(as.data.frame(x$dayOfWeek),7) 77 | x<-binarizeColumn(x,distdayOfWeek,15) 78 | 79 | disthrOfDay<-getDistributionTable(as.data.frame(x$hrOfDay),24) 80 | x<-binarizeColumn(x,disthrOfDay,14) 81 | 82 | distdescSummary<-getDistributionTable(as.data.frame(x$descSummary),6) 83 | x<-binarizeColumn(x,distdescSummary,13) 84 | 85 | distdescLength<-getDistributionTable(as.data.frame(x$descLength),6) 86 | x<-binarizeColumn(x,distdescLength,12) 87 | 88 | distcity<-getDistributionTable(as.data.frame(x$city),4) 89 | x<-binarizeColumn(x,distcity,11) 90 | 91 | disttag_type<-getDistributionTable(as.data.frame(x$tag_type),20) 92 | x<-binarizeColumn(x,disttag_type,10) 93 | 94 | distsource<-getDistributionTable(as.data.frame(x$source),9) 95 | x<-binarizeColumn(x,distsource,8) 96 | 97 | a1<-1; a2<-nrow(train); b1<-nrow(train)+1; b2<-nrow(x) 98 | a1;a2;b1;b2 99 | train<-x[x$sortNum<=a2,] 100 | test<-x[x$sortNum>=b1,] 101 | dim(train); dim(test) 102 | 103 | trees<-100 104 | fitVotes<-getGbmFit(train[,9:ncol(train)], log(1+train[,2]), trees) 105 | fitComments<-getGbmFit(train[,9:ncol(train)], log(1+train[,3]), trees) 106 | fitViews<-getGbmFit(train[,9:ncol(train)], log(1+train[,4]), trees) 107 | 108 | cvFullVotes<-predict(object = fitVotes,newdata=train[,9:ncol(train)], type="response",trees)
109 | cvFullComments<-predict(object = fitComments,newdata=train[,9:ncol(train)], type="response",trees) 110 | cvFullViews<-predict(object = fitViews,newdata=train[,9:ncol(train)], type="response",trees) 111 | 112 | testVotesNS<-predict(object = fitVotes,newdata=test[,9:ncol(test)], type="response",trees) 113 | testCommentsNS<-predict(object = fitComments,newdata=test[,9:ncol(test)], type="response",trees) 114 | testViewsNS<-predict(object = fitViews,newdata=test[,9:ncol(test)], type="response",trees) 115 | 116 | quickScalarRmsle(cbind(train[,2],train[,3],train[,4]),cbind(exp(cvFullVotes)-1,exp(cvFullComments)-1,exp(cvFullViews)-1)) 117 | summary(fitViews) 118 | summary(fitVotes) 119 | summary(fitComments) 120 | 121 | submission<-as.data.frame(cbind(test[,5],round(exp(testViewsNS)-1,3),round(exp(testVotesNS)-1,3),round(exp(testCommentsNS)-1,3))) 122 | colnames(submission)<-c("id","num_views","num_votes","num_comments") 123 | submission$num_views<-pmax(submission$num_views,0) 124 | submission$num_votes<-pmax(submission$num_votes,0) 125 | submission$num_comments<-pmax(submission$num_comments,0) 126 | t<-gsub("[[:punct:]]", "", as.character(Sys.time())) 127 | fn<-paste("submission_",gsub(" ","_",t),".csv",sep='') 128 | write.table(submission,fn, append = FALSE, sep=',', row.names=FALSE, quote=FALSE, col.names=TRUE) 129 | -------------------------------------------------------------------------------- /will_it_rain/histogram_starter_model.R: -------------------------------------------------------------------------------- 1 | ##################################################### 2 | ## Starter Script for Kaggle Will it Rain competition 3 | ##################################################### 4 | ## 5 | ## This is a script to show a simple alternative to 6 | ## using a sigmoid distribution. 7 | ## It is essentially a histogram prediction, using 8 | ## coarse measurements of the same RR1 value used 9 | ## in the provided benchmark.
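## The required submission has 70 columns per Id (Predicted0..Predicted69), each
## the cumulative probability that rain <= that many mm, which is why the
## histogram below is converted into a non-decreasing cumulative distribution.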
10 | ## 11 | ## Packages: data.table 12 | ## 13 | ## Performance: 0.00872817 14 | ## Relative performance: 37th out of 189; improves upon 15 | ## provided benchmark (0.01177621), and using all 1's (0.01017651) 16 | ###################################################### 17 | start<-Sys.time() 18 | #setwd("directory where the data files are located, if not getwd()") 19 | 20 | ## read in a fixed number of rows (increase for improved accuracy) 21 | ## you can use read.csv in place of data.table; it's just much slower 22 | library(data.table) 23 | train<-fread("train_2013.csv",nrows=250000,select=c("RR1","Expected")) 24 | 25 | ## parse the readings inside each RR1 value, which are space-delimited, and take the mean of all values 26 | rr1.mean<-unlist(lapply(train[,RR1], function(x) mean(pmax(0,pmin(70,as.numeric(strsplit(x," ")[[1]])))))) 27 | ## in case there are any NAs, replace with 0 28 | rr1.mean[is.na(rr1.mean)]<-0 29 | 30 | ## look at the data 31 | ## notice most observations are 0, and after that most between 0 and 0.5 32 | ## so using typical quantiles or breaks will merely subdivide a really small range of measurements 33 | ## So instead, we break off the most popular buckets, and then divide the remainder of the RR1 distribution 34 | ## So we are asking for the ranges of RR1, but forcing the first and second range starts to be 0 and 0.5 35 | table(round(rr1.mean,1)) 36 | breaks<-c(-Inf,0,quantile(rr1.mean[rr1.mean>0.5],c(0:8)/8)) 37 | names(breaks)<-NULL 38 | ##store the break length to use throughout the rest of the code, in case you want to measure more breaks than 10 39 | n<-length(breaks)-1 40 | 41 | ## create a matrix to store predictions for key points along the RR1 distribution 42 | observations<-matrix(,n,10) 43 | 44 | ## now fill in with empirical values 45 | for(i in 1:n){ 46 | ## for the various cutpoints (breaks) in the RR1 distribution, measure the frequency 47 | ## of the bottom 10 levels of the target value (0mm,1mm,2mm,...9mm) 48 | observations[i,1:10]<-hist( 49 | pmin(70,pmax(0,train[,Expected][rr1.mean>breaks[i]&rr1.mean<=breaks[i+1]])), 50 | plot=FALSE, 51 | breaks=c(0:(n-1),70) 52 | )$density 53 | } 54 | 55 | ## convert histogram to cumulative distribution 56 | ## this ensures we have non-decreasing predictions as well (requirement) 57 | for(i in 2:n){observations[,i]<-observations[,(i-1)]+observations[,i]} 58 | 59 | ## look over the values again 60 | ## looks a little peculiar in that some higher ranges have lower outcomes, but it is empirical 61 | observations 62 | 63 | ## now construct a prediction by using the matrix as a lookup table to all 70 predictions 64 | ## i.e. we'll find the RR1 value for every record, figure out which row in our small prediction table 65 | ## it corresponds to, and use that entire row of the matrix as our prediction "vector" 66 | 67 | test<-fread("test_2014.csv",select=c("Id","RR1")) 68 | 69 | ## parse the readings inside each RR1 value, which are space-delimited 70 | rr1.mean<-unlist(lapply(test[,RR1], function(x) mean(pmax(0,pmin(70,as.numeric(strsplit(x," ")[[1]])))))) 71 | rr1.mean[is.na(rr1.mean)]<-0 72 | 73 | ## seed the predictions with 1, which means it will rain <= each column 100% of the time 74 | predictions<-as.data.frame(cbind(test$Id,matrix(1,nrow=nrow(test),ncol=70))) 75 | colnames(predictions)<-c("Id",paste0("Predicted",(seq(1:70)-1))) 76 | 77 | ## override the 100% values with our table lookup; want to improve this? 
smooth the edge 78 | for(i in 1:n){predictions[rr1.mean>breaks[i]&rr1.mean<=breaks[i+1],2:(n+1)]<-as.data.frame(t(observations[i,1:n]))} 79 | 80 | ## output predictions; outputs as 184MB, but compresses to 4.3MB (lot of 1's) 81 | ##scipen turns off scientific notation (for ID values such as 100000); Kaggle rejects those 82 | options("scipen"=100, "digits"=8) 83 | write.table(predictions,"histogram_benchmark.csv",quote = FALSE, sep = ",",row.names=FALSE) 84 | 85 | ##how long did it take (2.9 minutes on a Windows i5 with 6GB memory and slow HDD) 86 | stop<-Sys.time() 87 | stop-start 88 | --------------------------------------------------------------------------------