├── inst ├── SVM.png ├── WtVsHt.png ├── tests │ ├── README │ ├── KNNTests │ ├── QuickTests.R~ │ └── QuickTests.R ├── BMIfitted.png ├── BMIhist.png ├── RpartVert.png ├── BMIfitwell.png ├── VertebraeNN.png ├── vert1hidden.png ├── images │ ├── ROC45.png │ ├── PrgEngFit.png │ ├── PrgengVar.png │ ├── ParVsNonpar.png │ ├── RedSurroundsBlack.png │ └── README ├── ObjFtnPlusTangent.png ├── vn.save │ └── vignettes │ │ ├── parvsnonpar.png │ │ ├── varvsmean.png │ │ ├── wagevsage.png │ │ ├── regtools.Rmd │ │ ├── regtools.Rmd.save │ │ ├── regtools.Rmd~ │ │ ├── regtools.Rnw │ │ └── regtools.Rnw.save ├── sdss2020 │ ├── BestK.R │ ├── z │ ├── WeightedDistCensus.R │ ├── PrgEng.R │ ├── Pima.R │ ├── WeightedDistDiab.R │ ├── MahalanobisSong.R │ ├── FineTune.R~ │ ├── ExpandGraph.R │ ├── FineTune.R │ ├── LocLinStudy.R │ └── LocLinStudy.R~ ├── README.ClearingConfusion ├── RecSysLinModels.md ├── ScalingInPCA.md ├── InterpretedR.md ├── PoissonReg.md ├── DstrFit.md └── ChoosingKinKFoldCV.md ├── data ├── day.RData ├── day1.RData ├── day2.RData ├── mlb.txt.gz ├── pef.RData ├── english.RData ├── mlens.RData ├── peDumms.RData ├── phoneme.RData ├── prgeng.RData ├── quizzes.RData ├── yell10k.RData ├── SwissRoll.RData ├── newadult.RData ├── peFactors.RData ├── quizDocs.RData ├── weatherTS.RData ├── oliveoils.txt.gz ├── courseRecords.RData ├── falldetection.RData └── ltrfreqs.txt ├── vignettes ├── PrgengVar.png └── ParVsNonpar.png ├── man ├── yell10k.Rd ├── falldetection.Rd ├── phoneme.Rd ├── weatherTS.Rd ├── ltrfreqs.Rd ├── SwissRoll.Rd ├── oliveoils.Rd ├── mlb.Rd ├── english.Rd ├── mlens.Rd ├── currency.Rd ├── courseRecords.Rd ├── day.Rd ├── newadult.Rd ├── quizDocs.Rd ├── nlshc.Rd ├── Penrose.Rd ├── ridgelm.Rd ├── Graphics.Rd ├── unscale.Rd ├── mm.Rd ├── prgeng.Rd ├── textToXY.Rd ├── TS.Rd ├── regtools-package.Rd ├── misc.Rd ├── krsFit.Rd ├── ovalogtrn.Rd ├── lmac.Rd ├── FineTuning.Rd ├── factorsDummies.Rd └── Quick.Rd~ ├── R ├── z.R ├── onAttach.R ├── FormulaWrappers.R ├── Nonlin.R ├── Ridge.R ├── DimRed.R ├── Text.R ├── MM.R ├── TS.R ├── Penrose.R └── AC.R ├── .gitignore ├── DESCRIPTION └── NAMESPACE /inst/SVM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/SVM.png -------------------------------------------------------------------------------- /data/day.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day.RData -------------------------------------------------------------------------------- /data/day1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day1.RData -------------------------------------------------------------------------------- /data/day2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day2.RData -------------------------------------------------------------------------------- /data/mlb.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/mlb.txt.gz -------------------------------------------------------------------------------- /data/pef.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/pef.RData 
-------------------------------------------------------------------------------- /inst/WtVsHt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/WtVsHt.png -------------------------------------------------------------------------------- /inst/tests/README: -------------------------------------------------------------------------------- 1 | 2 | The file xTests.R consists of tests for ../R/x.R. 3 | 4 | -------------------------------------------------------------------------------- /data/english.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/english.RData -------------------------------------------------------------------------------- /data/mlens.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/mlens.RData -------------------------------------------------------------------------------- /data/peDumms.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/peDumms.RData -------------------------------------------------------------------------------- /data/phoneme.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/phoneme.RData -------------------------------------------------------------------------------- /data/prgeng.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/prgeng.RData -------------------------------------------------------------------------------- /data/quizzes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/quizzes.RData -------------------------------------------------------------------------------- /data/yell10k.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/yell10k.RData -------------------------------------------------------------------------------- /inst/BMIfitted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIfitted.png -------------------------------------------------------------------------------- /inst/BMIhist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIhist.png -------------------------------------------------------------------------------- /inst/RpartVert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/RpartVert.png -------------------------------------------------------------------------------- /data/SwissRoll.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/SwissRoll.RData -------------------------------------------------------------------------------- /data/newadult.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/matloff/regtools/HEAD/data/newadult.RData -------------------------------------------------------------------------------- /data/peFactors.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/peFactors.RData -------------------------------------------------------------------------------- /data/quizDocs.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/quizDocs.RData -------------------------------------------------------------------------------- /data/weatherTS.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/weatherTS.RData -------------------------------------------------------------------------------- /inst/BMIfitwell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIfitwell.png -------------------------------------------------------------------------------- /inst/VertebraeNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/VertebraeNN.png -------------------------------------------------------------------------------- /inst/vert1hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vert1hidden.png -------------------------------------------------------------------------------- /data/oliveoils.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/oliveoils.txt.gz -------------------------------------------------------------------------------- /inst/images/ROC45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/ROC45.png -------------------------------------------------------------------------------- /vignettes/PrgengVar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/vignettes/PrgengVar.png -------------------------------------------------------------------------------- /data/courseRecords.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/courseRecords.RData -------------------------------------------------------------------------------- /data/falldetection.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/falldetection.RData -------------------------------------------------------------------------------- /inst/images/PrgEngFit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/PrgEngFit.png -------------------------------------------------------------------------------- /inst/images/PrgengVar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/PrgengVar.png 
-------------------------------------------------------------------------------- /vignettes/ParVsNonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/vignettes/ParVsNonpar.png -------------------------------------------------------------------------------- /inst/ObjFtnPlusTangent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/ObjFtnPlusTangent.png -------------------------------------------------------------------------------- /inst/images/ParVsNonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/ParVsNonpar.png -------------------------------------------------------------------------------- /inst/images/RedSurroundsBlack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/RedSurroundsBlack.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/parvsnonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/parvsnonpar.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/varvsmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/varvsmean.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/wagevsage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/wagevsage.png -------------------------------------------------------------------------------- /man/yell10k.Rd: -------------------------------------------------------------------------------- 1 | \name{yell10k} 2 | \alias{yell10k} 3 | 4 | \title{ 5 | New York Taxi Data 6 | } 7 | 8 | \description{ 9 | From public data on New York City taxi trips. 
10 | } 11 | 12 | -------------------------------------------------------------------------------- /R/z.R: -------------------------------------------------------------------------------- 1 | 2 | library(mvtnorm) 3 | n <- 500 4 | cv <- rbind(c(1,0.2),c(0.2,1)) 5 | xy <- NULL 6 | for (i in 1:3) 7 | xy <- rbind(xy,rmvnorm(n,mean=rep(i*0.5,2),sigma=cv)) 8 | xy <- cbind(xy,rep(0:2,each=n)) -------------------------------------------------------------------------------- /inst/sdss2020/BestK.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | data(peDumms) 4 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 5 | x <- ped[,-10] 6 | y <- ped[,10] 7 | bkpp <- bestKperPoint(x,y,50) 8 | plot(density(bkpp)) 9 | 10 | 11 | -------------------------------------------------------------------------------- /inst/sdss2020/z: -------------------------------------------------------------------------------- 1 | 2 | tiff('LocLin.tiff',width=6,height=6,unit='in',res=800) 3 | plottingSim() 4 | dev.off() 5 | system('convert LocLin.tiff LocLinTiff.jpg') 6 | system('mv LocLin* ~/Research/SDSSknn') 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /inst/sdss2020/WeightedDistCensus.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | data(peDumms) 4 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 5 | x <- ped[,-10] 6 | y <- ped[,10] 7 | plotExpVars(x,y,x,y,25,c(1,2,8,9,10),1.5,lossFtn='MAPE',ylim=c(23000,25000)) 8 | 9 | -------------------------------------------------------------------------------- /man/falldetection.Rd: -------------------------------------------------------------------------------- 1 | \name{falldetection} 2 | \alias{falldetection} 3 | 4 | \title{ 5 | Fall Detection Data 6 | } 7 | 8 | \description{ 9 | Detecting falls in the elderly via physiological measurements. 10 | Obtained from Kaggle. 11 | } 12 | 13 | -------------------------------------------------------------------------------- /R/onAttach.R: -------------------------------------------------------------------------------- 1 | 2 | .onAttach <- function(libname, pkgname) { 3 | packageStartupMessage( 4 | '\n\n\n\n\n*********************\n\n\n\nLatest version of regtools at GitHub.com/matloff\n\n\nType ?regtools to see function list by category\n\n\n\n') 5 | } 6 | 7 | -------------------------------------------------------------------------------- /man/phoneme.Rd: -------------------------------------------------------------------------------- 1 | \name{phoneme} 2 | \alias{phoneme} 3 | 4 | \title{ 5 | Phoneme Data 6 | } 7 | 8 | \description{ 9 | Phoneme detection, 2 types. Features are from harmonic analysis of the 10 | voice. From OpenML, \url{https://www.openml.org/d/1489}.
11 | } 12 | 13 | -------------------------------------------------------------------------------- /inst/sdss2020/PrgEng.R: -------------------------------------------------------------------------------- 1 | 2 | prgengExpVars <- function() 3 | { 4 | data(peDumms) 5 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 6 | x <- ped[,-10] 7 | y <- ped[,10] 8 | plotExpVars(x,y,x,y,25,c(1,2,8,9,10),1.5,lossFtn='MAPE',ylim=c(23500,25000)) 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/weatherTS.Rd: -------------------------------------------------------------------------------- 1 | \name{weatherTS} 2 | \alias{weatherTS} 3 | 4 | \title{ 5 | Weather Time Series 6 | } 7 | 8 | \description{ 9 | Various measurements on weather variables collected by NASA. Downloaded 10 | via \code{nasapower}; see that package for documentation. 11 | } 12 | 13 | -------------------------------------------------------------------------------- /man/ltrfreqs.Rd: -------------------------------------------------------------------------------- 1 | \name{ltrfreqs} 2 | \alias{ltrfreqs} 3 | 4 | \title{ 5 | Letter Frequencies 6 | } 7 | 8 | \description{ 9 | 10 | This data consists of capital letter frequencies obtained at 11 | http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html 12 | 13 | } 14 | 15 | -------------------------------------------------------------------------------- /man/SwissRoll.Rd: -------------------------------------------------------------------------------- 1 | \name{SwissRoll} 2 | \alias{SwissRoll} 3 | \alias{sw} 4 | 5 | \title{ 6 | Swiss Roll 7 | } 8 | 9 | \description{ 10 | See \url{http://people.cs.uchicago.edu/~dinoj/manifold/swissroll.html} 11 | for this version of Swiss Roll. 12 | 13 | Running \code{data(SwissRoll)} produces an object \code{sw}. 14 | 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /man/oliveoils.Rd: -------------------------------------------------------------------------------- 1 | \name{oliveoils} 2 | \alias{oliveoils} 3 | 4 | \title{ 5 | Italian olive oils data set. 6 | } 7 | 8 | \description{ 9 | Italian olive oils data set, as used in \emph{Graphics of Large 10 | Datasets: Visualizing a Million}, by Antony Unwin, Martin Theus and 11 | Heike Hofmann, Springer, 2006. Included here with permission of Dr. 12 | Martin Theus. 13 | } 14 | 15 | -------------------------------------------------------------------------------- /man/mlb.Rd: -------------------------------------------------------------------------------- 1 | \name{mlb} 2 | \alias{mlb} 3 | 4 | \title{ 5 | Major League Baseball player data set. 6 | } 7 | 8 | \description{ 9 | Heights, weights, ages etc. of major league baseball players. A new 10 | variable has been added, consolidating positions into Infielders, 11 | Outfielders, Catchers and Pitchers. 12 | 13 | Included here with the permission of the UCLA Statistics Department. 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/english.Rd: -------------------------------------------------------------------------------- 1 | \name{english} 2 | \alias{english} 3 | 4 | \title{ 5 | English vocabulary data 6 | } 7 | 8 | \description{ 9 | 10 | The Stanford WordBank data on vocabulary acquisition in young children. 11 | The file consists of about 5500 rows. (There are many NA values, though, 12 | and only about 2800 complete cases.)
Variables are age, birth order, 13 | sex, mother's education and vocabulary size. 14 | 15 | } 16 | 17 | -------------------------------------------------------------------------------- /inst/images/README: -------------------------------------------------------------------------------- 1 | 2 | RedSurroundsBlack.png: 3 | 4 | Predict gender from various features, then plot predicted values against 2 5 | of them, age and income. Female spots surrounded! 6 | 7 | data(peDumms) 8 | pe <- peDumms 9 | pe <- pe[,c(1,20:29,31,32)] 10 | kout <- kNN(pe[,-11],pe[,11],kmax=25,classif=T) 11 | preds <- kout$ypreds + 1 12 | plot(pe$age,pe$wageinc,col=preds,pch=16) 13 | 14 | 15 | -------------------------------------------------------------------------------- /inst/sdss2020/Pima.R: -------------------------------------------------------------------------------- 1 | 2 | pimaExpVars <- function() 3 | { 4 | library(mlbench) 5 | data(PimaIndiansDiabetes2) 6 | diab <- PimaIndiansDiabetes2 7 | db <- diab[setdiff(names(diab),c('triceps','insulin'))] 8 | db <- db[complete.cases(db),] 9 | x <- as.matrix(db[,-7]) 10 | y <- as.numeric(db[,7] == 'pos') 11 | plotExpVars(x,y,x,y,25,1:6,1.5,'probIncorrectClass',c(0.2,0.35),leave1out=TRUE) 12 | } 13 | 14 | 15 | -------------------------------------------------------------------------------- /man/mlens.Rd: -------------------------------------------------------------------------------- 1 | \name{mlens} 2 | \alias{mlens} 3 | 4 | \title{ 5 | MovieLens User Summary Data 6 | } 7 | 8 | \description{ 9 | The MovieLens dataset, \url{https://grouplens.org/}, 10 | is a standard example in the recommender systems literature. Here we 11 | give demographic data for each user, plus the mean rating and number of 12 | ratings. One may explore, for instance, the relation between ratings 13 | and age. 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/currency.Rd: -------------------------------------------------------------------------------- 1 | \name{currency} 2 | \alias{currency} 3 | 4 | \title{ 5 | Pre-Euro Era Currency Fluctuations 6 | } 7 | 8 | \description{ 9 | From Wai Mun Fong and Sam Ouliaris, "Spectral Tests of the Martingale 10 | Hypothesis for Exchange Rates", Journal of Applied Econometrics, Vol. 11 | 10, No. 3, 1995, pp. 255-271. Weekly exchange rates against US dollar, 12 | over the period 7 August 1974 to 29 March 1989. 13 | } 14 | 15 | -------------------------------------------------------------------------------- /R/FormulaWrappers.R: -------------------------------------------------------------------------------- 1 | 2 | # basic idea: some popular packages for regression, classification and 3 | # machine learning do not accommodate specifying Y and X via an R 4 | # formula, e.g.
weight ~ height+age; this file contains wrappers to 5 | # allow this 6 | 7 | # also allowed will be factor-valued X 8 | 9 | # note that the generic predict() functions must also be wrappers 10 | 11 | # the suffix 'W' will be used to indicate "wrapper" 12 | 13 | -------------------------------------------------------------------------------- /inst/sdss2020/WeightedDistDiab.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | library(mlbench) 4 | data(PimaIndiansDiabetes2) 5 | diab <- PimaIndiansDiabetes2 6 | db <- diab[setdiff(names(diab),c('triceps','insulin'))] 7 | db <- db[complete.cases(db),] 8 | head(db) 9 | x <- as.matrix(db[,-7]) 10 | y <- as.numeric(db[,7] == 'pos') 11 | set.seed(9999) 12 | tstidxs <- sample(1:nrow(x),100) 13 | xtst <- x[tstidxs,] 14 | ytst <- y[tstidxs] 15 | xtrn <- x[-tstidxs,] 16 | ytrn <- y[-tstidxs] 17 | plotExpVars(xtrn, ytrn, xtst,ytst,5,1:6,1.5,'propMisclass',c(0.5,0.9)) 18 | 19 | 20 | -------------------------------------------------------------------------------- /man/courseRecords.Rd: -------------------------------------------------------------------------------- 1 | \name{courseRecords} 2 | \alias{courseRecords} 3 | 4 | \title{ 5 | Records from several offerings of a certain course. 6 | } 7 | 8 | \description{ 9 | The data are in the form of an R list. Each element of the list 10 | corresponds to one offering of the course. Fields are: Class level; 11 | major (two different computer science majors, LCSI in Letters and 12 | Science and ECSE in engineering); quiz grade average (scale of 4.0, A+ 13 | counting as 4.3); homework grade average (same scale); and course letter 14 | grade. 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac system file 2 | .DS_Store 3 | 4 | # History files 5 | .Rhistory 6 | .Rapp.history 7 | 8 | # Session Data files 9 | .RData 10 | 11 | # User-specific files 12 | .Ruserdata 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # knitr and R markdown default cache directories 24 | *_cache/ 25 | /cache/ 26 | 27 | # Temporary files created by R markdown 28 | *.utf8.md 29 | *.knit.md 30 | 31 | # R Environment Variables 32 | .Renviron 33 | 34 | # pkgdown site 35 | docs/ 36 | 37 | # translation temp files 38 | po/*~ 39 | -------------------------------------------------------------------------------- /data/ltrfreqs.txt: -------------------------------------------------------------------------------- 1 | 'ltr' 'percent' 2 | E 12.02 3 | T 9.10 4 | A 8.12 5 | O 7.68 6 | I 7.31 7 | N 6.95 8 | S 6.28 9 | R 6.02 10 | H 5.92 11 | D 4.32 12 | L 3.98 13 | U 2.88 14 | C 2.71 15 | M 2.61 16 | F 2.30 17 | Y 2.11 18 | W 2.09 19 | G 2.03 20 | P 1.82 21 | B 1.49 22 | V 1.11 23 | K 0.69 24 | X 0.17 25 | Q 0.11 26 | J 0.10 27 | Z 0.07 28 | -------------------------------------------------------------------------------- /inst/README.ClearingConfusion: -------------------------------------------------------------------------------- 1 | 2 | A number of the files here comprise our Clearing the Confusion series, 3 | clarifying many topics that are widely misunderstood in statistics. 4 | Currently we have: 5 | 6 | ChoosingKinKFoldCV.md: Choosing the number of folds in k-fold 7 | cross-validation.
8 | 9 | NoPVals.md: Why p-values should not be used. 10 | 11 | ScalingInPCA.md: Arguing that use of scaling prior to PCA is problematic. 12 | 13 | PoissonReg.md: Why Poisson regression should be used only in some 14 | restrictive settings. 15 | 16 | UnbalancedClasses.md: Don't artificially balance data in classification 17 | problems. 18 | 19 | -------------------------------------------------------------------------------- /inst/sdss2020/MahalanobisSong.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | load('YearData.save') # obtain separately, data frame 'yr' 4 | yr <- yr[,seq(2,91,5)] 5 | idxs <- sample(1:nrow(yr),100000) 6 | yr1 <- yr[idxs,] 7 | idxs <- sample(1:nrow(yr1),5000) 8 | trn <- yr1[-idxs,] 9 | tst <- yr1[idxs,] 10 | xtrn <- trn[,-1] 11 | ytrn <- trn[,1] 12 | xtst <- tst[,-1] 13 | ytst <- tst[,1] 14 | knnout <- kNN(xtrn,ytrn,xtst,25) 15 | mhd <- knnout$mhdists 16 | far <- which(mhd > 150) 17 | xn <- xtst[far,] 18 | yn <- ytst[far] 19 | preds <- kNN(xtrn,ytrn,xn,25)$regests 20 | mean(abs(preds - yn)) 21 | preds <- kNN(xtrn,ytrn,xn,25,smoothingFtn=loclin)$regests 22 | mean(abs(preds - yn)) 23 | 24 | -------------------------------------------------------------------------------- /man/day.Rd: -------------------------------------------------------------------------------- 1 | \name{day,day1} 2 | \alias{day} 3 | \alias{day1} 4 | \alias{day2} 5 | 6 | \title{ 7 | Bike sharing data. 8 | } 9 | 10 | \description{ 11 | This is the Bike Sharing dataset (day records only) from the UC Irvine 12 | Machine Learning Dataset Repository. Included here with 13 | permission of Dr. Hadi Fanaee. 14 | 15 | The \code{day} data is as on UCI; \code{day1} is modified so that the 16 | numeric weather variables are on their original scale. 17 | 18 | The \code{day2} data is the same as \code{day1}, except that \code{dteday} 19 | has been removed, and \code{season}, \code{mnth}, \code{weekday} and 20 | \code{weathersit} have been converted to R factors. 21 | 22 | See \url{https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset} 23 | for details.
24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /inst/sdss2020/FineTune.R~: -------------------------------------------------------------------------------- 1 | 2 | # apply regtools:::fineTuning() to choosing the expansion factors in 3 | # kNN() 4 | 5 | knnFineTune <- function(data,yName,k,expandVars,ws,classif=FALSE, 6 | seed=9999) 7 | { 8 | if (classif) stop('not ready for classification problems') 9 | ycol <- which(names(data) == yName) 10 | 11 | theCall <- function(dtrn,dtst,cmbi) { 12 | x <- dtrn[,-ycol] 13 | y <- dtrn[,ycol] 14 | newx <- dtst[,-ycol] 15 | newy <- dtst[,ycol] 16 | knnout <- kNN(x,y,newx,k,expandVars=expandVars,expandVals=cmbi) 17 | mean(abs(knnout$regests - newy)) 18 | } 19 | 20 | wcols <- paste0('w',1:length(expandVars),'=ws',collapse=',') 21 | wcols <- paste0('list(',wcols,')') 22 | fineTuning(dataset=data,pars=eval(parse(text=wcols)),regCall=theCall, 23 | nXval=10) 24 | 25 | } 26 | 27 | fT <- fineTuning 28 | 29 | -------------------------------------------------------------------------------- /inst/tests/KNNTests: -------------------------------------------------------------------------------- 1 | 2 | data(mlb) 3 | mlb <- mlb[,c(4,6,5)] # height, age, weight 4 | # fit, then predict 75", age 21, and 72", age 32 5 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25) 6 | knnout$regests 7 | # [1] 201.84 195.72 8 | 9 | # fit now, predict later 10 | knnout <- kNN(mlb[,1:2],mlb[,3],NULL,25) 11 | predict(knnout,c(70,28)) 12 | # [1] 185.24 13 | 14 | # set saveNhbrs to TRUE to avoid re-doing the same computation 15 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25) 16 | knnout$regests 17 | # [1] 201.84 195.72 18 | # what about k = 20?; first, the direct way 19 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),20) 20 | knnout$regests 21 | # [1] 202.05 196.05 22 | # now the computation-reusing way 23 | knnout25 <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25, 24 | saveNhbrs=TRUE) 25 | knnout20 <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),20, 26 | savedNhbrs=knnout25$nhbrs) 27 | knnout20$regests 28 | 29 | -------------------------------------------------------------------------------- /man/newadult.Rd: -------------------------------------------------------------------------------- 1 | \name{newadult} 2 | \alias{newadult} 3 | \alias{newAdult} 4 | 5 | \title{ 6 | UCI adult income data set, adapted 7 | } 8 | 9 | \description{ 10 | This data set is adapted from 11 | the Adult data from the UCI Machine Learning Repository, 12 | which was in turn adapted from Census data on adult incomes and other 13 | demographic variables. The UCI data is used here with permission 14 | from Ronny Kohavi. 15 | 16 | The variables are: 17 | 18 | \itemize{ 19 | \item \code{gt50}, which converts the original \code{>50K} variable 20 | to an indicator variable; 1 for income greater than $50,000, else 0 21 | 22 | \item \code{edu}, which converts a set of education levels to 23 | approximate number of years of schooling 24 | 25 | \item \code{age} 26 | 27 | \item \code{gender}, 1 for male, 0 for female 28 | 29 | \item \code{mar}, 1 for married, 0 for single 30 | } 31 | 32 | Note that the education variable is now numeric. 
33 | 34 | } 35 | 36 | -------------------------------------------------------------------------------- /R/Nonlin.R: -------------------------------------------------------------------------------- 1 | 2 | # uses output of R's nls() to get an asymptotic covariance 3 | # matrix in general heteroscedastic case 4 | 5 | # arguments: 6 | # 7 | # nlsout: object of type 'nls' 8 | # 9 | # value: approximate covariance matrix for the 10 | # estimated parameter vector 11 | 12 | nlshc <- function(nlsout,type='HC') { 13 | # notation: g(t,b) is the regression model, 14 | # where t is the vector of variables for a 15 | # given observation; b is the estimated parameter 16 | # vector; x is the matrix of predictor values 17 | b <- coef(nlsout) 18 | m <- nlsout$m 19 | # y - g: 20 | resid <- m$resid() 21 | # row i of hmat will be deriv of g(x[i,],b) 22 | # with respect to b 23 | hmat <- m$gradient() 24 | # calculate the artificial "x" and "y" of 25 | # the algorithm 26 | xhm <- hmat 27 | yresidhm <- resid + hmat %*% b 28 | # -1 means no constant term in the model 29 | lmout <- lm(yresidhm ~ xhm - 1) 30 | # vcovHC(lmout); was getting NAs for some data sets 31 | sandwich::vcovHC(lmout,type) 32 | } 33 | -------------------------------------------------------------------------------- /inst/tests/QuickTests.R~: -------------------------------------------------------------------------------- 1 | 2 | # built-in data on major league baseball players 3 | data(mlb) 4 | mlb <- mlb[,3:6] # position, height, weight, age 5 | 6 | set.seed(9999) 7 | 8 | # fit models 9 | knnout <- qeKNN(mlb,'Weight',k=25) 10 | rfout <- qeRF(mlb,'Weight') 11 | 12 | # mean abs. pred. error on holdout set, in pounds 13 | knnout$testAcc 14 | # [1] 11.75644 15 | rfout$testAcc 16 | # [1] 12.6787 17 | 18 | # predict a new case 19 | newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 20 | predict(knnout,newx) 21 | [,1] 22 | # [1,] 204.04 23 | predict(rfout,newx) 24 | 11 25 | # 199.1714 26 | 27 | # how about some other ML methods? 28 | lassout <- qeLASSO(mlb,'Weight') 29 | lassout$testAcc 30 | # [1] 14.23122 31 | # poly reg, degree 3 32 | polyout <- qePoly(mlb,'Weight',3) 33 | polyout$testAcc 34 | # [1] 12.69412 35 | nnout <- qeNeural(mlb,'Weight') 36 | # ... 37 | nnout$testAcc 38 | # [1] 12.03419 39 | # try some nondefault hyperparams 40 | nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 41 | nnout$testAcc 42 | # [1] 15.8038 43 | 44 | -------------------------------------------------------------------------------- /man/quizDocs.Rd: -------------------------------------------------------------------------------- 1 | \name{quizDocs} 2 | \alias{quizDocs} 3 | \alias{quizzes} 4 | 5 | \title{ 6 | Course quiz documents 7 | } 8 | 9 | \description{ 10 | 11 | This data is suitable for NLP analysis. It consists of all the quizzes 12 | I've given in undergraduate courses, 143 quizzes in all. 13 | 14 | It is available in two forms. First, \code{quizzes} is a data.frame, 15 | 143 rows and 2 columns. Row i consists of a single character vector 16 | comprising the entire quiz i, followed by the course name (as an R 17 | factor). The second form is an R list, 143 elements. Each list element 18 | is a character vector, one vector element per line of the quiz. 19 | 20 | The original documents were LaTeX files. They have been run through the 21 | \code{detex} utility to remove most LaTeX commands, as well as removing 22 | the LaTeX preambles separately. 
23 | 24 | The names of the list elements are the course names, as follows: 25 | 26 | ECS 50: a course in machine organization 27 | 28 | ECS 132: an undergraduate course in probabilistic modeling 29 | 30 | ECS 145: a course in scripting languages (Python, R) 31 | 32 | ECS 158: an undergraduate course in parallel computation 33 | 34 | ECS 256: a graduate course in probabilistic modeling 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /inst/sdss2020/ExpandGraph.R: -------------------------------------------------------------------------------- 1 | 2 | # explore use of the ExpandVars arg 3 | 4 | # arguments: 5 | 6 | # xtrn: vector or matrix for "X" portion of training data 7 | # ytrn: vector or matrix for "Y" portion of training data; matrix 8 | # case is for vector "Y", i.e. multiclass 9 | # xtst,ytst: test data analogs of xtrn, ytrn 10 | # k: number of nearest neighbors 11 | # eVar: column number of the predictor to be expanded 12 | # maxEVal: maximum expansion 13 | # loss: name of the loss function; internal offerings are 'MAPE' and 'propMisclass' 14 | # incr: expansion value increment 15 | 16 | # value: 17 | 18 | # smoothed loss, evaluated at expansion values from incr to maxEVal, in increments of incr 19 | 20 | exploreExpVars <- 21 | function(xtrn,ytrn,xtst,ytst,k,eVar,maxEVal,loss,incr=0.05) 22 | { 23 | dfr <- data.frame(NULL,NULL) 24 | for (w in seq(incr,maxEVal,incr)) { 25 | preds <- kNN(xtrn,ytrn,xtst,k,expandVars=eVar,expandVals=w) 26 | dfr <- rbind(dfr,c(w,get(loss)(preds$regests,ytst))) 27 | 28 | 29 | 30 | } 31 | names(dfr) <- c('w',loss) 32 | frmla <- as.formula(paste0(loss, ' ~ w')) 33 | lwout <- loess(frmla,data=dfr) 34 | lwout$fitted 35 | } 36 | 37 | 38 | # plot accuracy of applying one or more instances of the ExpandVars arg 39 | 40 | 41 | -------------------------------------------------------------------------------- /inst/sdss2020/FineTune.R: -------------------------------------------------------------------------------- 1 | 2 | # apply regtools:::fineTuning() to choosing the expansion factors in 3 | # kNN() 4 | 5 | # arguments: 6 | 7 | # data: data frame (or matrix with col names), including both "X" and "Y" 8 | # yName: name of the data column containing "Y" 9 | # expandVars: indices of the column numbers 10 | 11 | knnFineTune <- function(data,yName,k,expandVars,ws,classif=FALSE, 12 | seed=9999) 13 | { 14 | if (classif) stop('not ready for classification problems') 15 | 16 | ycol <- which(names(data) == yName) 17 | # may need to shift some of expandVars over, once "Y" is removed 18 | if (ycol < ncol(data)) { 19 | topvars <- which(expandVars > ycol) 20 | if (length(topvars) > 0) { 21 | expandVars[topvars] <- expandVars[topvars] - 1 22 | } 23 | } 24 | expandNms <- colnames(data[,-ycol])[expandVars] 25 | 26 | theCall <- function(dtrn,dtst,cmbi) { 27 | x <- dtrn[,-ycol] 28 | y <- dtrn[,ycol] 29 | newx <- dtst[,-ycol] 30 | newy <- dtst[,ycol] 31 | cmbi <- as.numeric(cmbi) 32 | knnout <- kNN(x,y,newx,k,expandVars=expandVars,expandVals=cmbi) 33 | mean(abs(knnout$regests - newy)) 34 | } 35 | 36 | # wcols <- paste0('w',1:length(expandVars),'=ws',collapse=',') 37 | # wcols <- paste0('list(',wcols,')') 38 | wcols <- paste0(expandNms,'=ws',collapse=',') 39 | wcols <- paste0('list(',wcols,')') 40 | fineTuning(dataset=data,pars=eval(parse(text=wcols)),regCall=theCall, 41 | nXval=10) 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /R/Ridge.R:
-------------------------------------------------------------------------------- 1 | 2 | # ridge regression, similar to MASS::lm.ridge() 3 | 4 | # X will be scaled and centered, using scale(); est. beta vector obtained by 5 | # solving 6 | 7 | # [(X'X)/n + lambda I] b = X'Y/n 8 | 9 | # to make choice of lambda easier, since (X'X)/n will have 1s on diag 10 | 11 | # arguments: 12 | 13 | # xy: data matrix, "Y" in last column 14 | 15 | # lambda: set of lambda values to try 16 | 17 | # value: object of class 'rlm', with components 18 | 19 | # bhats: matrix of est reg coefs, one col for each lambda value; if 20 | # mapback is TRUE, these coefs will be mapped back to the 21 | # original predictors' scale 22 | # lambda: copy of the input lambda 23 | 24 | ridgelm <- function(xy,lambda=seq(0.01,1.00,0.01),mapback=TRUE) { 25 | p <- ncol(xy) - 1; n <- nrow(xy) 26 | x <- xy[,1:p] 27 | y <- xy[,p+1] 28 | x <- scale(x); y <- y - mean(y) 29 | tx <- t(x) 30 | xpx <- tx %*% x / n 31 | xpy <- tx %*% y / n 32 | mapftn <- function(lambval) 33 | qr.solve(xpx + lambval*diag(p),xpy) 34 | tmp <- Map(mapftn,lambda) 35 | tmp <- Reduce(cbind,tmp) 36 | if (mapback) { 37 | sds <- attr(x,'scaled:scale') 38 | for (i in 1:p) tmp[i,] <- tmp[i,] / sds[i] 39 | } 40 | result <- list(bhats=tmp,lambda=lambda) 41 | class(result) <- 'rlm' 42 | result 43 | } 44 | 45 | plot.rlm <- function(x,y,...) { 46 | lamb <- x$lambda 47 | bhs <- t(x$bhats) 48 | matplot(lamb,bhs,type='l',pch='.',xlab='lambda',ylab='beta-hat') 49 | } 50 | 51 | # print.rlm <- function(x,...) print(t(x$bhats)) 52 | 53 | -------------------------------------------------------------------------------- /man/nlshc.Rd: -------------------------------------------------------------------------------- 1 | \name{nlshc} 2 | \alias{nlshc} 3 | 4 | \title{Heteroscedastic Nonlinear Regression} 5 | 6 | \description{ 7 | Extension of \code{nls} to the heteroscedastic case. 8 | } 9 | 10 | \usage{ 11 | nlshc(nlsout,type='HC') 12 | } 13 | 14 | \arguments{ 15 | \item{nlsout}{Object of type 'nls'.} 16 | \item{type}{Eickert-White algorithm to use. See documentation for 17 | \code{vcovHC} in the \pkg{sandwich} package.} 18 | } 19 | 20 | \details{ 21 | Calls \code{nls} but then forms a different estimated covariance 22 | matrix for the estimated regression coefficients, applying the 23 | Eickert-White technique to handle heteroscedasticity. This then 24 | gives valid statistical inference in that setting. 25 | 26 | Some users may prefer to use \code{nlsLM} of the package 27 | \pkg{minpack.lm} instead of \code{nls}. This is fine, as both 28 | functions return objects of class 'nls'. 29 | } 30 | 31 | \value{ 32 | Estimated covariance matrix 33 | } 34 | 35 | \examples{ 36 | # simulate data from a setting in which mean Y is 37 | # 1 / (b1 * X1 + b2 * X2) 38 | n <- 250 39 | b <- 1:2 40 | x <- matrix(rexp(2*n),ncol=2) 41 | meany <- 1 / (x \%*\% b) # reg ftn 42 | y <- meany + (runif(n) - 0.5) * meany # heterosced epsilon 43 | xy <- cbind(x,y) 44 | xy <- data.frame(xy) 45 | # see nls() docs 46 | nlout <- nls(X3 ~ 1 / (b1*X1+b2*X2), 47 | data=xy,start=list(b1 = 1,b2=1)) 48 | nlshc(nlout) 49 | } 50 | 51 | \references{ Zeileis A (2006), Object-Oriented Computation of Sandwich 52 | Estimators. \emph{Journal of Statistical Software}, \bold{16}(9), 53 | 1--16, \url{https://www.jstatsoft.org/v16/i09/}.
} 54 | 55 | \author{ 56 | Norm Matloff 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/Penrose.Rd: -------------------------------------------------------------------------------- 1 | \name{Penrose Linear} 2 | \alias{penroseLM} 3 | \alias{ridgePoly} 4 | \alias{penrosePoly} 5 | \alias{predict.penroseLM} 6 | \alias{predict.penrosePoly} 7 | 8 | \title{Penrose-Inverse Linear Models and Polynomial Regression} 9 | 10 | \description{ 11 | 12 | Provides minimum-norm solutions to linear models, identical to OLS in 13 | standard situations, but allowing exploration of overfitting in the 14 | overparameterized case. Also provides a wrapper for the polynomial 15 | case. 16 | } 17 | 18 | \usage{ 19 | penroseLM(d,yName) 20 | penrosePoly(d,yName,deg,maxInteractDeg=deg) 21 | ridgePoly(d,yName,deg,maxInteractDeg=deg) 22 | \method{predict}{penroseLM}(object,...) 23 | \method{predict}{penrosePoly}(object,...) 24 | 25 | } 26 | 27 | \arguments{ 28 | \item{...}{Arguments for the \code{predict} functions.} 29 | \item{d}{Dataframe, training set.} 30 | \item{yName}{Name of the class labels column.} 31 | \item{deg}{Polynomial degree.} 32 | \item{maxInteractDeg}{Maximum degree of interaction terms.} 33 | \item{object}{A value returned by \code{penroseLM} or 34 | \code{penrosePoly}.} 35 | } 36 | 37 | \details{ 38 | 39 | First, provides a convenient wrapper to the \pkg{polyreg} package for 40 | polynomial regression. (See \code{qePoly} here for an even higher-level 41 | wrapper.) Note that this computes true polynomials, with 42 | cross-product/interaction terms rather than just powers, and that dummy 43 | variables are handled properly (to NOT compute powers). 44 | 45 | Second, provides a tool for exploring the "double descent" phenomenon, 46 | in which prediction error may improve upon fitting past the 47 | interpolation point. 48 | 49 | } 50 | 51 | \author{ 52 | Norm Matloff 53 | } 54 | 55 | -------------------------------------------------------------------------------- /man/ridgelm.Rd: -------------------------------------------------------------------------------- 1 | \name{ridgelm,plot.rlm} 2 | \alias{ridgelm} 3 | \alias{plot.rlm} 4 | 5 | \title{Ridge Regression} 6 | 7 | \description{Similar to \code{lm.ridge} in the \code{MASS} package included 8 | with R, but with a different kind of scaling and a little nicer 9 | plotting. 10 | } 11 | 12 | \usage{ 13 | ridgelm(xy,lambda = seq(0.01,1,0.01),mapback=TRUE) 14 | \method{plot}{rlm}(x,y,...) 15 | } 16 | 17 | \arguments{ 18 | \item{xy}{Data, response variable in the last column.} 19 | \item{lambda}{Vector of desired values for the ridge parameter.} 20 | \item{mapback}{If TRUE, the scaling that had been applied to the 21 | original data will be mapped back to the original scale, so that the 22 | estimated regression coefficients are now on the scale of the original 23 | data.} 24 | \item{x}{Object of type 'rlm', output of \code{ridgelm}.} 25 | \item{y}{Needed for consistency with the generic. Not used.} 26 | \item{...}{Needed for consistency with the generic. Not used.} 27 | } 28 | 29 | \details{ 30 | 31 | Centers and scales the predictors X, and centers the response 32 | variable Y. Computes X'X and then solves [(X'X)/n + lambda I]b = 33 | X'Y/n for b. The 1/n factors are important, making the diagonal 34 | elements of (X'X)/n all 1s and thus facilitating choices for the 35 | lambdas in a manner independent of the data.
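For instance, a minimal sketch, using the \code{mtcars} data that ships with R (not part of this package):

\preformatted{
xy <- as.matrix(mtcars[,c(3,6,1)])  # disp, wt, then mpg as the response
rout <- ridgelm(xy)                 # default lambda grid 0.01, 0.02, ..., 1.00
plot(rout)                          # ridge traces, via plot.rlm as noted below
}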
36 | 37 | Calling \code{plot} on the output of \code{ridgelm} dispatches to 38 | \code{plot.rlm}, thus displaying the ridge traces. 39 | 40 | } 41 | 42 | \value{ 43 | 44 | The function \code{ridgelm} returns an object of class 'rlm', with 45 | components \code{bhats}, the estimated beta vectors, one column per 46 | lambda value, and \code{lambda}, a copy of the input. 47 | 48 | } 49 | 50 | \author{ 51 | Norm Matloff 52 | } 53 | 54 | -------------------------------------------------------------------------------- /man/Graphics.Rd: -------------------------------------------------------------------------------- 1 | \name{xyzPlot} 2 | \alias{xyzPlot} 3 | 4 | \title{Misc. Graphics} 5 | 6 | \description{ 7 | Graphics utilities. 8 | } 9 | 10 | \usage{ 11 | xyzPlot(xyz,clrs=NULL,cexText=1.0,xlim=NULL,ylim=NULL, 12 | xlab=NULL,ylab=NULL,legendPos=NULL,plotType='l') 13 | } 14 | 15 | \arguments{ 16 | \item{xyz}{A matrix or data frame of at least 3 columns, the first 17 | 3 serving as 'x', 'y' and 'z' coordinates of points to be plotted. 18 | Grouping, if any, is specified in column 4, in which case \code{xyz} 19 | must be a data frame.} 20 | \item{clrs}{Colors to be used in the grouped case.} 21 | \item{cexText}{Text size, proportional to standard.} 22 | \item{xlim}{As in \code{plot}.} 23 | \item{ylim}{As in \code{plot}.} 24 | \item{xlab}{As in \code{plot}.} 25 | \item{ylab}{As in \code{plot}.} 26 | \item{legendPos}{As in \code{legend}.} 27 | \item{plotType}{Coded 'l' for lines, 'p' for points.} 28 | } 29 | 30 | \details{ 31 | A way to display 3-dimensional data in 2 dimensions. For each plotted 32 | point (x,y), a z value is written in text over the point. A grouping 33 | variable is also allowed, with different colors used to plot different 34 | groups. 35 | 36 | A group (including the entire data in the case of one group) can be 37 | displayed either as a polygonal line, or just as a point cloud. The 38 | user should experiment with different argument settings to get the most 39 | visually impactful plot. 40 | } 41 | 42 | \examples{ 43 | 44 | \dontrun{ 45 | 46 | xyzPlot(mtcars[,c(3,6,1)],plotType='l',cexText=0.75) 47 | xyzPlot(mtcars[,c(3,6,1)],plotType='p',cexText=0.75) 48 | xyzPlot(mtcars[,c(3,6,1)],plotType='l',cexText=0.75) 49 | xyzPlot(mtcars[,c(3,6,1,2)],clrs=c('red','darkgreen','blue'),plotType='l',cexText=0.75) 50 | 51 | } 52 | 53 | } 54 | 55 | \author{ 56 | Norm Matloff 57 | } 58 | 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: regtools 2 | Version: 1.7.3 3 | Title: Regression and Classification Tools 4 | Authors@R: c(person("Norm", "Matloff", email = "matloff@cs.ucdavis.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9179-6785")), 5 | person("Robin", "Yancey", email = "reyancey@ucdavis.edu", role = c("aut")), 6 | person("Bochao", "Xin", email = "xinbochao97@outlook.com",role = c("ctb")), 7 | person("Kenneth", "Lee", email = "honlee@ucdavis.edu", role = c("ctb")), 8 | person("Rongkui", "Han", email = "rkbhan@ucdavis.edu", role = c("ctb"))) 9 | Maintainer: Norm Matloff <matloff@cs.ucdavis.edu> 10 | Depends: R (>= 3.5.0),FNN,gtools 11 | Imports: R.utils,mvtnorm,sandwich,MASS,car,data.table,glmnet,rje,text2vec, 12 | polyreg 13 | Suggests: knitr, rmarkdown, OpenImageR, cdparcoord, keras, magick, partools 14 | VignetteBuilder: knitr 15 | License: GPL (>= 2) 16 | Description: Tools for linear, nonlinear and nonparametric regression 17 | and classification.
Novel graphical methods for assessment 18 | of parametric models using nonparametric methods. One 19 | vs. All and All vs. All multiclass classification, optional 20 | class probabilities adjustment. Nonparametric regression 21 | (k-NN) for general dimension, local-linear option. Nonlinear 22 | regression with Eickert-White method for dealing with 23 | heteroscedasticity. Utilities for converting time series 24 | to rectangular form. Utilities for conversion between 25 | factors and indicator variables. Some code related to 26 | "Statistical Regression and Classification: from Linear 27 | Models to Machine Learning", N. Matloff, 2017, CRC, 28 | ISBN 9781498710916. 29 | URL: https://github.com/matloff/regtools 30 | BugReports: https://github.com/matloff/regtools/issues 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | 2 | import(FNN,mvtnorm) 3 | 4 | import("graphics") 5 | import("stats") 6 | import("utils") 7 | import("grDevices") 8 | import('MASS') 9 | import('glmnet') 10 | import('rje') 11 | import('text2vec') 12 | importFrom("data.table",'setDT','setkey') 13 | importFrom('car','vif') 14 | importFrom("sandwich","vcovHC") 15 | importFrom('gtools','defmacro') 16 | 17 | S3method(coef,lmac) 18 | S3method(vcov,lmac) 19 | S3method(plot,rlm) 20 | S3method(predict,knn) 21 | S3method(predict,kNN) 22 | S3method(predict,kNNallK) 23 | S3method(plot,kmin) 24 | S3method(predict,ovaknn) 25 | S3method(plot,tuner) 26 | S3method(predict,krsFit) 27 | S3method(predict,penroseLM) 28 | S3method(predict,penrosePoly) 29 | S3method(predict,ridgePoly) 30 | 31 | export(knnest,preprocessx,meany,mediany,vary,loclin, loclogit, predict.knn,nlshc, 32 | kmin,lmac,coef.lmac,vcov.lmac,pcac,loglinac,tbltofakedf, 33 | makeNA,ZerosToNAs,NAsTo0s, 34 | ovalogtrn,avalogtrn,avalogpred, 35 | doPCA,ridgePoly,replicMeans,PCAwithFactors, 36 | knntrn,ovaknntrn,predict.ovaknn,classadjust, 37 | knnFineTune,kNN,findOverallLoss,MAPE,probIncorrectClass,kNNxv,propMisclass, 38 | plotExpVars,exploreExpVars,multCols,kNNallK, 39 | bestKperPoint, parvsnonparplot,nonparvsxplot,nonparvarplot, 40 | ridgelm,plot.rlm, boundaryplot, 41 | l2,l1, 42 | mm,unscale,mmscale,constCols,catDFRow, 43 | confusion,factorTo012etc,allNumeric, 44 | penroseLM,predict.penroseLM,penrosePoly,predict.penrosePoly,ridgePoly, 45 | ulist,getNamedArgs,discretize, 46 | toAllNumeric,stopBrowser, 47 | factorsToDummies,factorToDummies,dummiesToFactor,hasFactors,dummiesToInt, 48 | xyDataframeToMatrix,dummiesToInt,charsToFactors,hasCharacters, 49 | stdErrPred,getDFclasses, 50 | toSuperFactor,toSubFactor, 51 | prToFile,pythonBlankSplit,xyzPlot, 52 | fineTuning,fineTuningPar,partTrnTst,fineTuningMult, 53 | krsFit,krsFitImg,diagNeural, 54 | TStoX,TStoXmv,textToXY,textToXYpred) 55 | -------------------------------------------------------------------------------- /man/unscale.Rd: -------------------------------------------------------------------------------- 1 | \name{unscale} 2 | \alias{unscale} 3 | \alias{mmscale} 4 | \alias{catDFRow} 5 | \alias{constCols} 6 | \alias{allNumeric} 7 | 8 | \title{Miscellaneous Utilities} 9 | 10 | \description{ 11 | Utilities. 
12 | } 13 | 14 | \usage{ 15 | unscale(scaledx,ctrs=NULL,sds=NULL) 16 | mmscale(m,scalePars=NULL,p=NULL) 17 | catDFRow(dfRow) 18 | constCols(d) 19 | allNumeric(lst) 20 | } 21 | 22 | \arguments{ 23 | \item{scaledx}{A matrix.} 24 | \item{m}{A matrix.} 25 | \item{ctrs}{Take the original means to be \code{ctrs}} 26 | \item{lst}{An R list.} 27 | \item{sds}{Take the original standard deviations to be \code{sds}} 28 | \item{dfRow}{A row in a data frame.} 29 | \item{d}{A data frame or matrix.} 30 | \item{scalePars}{If not NULL, a 2-row matrix, with column \code{i} storing 31 | the min and max values to be used in scaling column \code{i} of \code{m}. 32 | Typically, one has previously called \code{mmscale} on a dataset and 33 | saved the resulting scale parameters, and we wish to use those 34 | same scale parameters on new data.} 35 | \item{p}{If \code{m} is a vector, this specifies the 36 | number of columns it should have as a matrix. The code will try to take 37 | care of this by itself if \code{p} is left at NULL.} 38 | } 39 | 40 | \value{ 41 | The function \code{unscale} returns the original object to which 42 | \code{scale} had been applied. Or, the attributes \code{ctrs} and 43 | \code{sds} can be specified by the user. 44 | } 45 | 46 | \details{ 47 | 48 | The function \code{mmscale} is meant as a better-behaved alternative to 49 | \code{scale}. Using minimum and maximum values, it maps variables to 50 | [0,1], thus avoiding the problems arising from very small standard 51 | deviations in \code{scale}. 52 | 53 | The function \code{catDFRow} nicely prints a row of a data frame. 54 | 55 | The function \code{constCols} determines which columns of a data frame 56 | or matrix are constant, if any. 57 | } 58 | 59 | \author{ 60 | Norm Matloff 61 | } 62 | 63 | -------------------------------------------------------------------------------- /R/DimRed.R: -------------------------------------------------------------------------------- 1 | 2 | # under construction 3 | 4 | ## # uniform wrapper for various dimension reduction methods, including 5 | ## # predict() functions 6 | ## 7 | ## # no centering/scaling is done; user may do separately 8 | ## 9 | ## # example 10 | ## 11 | ## # tg <- ToothGrowth 12 | ## # tg$supp <- as.numeric(tg$supp) 13 | ## # tg <- as.matrix(tg) 14 | ## # tgsvd <- dimRed(tg,method='svd',2) # 2 PCs out of a possible 3 15 | ## # newx <- c(8.8,1,0.5) 16 | ## # dimRedNewX(tgsvd,newx) # -8.860902 0.4095568, new coordinates 17 | ## # tg1 <- reduceComps(tgsvd,1) # go down to just 1 PC 18 | ## # dimRedNewX(tg1,newx) # -8.860902 19 | ## 20 | ## dimRed <- function(dat,method='prcomp',nComps) 21 | ## { 22 | ## compSizes <- NULL # eigenvalues etc. 
23 | ## if (method == 'prcomp') { 24 | ## tmp <- prcomp(dat,center=FALSE,scale.=FALSE) 25 | ## tmp$method <- 'prcomp' 26 | ## tmp$rotation <- tmp$rotation[,1:nComps] 27 | ## } else if (method == 'svd') { 28 | ## tmp <- svd(dat,nu=nComps,nv=nComps) 29 | ## tmp$method <- 'svd' 30 | ## tmp$rotation <- tmp$v # equiv to PCA $rotation 31 | ## } else if (method == 'irlba') { 32 | ## require(irlba) 33 | ## tmp <- irlba(dat,nComps) 34 | ## tmp$method <- 'irlba' 35 | ## tmp$rotation <- tmp$v 36 | ## } else stop('no such method') 37 | ## tmp$compSizes <- compSizes 38 | ## class(tmp) <- c('dimRed',class(tmp)) 39 | ## tmp 40 | ## } 41 | ## 42 | ## # apply the same transformation to new X data 43 | ## dimRedNewX <- function(object,newxs) 44 | ## { 45 | ## method <- object$method 46 | ## if (method == 'prcomp' || method == 'svd' || method == 'irlba') { 47 | ## if (!is.matrix(newxs)) { 48 | ## newxs <- as.matrix(newxs) 49 | ## if (ncol(newxs) == 1) newxs <- t(newxs) 50 | ## } 51 | ## newxs %*% object$rotation 52 | ## } 53 | ## } 54 | ## 55 | ## # ask for further reduction in the number of components 56 | ## reduceComps <- function(object,nNewComps) 57 | ## { 58 | ## method <- object$method 59 | ## if (method == 'prcomp' || method == 'svd' || method == 'irlba') { 60 | ## object$rotation <- object$rotation[,1:nNewComps] 61 | ## } 62 | ## object 63 | ## } 64 | ## 65 | -------------------------------------------------------------------------------- /man/mm.Rd: -------------------------------------------------------------------------------- 1 | \name{mm} 2 | \alias{mm} 3 | 4 | \title{Method of Moments, Including Possible Regression Terms} 5 | 6 | \description{ 7 | 8 | Method of Moments computation for almost any statistical problem that 9 | has derivatives with respect to theta. Capable of handling models that 10 | include parametric regression terms, but not need be a regression 11 | problem. (This is not \emph{Generalized} Method of Moments; see the 12 | package \pkg{gmm} for the latter.) 13 | 14 | } 15 | 16 | \usage{ 17 | mm(m,g,x,init=rep(0.5,length(m)),eps=0.0001,maxiters=1000) 18 | } 19 | 20 | \arguments{ 21 | \item{m}{Vector of sample moments, "left-hand sides" of moment 22 | equations.} 23 | \item{g}{Function of parameter estimates, forming the "right-hand 24 | sides." This is a multivariate-valued function, of dimensionality 25 | equal to that of \code{m}}. 26 | \item{init}{Vector of initial guesses for parameter estimates. If 27 | components are named, these will be used as labels in the output.} 28 | \item{eps}{Convergence criterion.} 29 | \item{maxiters}{Maximum number of iterations.} 30 | \item{x}{Input data.} 31 | } 32 | 33 | \details{ 34 | 35 | Standard Newton-Raphson methods are used to solve for the parameter 36 | estimates, with \code{numericDeriv} being used to find the 37 | approximate derivatives. 38 | } 39 | 40 | \value{ 41 | 42 | R list consisting of components \code{tht}, the vector of parameter 43 | estimates, and \code{numiters}, the number of iterations performed. 44 | 45 | } 46 | 47 | \examples{ 48 | x <- rgamma(1000,2) 49 | m <- c(mean(x),var(x)) 50 | g <- function(x,theta) { # from theoretical properties of gamma distr. 
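# for a gamma distribution with shape theta[1] and rate theta[2],
# the mean is shape/rate and the variance is shape/rate^2: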
51 | g1 <- theta[1] / theta[2] 52 | g2 <- theta[1] / theta[2]^2 53 | c(g1,g2) 54 | } 55 | # should output about 2 and 1 56 | mm(m,g,x) 57 | 58 | \dontrun{ 59 | library(mfp) 60 | data(bodyfat) 61 | # model as a beta distribution 62 | g <- function(x,theta) { 63 | t1 <- theta[1] 64 | t2 <- theta[2] 65 | t12 <- t1 + t2 66 | meanb <- t1 / t12 67 | m1 <- meanb 68 | m2 <- t1*t2 / (t12^2 * (t12+1)) 69 | c(m1,m2) 70 | } 71 | x <- bodyfat$brozek/100 72 | m <- c(mean(x),var(x)) 73 | # about 4.65 and 19.89 74 | mm(m,g,x) 75 | } 76 | 77 | } 78 | 79 | \author{ 80 | Norm Matloff 81 | } 82 | 83 | -------------------------------------------------------------------------------- /R/Text.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ############################ textToXY() ############################## 5 | 6 | # preparation for text classification; inputs text, label data; outputs 7 | # X matrix, Y vector 8 | 9 | # arguments: 10 | 11 | # docs: character vector, one element per document 12 | # labels: R factor, class labels corresponding to docs 13 | # stopWords: character vector of stop words; suggest 14 | # stopWords <- tm::stopwords('english') 15 | # kTop: number of most-frequent words to retain 16 | 17 | textToXY <- function(docs,labels,kTop=50,stopWords='a') 18 | { 19 | # prep 20 | if (is.null(labels)) labels <- rep(NA,length(docs)) 21 | id <- 1:length(docs) 22 | x <- data.frame(docs,labels,id=id) 23 | if (!is.character(x$docs)) x$docs <- as.character(x$docs) 24 | data.table::setDT(x) # make data frame a by-reference data.table 25 | data.table::setkey(x,id) # sort the table by id 26 | 27 | # compute vocab, doc term mat 28 | prep_fun <- tolower # change letters to lower-case 29 | tok_fun <- text2vec::word_tokenizer # break text into words 30 | itx <- text2vec::itoken(x$docs, 31 | preprocessor = prep_fun, 32 | tokenizer = tok_fun, 33 | ids = x$id, 34 | progressbar = FALSE) 35 | vocab <- text2vec::create_vocabulary(itx) 36 | vectorizer <- text2vec::vocab_vectorizer(vocab) 37 | dtm <- text2vec::create_dtm(itx, vectorizer) # document-term matrix, one row per doc 38 | 39 | # remove stop words 40 | vocab <- text2vec::create_vocabulary(itx, stopwords = stopWords) 41 | prunedVocab <- text2vec::prune_vocabulary(vocab) 42 | vectorizer <- text2vec::vocab_vectorizer(prunedVocab) 43 | dtm <- text2vec::create_dtm(itx, vectorizer) # new doc-term matrix 44 | 45 | nw <- ncol(dtm) 46 | if (kTop > 0) dtm <- dtm[,(nw-kTop+1):nw] 47 | dtm <- as.matrix(dtm) 48 | list(x=dtm,y=labels,stopWords=stopWords) 49 | } 50 | 51 | textToXYpred <- function(ttXYout,predDocs) 52 | { 53 | 54 | predX <- textToXY(predDocs,NULL,kTop=0,stopWords=ttXYout$stopWords)$x 55 | namesTrain <- colnames(ttXYout$x) 56 | namesTest <- colnames(predX) 57 | x <- matrix(0,nrow=length(predDocs),ncol=length(namesTrain)) 58 | colnames(x) <- namesTrain 59 | for (word in namesTrain) 60 | if (word %in% namesTest) x[,word] <- predX[,word] 61 | x 62 | } 63 | 64 | -------------------------------------------------------------------------------- /man/prgeng.Rd: -------------------------------------------------------------------------------- 1 | \name{prgeng} 2 | \alias{prgeng} 3 | \alias{peDumms} 4 | \alias{peFactors} 5 | \alias{pef} 6 | 7 | \title{ 8 | Silicon Valley programmers and engineers data 9 | } 10 | \usage{ 11 | data(prgeng) 12 | data(peDumms) 13 | data(peFactors) 14 | } 15 | 16 | \description{ 17 | This data set is adapted from the 2000 Census (5\% sample, person 18 | records). 
It is mainly restricted to programmers and engineers in the 19 | Silicon Valley area. (Apparently due to errors, there are some from 20 | other ZIP codes.) 21 | 22 | There are three versions: 23 | 24 | \itemize{ 25 | 26 | \item{\code{prgeng}, the original data, with categorical variables, 27 | e.g. Occupation, in their original codes} 28 | 29 | \item{\code{peDumms}, same but with categorical variables 30 | converted to dummies; due to the large number of levels the birth 31 | and PUMA data is not included} 32 | 33 | \item{\code{peFactors}, same but with categorical variables 34 | converted to factors} 35 | 36 | \item{\code{pef}, same as \code{peFactors}, but having only columns 37 | for age, education, occupation, gender, wage income and weeks 38 | worked. The education column has been collapsed to Master's degree, 39 | PhD and other.} 40 | 41 | } 42 | 43 | The variable codes, e.g. occupational codes, are available from 44 | \url{https://usa.ipums.org/usa/volii/occ2000.shtml}. 45 | (Short code lists are given in the record layout, but longer ones are in 46 | the appendix Code Lists.) 47 | 48 | The variables are: 49 | 50 | \itemize{ 51 | 52 | \item{\code{age}, with a U(0,1) variate added for jitter} 53 | 54 | \item{\code{cit}, citizenship; 1-4 code various categories of 55 | citizens; 5 means noncitizen (including permanent residents)} 56 | 57 | \item{\code{educ}: 01-09 code no college; 10-12 means some college; 58 | 13 is a bachelor's degree, 14 a master's, 15 a professional degree and 59 | 16 is a doctorate} 60 | 61 | \item{\code{occ}, occupation} 62 | 63 | \item{\code{birth}, place of birth} 64 | 65 | \item{\code{wageinc}, wage income} 66 | 67 | \item{\code{wkswrkd}, number of weeks worked} 68 | 69 | \item{\code{yrentry}, year of entry to the U.S. 
(0 for natives)} 70 | 71 | \item{\code{powpuma}, location of work} 72 | 73 | \item{\code{gender}, 1 for male, 2 for female} 74 | 75 | } 76 | 77 | } 78 | 79 | 80 | -------------------------------------------------------------------------------- /R/MM.R: -------------------------------------------------------------------------------- 1 | 2 | # Method of Moments, including regression terms 3 | 4 | # overview: 5 | 6 | # motivated by linear mixed effects models, but more general 7 | # 8 | # say the parameter vector theta has length k, so we need k 9 | # equations; their left-hand sides are specified by the argument g, 10 | # while the right-hand sides are given by mm; the user integrates 11 | # regression terms into these two function arguments 12 | 13 | # arguments: 14 | # 15 | # m: a vector of sample moments ("left-hand sides" of MM eqns); 16 | # x is the data, one observation per row; might be more 17 | # general than moments 18 | # g(x,theta): 19 | # a vector-valued function, specifying the "right-hand sides" 20 | # of the MM eqns; x as above, and theta is the vector of 21 | # parameters to be estimated; it is required that the second 22 | # argument of g() be named 'theta' 23 | # x: our x in m and g() 24 | # init: initial guess for theta; R list with names corresponding 25 | # to the parameters in g 26 | # eps: convergence criterion; iterations stop at 1000, or whe 27 | # sum(abs(g)) < eps 28 | # maxiters: max number of iterations 29 | 30 | mm <- function(m,g,x,init=rep(0.5,length(m)),eps=0.0001,maxiters=1000) { 31 | tht <- init 32 | # mvec <- m(data) 33 | mvec <- m 34 | for (i in 1:maxiters) { 35 | # g values for current iteration 36 | # gvec <- getgvec(g,tht) 37 | # browser() 38 | gvec <- g(x,tht) 39 | if (max(abs(mvec - gvec)) < eps) { 40 | if (!is.null(names(init))) 41 | names(tht) <- names(init) 42 | result <- list(tht=tht,numiters=i) 43 | return(tht) 44 | } 45 | # not done, so get new Jacobian and update tht 46 | jcb <- getjcb(g,x,tht) 47 | tht <- tht + solve(jcb,mvec-gvec) 48 | } 49 | print('max iterations exceeded') 50 | } 51 | 52 | # getgvec <- function(g,tht) { 53 | # theta <- tht 54 | # g(theta) 55 | # } 56 | 57 | getjcb <- function(g,x,tht) { 58 | theta <- tht 59 | attr(numericDeriv(quote(g(x,theta)),'theta'),'gradient') 60 | } 61 | 62 | # test case; should output about 2 and 1 63 | # x <- rgamma(1000,2) 64 | # m <- c(mean(x),var(x) 65 | # g <- function(theta) { 66 | # g1 <- theta[1] / theta[2] 67 | # g2 <- theta[1] / theta[2]^2 68 | # c(g1,g2) 69 | # } 70 | # mm(m,g,x) 71 | 72 | -------------------------------------------------------------------------------- /man/textToXY.Rd: -------------------------------------------------------------------------------- 1 | 2 | \name{textToXY,textToXYpred} 3 | \alias{textToXY} 4 | \alias{textToXYpred} 5 | 6 | \title{Tools for Text Classification} 7 | 8 | \description{ 9 | "R-style," classification-oriented wrappers for the \pkg{text2vec} package. 10 | } 11 | 12 | \usage{ 13 | textToXY(docs,labels,kTop=50,stopWords='a') 14 | textToXYpred(ttXYout,predDocs) 15 | } 16 | 17 | \arguments{ 18 | \item{docs}{Character vector, one element per document.} 19 | \item{predDocs}{Character vector, one element per document.} 20 | \item{labels}{Class labels, as numeric, character or factor. NULL is 21 | used at the prediction stage.} 22 | \item{kTop}{The number of most-frequent words to retain; 0 means 23 | retain all.} 24 | \item{stopWords}{Character vector of common words, e.g. prepositions 25 | to delete. 
Recommended is \code{tm::stopwords('english')}.} 26 | \item{ttXYout}{Output object from \code{textToXY}.} 27 | } 28 | 29 | \details{ 30 | 31 | A typical classification/machine learning package will have as arguments 32 | a feature matrix X and a labels vector/factor Y. For a "bag of 33 | words" analysis in the text case, each row of X would be a document 34 | and each column a word. 35 | 36 | The functions here are basically wrappers for generating X. Wrappers 37 | are convenient in that: 38 | 39 | \itemize{ 40 | \item The \pkg{text2vec} package is rather arcane, so a "R-style" 41 | wrapper would be useful. 42 | \item The \pkg{text2vec} are not directly set up to do 43 | classification, so the functions here provide the "glue" to do 44 | that. 45 | } 46 | 47 | The typical usage pattern is thus: 48 | 49 | \itemize{ 50 | \item Run the documents vector and labels vector/factor through 51 | \code{textToXY}, generating X and Y. 52 | \item Apply your favorite classification/machine learning package 53 | p to X and Y, returning o. 54 | \item When predicting a new document d, run o and d through 55 | \code{textToXY}, producing x. 56 | \item Run x on p's \code{predict} function. 57 | } 58 | } 59 | 60 | \value{ 61 | 62 | The function \code{textToXY} returns an R list with components 63 | \code{x} and \code{y} for X and Y, and a copy of the input 64 | \code{stopWords}. 65 | 66 | The function \code{textToXY} returns X. 67 | 68 | } 69 | 70 | \author{ 71 | Norm Matloff 72 | } 73 | 74 | -------------------------------------------------------------------------------- /inst/RecSysLinModels.md: -------------------------------------------------------------------------------- 1 | # Linear Models in Recommender Systems 2 | 3 | **N. Matloff, UC Davis** 4 | 5 | ## Overview 6 | 7 | In the collaborative filtering approach to recommender systems modeling, 8 | a very simple but common model for the rating user i gives to item j is 9 | 10 | Yij = μ + ui + vj + 11 | εij 12 | 13 | where 14 | 15 | - μ is the overall mean rating over all users and items 16 | 17 | - ui is the propensity of user i to rate items liberally or 18 | harshly 19 | 20 | - vj is the propensity of item j to be rated liberally or 21 | harshly 22 | 23 | - εij is an error term, incorporating all other 24 | factors 25 | 26 | - taken as random variables as i and j vary through all users and 27 | items, ui, vj, and εij 28 | are independent with mean 0 29 | 30 | The form of the above model suggests using linear model software, e.g. 31 | 32 | ``` r 33 | library(dslabs) 34 | data(movielens) 35 | ml <- movielens 36 | ml <- ml[,c(5,1,6)] 37 | ml$userId <- as.factor(ml$userId) 38 | ml$movieId <- as.factor(ml$movieId) 39 | lm(rating ~ .,data=ml) 40 | ``` 41 | 42 | At first glance, this seems like a questionable idea. In this version 43 | of the MovieLens data, there are 671 users and 9066 movies, thus nearly 44 | 10,000 dummy variables generated by **lm()**. With only 100,000 data 45 | points (and which are not independent), we run a real risk of 46 | overfitting. Worse, the code is quite long-running (over 2 hours in the 47 | run I tried on an ordinary PC). 48 | 49 | But it turns out there is a simple, fast, closed-form solution, both for 50 | this model and for some more advanced versions featuring interaction 51 | terms. 52 | 53 | ## Analysis: Noniteractive model 54 | 55 | Estimating μ is easy. From its definition, we take our estimate to 56 | be 57 | 58 | Y.. = 59 | Σi 60 | Σj 61 | Yij / n 62 | 63 | where is the total number of data points. 
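In code, a minimal sketch continuing the **dslabs** snippet above (the variable name is just illustrative):

``` r
muhat <- mean(ml$rating)  # estimate of mu, the overall mean rating
```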
64 | 65 | Write the above model in population form. 66 | 67 | Y = μ + U + I + e 68 | 69 | Now consider user i, taking expectation conditioned on U = i: 70 | 71 | E(Y | U = i) = μ + ui 72 | 73 | The natural estimate of the LHS is 74 | 75 | Y.. = Σi Ni 76 | 77 | where Ni is the number of items rated by user i. 78 | 79 | Our estimate for ui is then 80 | 81 | Yi. - Y.. 82 | 83 | A similar derivation yields our estimate for vj, 84 | 85 | Y.j - Y.. 86 | 87 | (under construction) 88 | -------------------------------------------------------------------------------- /man/TS.Rd: -------------------------------------------------------------------------------- 1 | \name{TStoX} 2 | \alias{TStoX} 3 | \alias{TStoXmv} 4 | 5 | \title{Transform Time Series to Rectangular Form} 6 | 7 | \description{ 8 | Input a time series and transform it to a form suitable for prediction 9 | using \code{lm} etc. 10 | } 11 | 12 | \usage{ 13 | TStoX(x,lg) 14 | TStoXmv(xmat,lg,y) 15 | } 16 | 17 | \arguments{ 18 | \item{x}{A vector.} 19 | \item{lg}{Lag, a positive integer.} 20 | \item{xmat}{A matrix, data frame etc., a multivariate time series. 21 | Each column is a time series, over a common time period.} 22 | \item{y}{A time series, again on that common time period. If NULL in 23 | \code{TStoXmv}, then \code{y} is set to \code{x} (i.e. for a 24 | univariate time series in which older values predict newer ones).} 25 | } 26 | 27 | \details{ 28 | 29 | Similar to \code{stats::embed}, but in lagged form, with applications 30 | such as \code{lm} in mind. 31 | 32 | \code{TStoX} is for transforming vectors, while \code{TStoXmv} 33 | handles the multivariate time series case. Intended for use with 34 | \code{lm} or other regression/machine learning model, predicting 35 | \code{y[i]} from observations \code{i-lg, i-lg+1,...,i-1}. 36 | } 37 | 38 | \value{ 39 | 40 | 41 | As noted, the idea is to set up something like \code{lm(Y ~ X)}. 42 | Let \code{m} denote length of \code{x}, and in the matrix input 43 | case, the number of rows in \code{xmat}. Let \code{p} be 1 in the 44 | vector case, \code{ncol(xmat)} in the matrix case. The return value 45 | is a matrix with \code{m-lg} rows. There will be \code{p*lg+1} 46 | columns, with "Y," the numbers to be predicted in the last column. 47 | 48 | In the output in the multivariate case, let k denote 49 | \code{ncol(xmat)}. 
Then the first k columns of the output will be 50 | the k series at lag \code{lg}, the second k columns will be the k 51 | series at lag \code{lg-1}, ..., and the \code{lg}-th set of k 52 | columns will be the k series at lag 1, 53 | 54 | } 55 | 56 | \examples{ 57 | 58 | x1 <- c(5,12,13,8,88,6) 59 | x2 <- c(5,4,3,18,168,0) 60 | y <- 1:6 61 | xmat <- cbind(x1,x2) 62 | 63 | TStoX(x1,2) 64 | # [,1] [,2] [,3] 65 | # [1,] 5 12 13 66 | # [2,] 12 13 8 67 | # [3,] 13 8 88 68 | # [4,] 8 88 6 69 | 70 | xy <- TStoXmv(xmat,2,y) 71 | xy 72 | # [,1] [,2] [,3] [,4] [,5] 73 | # [1,] 5 5 12 4 3 74 | # [2,] 12 4 13 3 4 75 | # [3,] 13 3 8 18 5 76 | # [4,] 8 18 88 168 6 77 | 78 | lm(xy[,5] ~ xy[,-5]) 79 | # Coefficients: 80 | # (Intercept) xy[, -5]1 xy[, -5]2 xy[, -5]3 xy[, -5]4 81 | # -65.6 3.2 18.2 -3.2 NA 82 | # need n > 7 here for useful lm() call, but this illustrates the idea 83 | } 84 | 85 | \author{ 86 | Norm Matloff 87 | } 88 | 89 | -------------------------------------------------------------------------------- /man/regtools-package.Rd: -------------------------------------------------------------------------------- 1 | 2 | \name{regtools-package} 3 | \alias{regtools-package} 4 | \alias{regtools} 5 | \docType{package} 6 | 7 | \title{Overview and Package Reference Guide} 8 | \description{ 9 | 10 | This package provides a broad collection of functions useful for 11 | regression and classification analysis, and machine learning. 12 | 13 | } 14 | 15 | \section{Function List}{ 16 | 17 | \bold{Parametric modeling:} 18 | 19 | \itemize{ 20 | 21 | \item nonlinear regression: nlshc 22 | 23 | \item ridge regression: ridgelm, plot 24 | 25 | \item missing values (also see our \pkg{toweranNA} package): 26 | lmac,makeNA,coef.lmac,vcov.lmac,pcac 27 | 28 | } 29 | 30 | \bold{Diagnostic plots:} 31 | 32 | \itemize{ 33 | 34 | \item regression diagnostics: parvsnonparplot, nonparvsxplot, 35 | nonparvarplot 36 | 37 | \item other: boundaryplot, nonparvsxplot 38 | 39 | } 40 | 41 | \bold{Classification:} 42 | 43 | \itemize{ 44 | 45 | \item unbalanced data: classadjust (see \bold{UnbalancedClasses.md}) 46 | 47 | \item All vs. 
All: avalogtrn, avalogpred 48 | 49 | \item k-NN reweighting: exploreExpVars, plotExpVars, knnFineTune 50 | 51 | } 52 | 53 | \bold{Machine learning (also see qeML package):} 54 | 55 | \itemize{ 56 | 57 | \item k-NN: kNN, kmin, knnest, knntrn, preprocessx, meany, vary, loclin, 58 | predict, kmin, pwplot, bestKperPoint, knnFineTune 59 | 60 | \item neural networks: krsFit,multCol 61 | 62 | \item advanced grid search: fineTuning, fineTuningPar, plot.tuner, 63 | knnFineTune 64 | 65 | \item loss: l1, l2, MAPE, ROC 66 | 67 | } 68 | 69 | 70 | \bold{Dummies and R factors Utilities:} 71 | 72 | \itemize{ 73 | 74 | \item conversion between factors and dummies: dummiesToFactor, 75 | dummiesToInt, factorsToDummies, factorToDummies, factorTo012etc, 76 | dummiesToInt, hasFactors, charsToFactors, makeAllNumeric 77 | 78 | \item dealing with superset and subsets of factors: toSuperFactor, 79 | toSubFactor 80 | 81 | } 82 | 83 | \bold{Statistics:} 84 | 85 | \itemize{ 86 | 87 | \item mm 88 | 89 | } 90 | 91 | \bold{Matrix:} 92 | 93 | \itemize{ 94 | 95 | \item multCols, constCols 96 | 97 | } 98 | 99 | \bold{Time series:} 100 | 101 | \itemize{ 102 | 103 | \item convert rectangular to TS: TStoX 104 | 105 | } 106 | 107 | \bold{Text processing:} 108 | 109 | \itemize{ 110 | 111 | \item textToXY 112 | 113 | } 114 | 115 | \bold{Misc.:} 116 | 117 | \itemize{ 118 | 119 | \item scaling: mmscale, unscale 120 | 121 | \item data frames: catDFRow, tabletofakedf 122 | 123 | \item R: getNamedArgs, ulist 124 | 125 | \item discretize 126 | 127 | } 128 | 129 | 130 | } 131 | 132 | -------------------------------------------------------------------------------- /R/TS.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # routines to convert time series to rectangular data, so that we can 4 | # then fit using lm() or whatever, predicting from the last lg 5 | # observations 6 | 7 | # the first function, TStoX(x,lg,y), inputs a univariate time series and 8 | # outputs an "X" matrix in the sense of lm(Y ~ X); here the "Y" vector 9 | # is either supplied as an argument, or by default is x 10 | 11 | # consider for instance x = (5,12,13,8,88,6) and lg = 2, with y = x; we 12 | # want to redict x from itself, i.e. 13 | 14 | # predict the 13 from 5, 12 15 | # predict the 8 from 12, 13 16 | # predict the 88 from 13, 8 17 | 18 | # and 19 | 20 | # predict the 6 from 8, 88 21 | 22 | # our training set computed by TStoX() would then be 23 | # 24 | # X = 25 | # 26 | # 5 12 27 | # 12 13 28 | # 13 8 29 | # 8 88 30 | # 31 | # Y = (13,8,88,6) 32 | 33 | ########################## TStoX() ##################################### 34 | 35 | # inputs a time series, and transforms to rectangular shape suitable for 36 | # lm() or some other regression model, in which any current observation 37 | # is predicted from the last lg ones 38 | 39 | # arguments: 40 | # 41 | # x: a univariate time series; m is set to length(x) below 42 | # lg: lag, for fitting of a model in which observations at 43 | # time t will be predicted from observations at times 44 | # t-lg, t-lg+1,...,t-1 45 | 46 | # value: 47 | # 48 | # matrix, suitable for fitting a prediction model; m-lg rows, 49 | # lg+1 columns; x[lg+1], x[lg+2], ..., x[m] will be in the last column 50 | 51 | # the "X portion" will be 52 | # 53 | # x[1], x[2], ..., x[lg] 54 | # x[2], x[3], ..., x[lg+1] 55 | # ... 
56 | # x[m-lg], x[m-lg+1], ..., x[m-1] 57 | 58 | TStoX <- function(x,lg) 59 | { 60 | # row k of the output 61 | onerow <- function(k) { 62 | s <- k 63 | e <- k + lg 64 | x[s:e] 65 | } 66 | lx <- length(x) 67 | outrows <- lapply(1:(lx-lg),onerow) 68 | do.call(rbind,outrows) 69 | 70 | } 71 | 72 | # k-variate time series version of TStoX (but y is not optional) 73 | 74 | # arguments: 75 | 76 | # each col of xmat is a time series, y is a vector (separate from x) 77 | 78 | # value: 79 | 80 | # the first k cols will be the k series at lag lg, 81 | # the second k cols will be the k series at lag lg-1, 82 | # ... 83 | # the lg-th k cols will be the k series at lag 1, 84 | 85 | TStoXmv <- function(xmat,lg,y) { 86 | k <- ncol(xmat) 87 | # take one time series, transform to "X" form, delete the "Y" col 88 | processOneTS <- function(xmatCol) TStoX(xmatCol,lg)[,1:lg] 89 | tmp <- lapply(as.data.frame(xmat),processOneTS) 90 | # now piece everything together 91 | rslt <- NULL 92 | for (lag in 1:lg) { 93 | for (tSer in 1:k) { 94 | rslt <- cbind(rslt,tmp[[tSer]][,lag]) 95 | } 96 | } 97 | cbind(rslt,y[-(1:lg)]) 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /man/misc.Rd: -------------------------------------------------------------------------------- 1 | \name{misc} 2 | \alias{replicMeans} 3 | \alias{stdErrPred} 4 | \alias{pythonBlankSplit} 5 | \alias{stopBrowser} 6 | \alias{doPCA} 7 | \alias{PCAwithFactors} 8 | \alias{ulist} 9 | \alias{prToFile} 10 | \alias{partTrnTst} 11 | \alias{findOverallLoss} 12 | \alias{getNamedArgs} 13 | \alias{multCols} 14 | \alias{probIncorrectClass} 15 | \alias{propMisclass} 16 | 17 | \title{Utilities} 18 | 19 | \description{ 20 | Various helper functions. 21 | } 22 | 23 | \usage{ 24 | replicMeans(nrep,toReplic,timing=FALSE) 25 | stdErrPred(regObj,xnew) 26 | pythonBlankSplit(s) 27 | stopBrowser(msg = stop("msg not supplied")) 28 | doPCA(x,pcaProp) 29 | PCAwithFactors(x, nComps = ncol(x)) 30 | ulist(lst) 31 | prToFile(filename) 32 | partTrnTst(fullData,nTest=min(1000,round(0.2*nrow(fullData)))) 33 | findOverallLoss(regests,y,lossFtn = MAPE) 34 | getNamedArgs(argVec) 35 | multCols(x,cols,vals) 36 | probIncorrectClass(yhat, y, startAt1 = TRUE) 37 | propMisclass(y,yhat) 38 | } 39 | 40 | \arguments{ 41 | \item{regests}{Fitted regression estimates, training set.} 42 | \item{y}{Y values, training set.} 43 | \item{yhat}{Predicted Y values} 44 | \item{startAt1}{TRUE if indexing starts at 1, FALSE if starting at 0.} 45 | \item{lossFtn}{Loss functin.} 46 | \item{fullData}{A data frame or matrix.} 47 | \item{nTest}{Number of rows for the test set.} 48 | \item{filename}{Name of output file.} 49 | \item{lst}{An R list.} 50 | \item{x}{Matrix or data frame.} 51 | \item{pcaProp}{Fraction in [0,1], specifying number of PCA components 52 | to compute, in terms of fraction of total variance.} 53 | \item{nComps}{Number of PCA components.} 54 | \item{regObj}{An object of class \code{'lm'} or similar, for which 55 | there is a \code{vcov} generic function.} 56 | \item{xnew}{New X value to be predicted.} 57 | \item{nrep}{Number of replications.} 58 | \item{s}{A character string.} 59 | \item{toReplic}{Function call(s), as a quoted string, separated by 60 | semicolons if more than one call.} 61 | \item{timing}{If TRUE, find average elapsed time over the replicates.} 62 | \item{msg}{Character string, error message for existing debug browser.} 63 | \item{argVec}{R list or vector with named elements.} 64 | \item{cols}{A set of column numbers.} 65 | 
\item{vals}{A set of positive expansion numbers.} 66 | } 67 | 68 | \details{ 69 | 70 | The function \code{PCAwithFactors} is a wrapper for 71 | \code{stats::prcomp}, to be used on data frames that contain at least on 72 | R factor. 73 | 74 | } 75 | 76 | \value{ 77 | 78 | The function \code{PCAwithFactors} returns an object of class 79 | 'PCAwithFactors'. with components \code{pcout}, the object returned by 80 | the wrapped call to \code{prcomp}; \code{factorsInfo}, factor conversion 81 | information to be used with \code{predict}; and \code{preds}, the PCA 82 | version of \code{x}. 83 | 84 | The function \code{getNamedArgs} will assign in the caller's space 85 | variables with the names and values in \code{argVec}. 86 | 87 | } 88 | 89 | \examples{ 90 | 91 | w <- list(a=3,b=8) 92 | getNamedArgs(w) 93 | a 94 | b 95 | u <- c(5,12,13) 96 | names(u) <- c('x','y','z') 97 | getNamedArgs(u) 98 | x 99 | y 100 | z 101 | 102 | } 103 | 104 | \author{ 105 | Norm Matloff 106 | } 107 | 108 | -------------------------------------------------------------------------------- /inst/sdss2020/LocLinStudy.R: -------------------------------------------------------------------------------- 1 | 2 | # arguments 3 | 4 | # x: matrix of numeric features 5 | # y: factor or numeric vector 6 | # newx: matrix of numeric features in new cases 7 | # k: number of nearest neighbors 8 | # mahaThresh: a proportion 9 | 10 | # each data point in newx will use as smoothing function loclin() 11 | # instead of mean() 12 | 13 | mixedPreds <- function(x,y,newx,k,mahaThresh,scaleX) 14 | { 15 | if (is.factor(y)) stop('classification case not yet implemented') 16 | 17 | if (scaleX) { 18 | x <- scale(x,center=TRUE,scale=TRUE) 19 | xcntr <- attr(x, "scaled:center") 20 | xscl <- attr(x, "scaled:scale") 21 | newx <- scale(newx, center = xcntr, scale = xscl) 22 | } 23 | 24 | # first get distribution of M-dist 25 | meanx <- mean(x) 26 | covx <- cov(x) 27 | mhdists <- mahalanobis(x, meanx,covx) 28 | outerThresh <- quantile(mhdists,1-mahaThresh) 29 | 30 | # which rows of newx are on the edge of the data? 
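# rows of newx whose Mahalanobis distance from the training-data center
# exceeds outerThresh (the upper 1-mahaThresh quantile of the training
# distances) are flagged as "outThere"; they will later be smoothed with
# loclin() rather than mean()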
31 | newxMhdists <- mahalanobis(newx,meanx,covx) 32 | outThere <- which(newxMhdists > outerThresh) 33 | # the rest 34 | mainstream <- setdiff(1:nrow(newx),outThere) 35 | 36 | # now predict 37 | predsMainstream <- kNN(x,y,newx[mainstream,],k,scaleX=FALSE) 38 | predsOutThere <- kNN(x,y,newx[outThere,],k,scaleX=FALSE, 39 | smoothingFtn=loclin) 40 | preds <- vector(length=nrow(newx)) 41 | preds[mainstream] <- predsMainstream$regests 42 | preds[outThere] <- predsOutThere$regests 43 | 44 | list(preds=preds,mainstream=mainstream,outThere=outThere, 45 | predsMainstream=predsMainstream,predsOutThere=predsOutThere, 46 | k=k,scaleX=scaleX,outerThresh=outerThresh) 47 | } 48 | 49 | # experiments on the value of expanding predictor weights 50 | 51 | # generate data, then fit both mean() and loclin(), at various levels of 52 | # the Mahalanobis distance threshold; return vector of ratios of MAPE, 53 | # loclin()/mean() 54 | 55 | simLocLin <- function(n,p,k,catOut=FALSE,seed=9999) 56 | { 57 | x <- matrix(rexp(n*p),nrow=n) 58 | y <- rowSums(x)^2 + p*rnorm(n) 59 | newx <- matrix(rexp(n*p),nrow=n) 60 | newy <- rowSums(newx)^2 + p*rnorm(n) 61 | predsZM <- kNN(x,y,newx,50,scaleX=T)$regests 62 | res <- vector(length = 25) 63 | mhs <- seq(0.001,0.25,0.001) 64 | for (i in 1:length(mhs)) { 65 | mhprop <- mhs[i] 66 | zLL <- mixedPreds(x,y,newx,50,mhprop,T) 67 | predszLL <- zLL$preds 68 | mapeLoclin <- mean(abs(predszLL[zLL$outThere] - newy[zLL$outThere])) 69 | mapeMean <- mean(abs(predsZM[zLL$outThere] - newy[zLL$outThere])) 70 | if (catOut) cat(mhprop,mapeLoclin,mapeMean,'\n') 71 | res[i] <- mapeLoclin / mapeMean 72 | } 73 | res 74 | } 75 | 76 | doSim <- function() 77 | { 78 | res.5000.2.25 <<- simLocLin(5000,2,25,catOut=T) 79 | res.5000.2.100 <<- simLocLin(5000,2,100,catOut=T) 80 | res.5000.10.25 <<- simLocLin(5000,10,25,catOut=T) 81 | res.5000.10.100 <<- simLocLin(5000,10,100,catOut=T) 82 | res.5000.20.25 <<- simLocLin(5000,20,25,catOut=T) 83 | res.5000.20.100 <<- simLocLin(5000,20,100,catOut=T) 84 | } 85 | 86 | plottingSim <- function() 87 | { 88 | 89 | plot(seq(0.001,0.25,0.001),res.5000.2.25, type = "l",ylim=c(0,1), 90 | xlab='MH distance threshold',ylab='MAPE.loclin/mean') 91 | lines(seq(0.001,0.25,0.001),res.5000.2.100, type = "l") 92 | lines(seq(0.001,0.25,0.001),res.5000.10.25, type = "l") 93 | lines(seq(0.001,0.25,0.001),res.5000.10.100, type = "l") 94 | lines(seq(0.001,0.25,0.001),res.5000.20.25, type = "l") 95 | lines(seq(0.001,0.25,0.001),res.5000.20.100, type = "l") 96 | abline(h=1) 97 | 98 | # see which line is which 99 | print(res.5000.2.25[250]) 100 | print(res.5000.2.100[250]) 101 | print(res.5000.10.25[250]) 102 | print(res.5000.10.100[250]) 103 | print(res.5000.20.25[250]) 104 | print(res.5000.20.100[250]) 105 | 106 | text(0.190,0.930,labels='p=2,k=25') 107 | text(0.241,0.893,labels='p=2,k=100') 108 | text(0.199,0.289,labels='p=10') 109 | text(0.118,0.066,labels='p=20') 110 | 111 | } 112 | 113 | 114 | -------------------------------------------------------------------------------- /inst/sdss2020/LocLinStudy.R~: -------------------------------------------------------------------------------- 1 | 2 | # arguments 3 | 4 | # x: matrix of numeric features 5 | # y: factor or numeric vector 6 | # newx: matrix of numeric features in new cases 7 | # k: number of nearest neighbors 8 | # mahaThresh: a proportion 9 | 10 | # each data point in newx will use as smoothing function loclin() 11 | # instead of mean() 12 | 13 | mixedPreds <- function(x,y,newx,k,mahaThresh,scaleX) 14 | { 15 | if (is.factor(y)) 
stop('classification case not yet implemented') 16 | 17 | if (scaleX) { 18 | x <- scale(x,center=TRUE,scale=TRUE) 19 | xcntr <- attr(x, "scaled:center") 20 | xscl <- attr(x, "scaled:scale") 21 | newx <- scale(newx, center = xcntr, scale = xscl) 22 | } 23 | 24 | # first get distribution of M-dist 25 | meanx <- mean(x) 26 | covx <- cov(x) 27 | mhdists <- mahalanobis(x, meanx,covx) 28 | outerThresh <- quantile(mhdists,1-mahaThresh) 29 | 30 | # which rows of newx are on the edge of the data? 31 | newxMhdists <- mahalanobis(newx,meanx,covx) 32 | outThere <- which(newxMhdists > outerThresh) 33 | # the rest 34 | mainstream <- setdiff(1:nrow(newx),outThere) 35 | 36 | # now predict 37 | predsMainstream <- kNN(x,y,newx[mainstream,],k,scaleX=FALSE) 38 | predsOutThere <- kNN(x,y,newx[outThere,],k,scaleX=FALSE, 39 | smoothingFtn=loclin) 40 | preds <- vector(length=nrow(newx)) 41 | preds[mainstream] <- predsMainstream$regests 42 | preds[outThere] <- predsOutThere$regests 43 | 44 | list(preds=preds,mainstream=mainstream,outThere=outThere, 45 | predsMainstream=predsMainstream,predsOutThere=predsOutThere, 46 | k=k,scaleX=scaleX,outerThresh=outerThresh) 47 | } 48 | 49 | # experiments on the value of expanding predictor weights 50 | 51 | # generate data, then fit both mean() and loclin(), at various levels of 52 | # the Mahalanobis distance threshold; return vector of ratios of MAPE, 53 | # loclin()/mean() 54 | 55 | simLocLin <- function(n,p,k,catOut=FALSE,seed=9999) 56 | { 57 | x <- matrix(rexp(n*p),nrow=n) 58 | y <- rowSums(x)^2 + p*rnorm(n) 59 | newx <- matrix(rexp(n*p),nrow=n) 60 | newy <- rowSums(newx)^2 + p*rnorm(n) 61 | predsZM <- kNN(x,y,newx,50,scaleX=T)$regests 62 | res <- vector(length = 25) 63 | mhs <- seq(0.001,0.25,0.001) 64 | for (i in 1:length(mhs)) { 65 | mhprop <- mhs[i] 66 | zLL <- mixedPreds(x,y,newx,50,mhprop,T) 67 | predszLL <- zLL$preds 68 | mapeLoclin <- mean(abs(predszLL[zLL$outThere] - newy[zLL$outThere])) 69 | mapeMean <- mean(abs(predsZM[zLL$outThere] - newy[zLL$outThere])) 70 | if (catOut) cat(mhprop,mapeLoclin,mapeMean,'\n') 71 | res[i] <- mapeLoclin / mapeMean 72 | } 73 | res 74 | } 75 | 76 | doSim <- function() 77 | { 78 | res.5000.2.25 <<- simLocLin(5000,2,25,catOut=T) 79 | res.5000.2.100 <<- simLocLin(5000,2,100,catOut=T) 80 | res.5000.10.25 <<- simLocLin(5000,10,25,catOut=T) 81 | res.5000.10.100 <<- simLocLin(5000,10,100,catOut=T) 82 | res.5000.20.25 <<- simLocLin(5000,20,25,catOut=T) 83 | res.5000.20.100 <<- simLocLin(5000,20,100,catOut=T) 84 | } 85 | 86 | plottingSim <- function() 87 | { 88 | 89 | plot(seq(0.001,0.25,0.001),res.5000.2.25, type = "l",ylim=c(0,1), 90 | xlab='MH distance threshold',ylab='MAPE.loclin/mean') 91 | lines(seq(0.001,0.25,0.001),res.5000.2.100, type = "l") 92 | lines(seq(0.001,0.25,0.001),res.5000.10.25, type = "l") 93 | lines(seq(0.001,0.25,0.001),res.5000.10.100, type = "l") 94 | lines(seq(0.001,0.25,0.001),res.5000.20.25, type = "l") 95 | lines(seq(0.001,0.25,0.001),res.5000.20.100, type = "l") 96 | abline(h=1) 97 | 98 | # see which line is which 99 | print(res.5000.2.25[250]) 100 | print(res.5000.2.100[250]) 101 | print(res.5000.10.25[250]) 102 | print(res.5000.10.100[250]) 103 | print(res.5000.20.25[250]) 104 | print(res.5000.20.100[250]) 105 | 106 | text(0.190,0.910,labels='p=2,k=25') 107 | text(0.241,0.893,labels='p=2,k=100') 108 | text(0.199,0.289,labels='p=10') 109 | text(0.118,0.066,labels='p=20') 110 | 111 | } 112 | 113 | 114 | -------------------------------------------------------------------------------- /man/krsFit.Rd: 
-------------------------------------------------------------------------------- 1 | \name{krsFit} 2 | \alias{krsFit} 3 | \alias{krsFitImg} 4 | \alias{diagNeural} 5 | \alias{predict.krsFit} 6 | 7 | \title{Tools for Neural Networks} 8 | 9 | \description{ 10 | Tools to complement existing neural networks software, notably 11 | a more "R-like" wrapper to fitting data with R's \pkg{keras} package. 12 | } 13 | 14 | \usage{ 15 | krsFit(x,y,hidden,acts=rep("relu",length(hidden)),learnRate=0.001, 16 | conv=NULL,xShape=NULL,classif=TRUE,nClass=NULL,nEpoch=30, 17 | scaleX=TRUE,scaleY=TRUE) 18 | krsFitImg(x,y,hidden=c(100,100),acts=rep("relu",length(hidden)), 19 | nClass,nEpoch=30) 20 | \method{predict}{krsFit}(object,...) 21 | diagNeural(krsFitOut) 22 | } 23 | 24 | \arguments{ 25 | \item{object}{An object of class 'krsFit'.} 26 | \item{...}{Data points to be predicted, 'newx'.} 27 | \item{x}{X data, predictors, one row per data point, in the training 28 | set. Must be a matrix.} 29 | \item{y}{Numeric vector of Y values. In classification case 30 | must be integers, not an R factor, and take on the values 0,1,2,..., 31 | \code{nClass}-1}. 32 | \item{hidden}{Vector of number of units per 33 | hidden layer, or the rate for a dropout layer.} 34 | \item{acts}{Vector of names of the activation functions, one per 35 | hidden layer. Choices inclde 'relu', 'sigmoid', 'tanh', 'softmax', 36 | 'elu', 'selu'.} 37 | \item{learnRate}{Learning rate.} 38 | \item{conv}{R list specifying the convolutional layers, if any.} 39 | \item{xShape}{Vector giving the number of rows and columns, and in the 40 | convolutional case with multiple channels, the number of channels.} 41 | \item{classif}{If TRUE, indicates a classification problem.} 42 | \item{nClass}{Number of classes.} 43 | \item{nEpoch}{Number of epochs.} 44 | \item{krsFitOut}{An object returned by \code{krstFit}.} 45 | \item{scaleX}{If TRUE, scale X columns.} 46 | \item{scaleY}{If TRUE, scale Y columns.} 47 | } 48 | 49 | \details{ 50 | 51 | The \code{krstFit} function is a wrapper for the entire pipeline 52 | in fitting the R \pkg{keras} package to a dataset: Defining the model, 53 | compiling, stating the inputs and so on. As a result, the wrapper 54 | allows the user to skip those details (or not need to even know them), 55 | and define the model in a manner more familiar to R users. 56 | 57 | The paired \code{predict.krsFit} takes as its first argument the output 58 | of \code{krstFit}, and \code{newx}, the points to be predicted. 
59 | } 60 | 61 | \examples{ 62 | 63 | \dontrun{ 64 | library(keras) 65 | data(peDumms) 66 | ped <- peDumms[,c(1,20,22:27,29,32,31)] 67 | # predict wage income 68 | x <- ped[,-11] 69 | y <- ped[,11] 70 | z <- krsFit(x,y,c(50,50,50),classif=FALSE,nEpoch=25) 71 | preds <- predict(z,x) 72 | mean(abs(preds-y)) # something like 25000 73 | 74 | x <- ped[,-(4:8)] 75 | y <- ped[,4:8] 76 | y <- dummiesToInt(y,FALSE) - 1 77 | z <- krsFit(x,y,c(50,50,0.20,50),classif=TRUE,nEpoch=175,nClass=6) 78 | preds <- predict(z,x) 79 | mean(preds == y) # something like 0.39 80 | 81 | # obtain MNIST training and test sets; the following then uses the 82 | # example network of 83 | 84 | # https://databricks-prod-cloudfront.cloud.databricks.com/ 85 | # public/4027ec902e239c93eaaa8714f173bcfc/2961012104553482/ 86 | # 4462572393058129/1806228006848429/latest.html 87 | 88 | # converted to use the krsFit wrapper 89 | 90 | x <- mntrn[,-785] / 255 91 | y <- mntrn[,785] 92 | xShape <- c(28,28) 93 | 94 | # define convolutional layers 95 | conv1 <- list(type='conv2d',filters=32,kern=3) 96 | conv2 <- list(type='pool',kern=2) 97 | conv3 <- list(type='conv2d',filters=64,kern=3) 98 | conv4 <- list(type='pool',kern=2) 99 | conv5 <- list(type='drop',drop=0.5) 100 | 101 | # call wrapper, 1 dense hidden layer of 128 units, then dropout layer 102 | # with proportion 0.5 103 | z <- krsFit(x,y,conv=list(conv1,conv2,conv3,conv4,conv5),c(128,0.5), 104 | classif=TRUE,nClass=10,nEpoch=10,xShape=c(28,28),scaleX=FALSE,scaleY=FALSE) 105 | 106 | # try on test set 107 | preds <- predict(z,mntst[,-785]/255) 108 | mean(preds == mntst[,785]) # 0.98 in my sample run 109 | 110 | } 111 | 112 | } 113 | 114 | \author{ 115 | Norm Matloff 116 | } 117 | 118 | -------------------------------------------------------------------------------- /man/ovalogtrn.Rd: -------------------------------------------------------------------------------- 1 | \name{multiclass routines} 2 | \alias{boundaryplot} 3 | \alias{ovalogtrn} 4 | \alias{ovaknntrn} 5 | \alias{ovalogpred} 6 | \alias{avalogtrn} 7 | \alias{avalogpred} 8 | \alias{predict.ovaknn} 9 | \alias{classadjust} 10 | \alias{confusion} 11 | \alias{factorTo012ec} 12 | \alias{classadjust} 13 | 14 | \title{Classification with More Than 2 Classes} 15 | 16 | \description{ 17 | Tools for multiclass classification, parametric and nonparametric. 18 | } 19 | 20 | \usage{ 21 | avalogtrn(trnxy,yname) 22 | ovaknntrn(trnxy,yname,k,xval=FALSE) 23 | avalogpred() 24 | classadjust(econdprobs,wrongprob1,trueprob1) 25 | boundaryplot(y01,x,regests,pairs=combn(ncol(x),2),pchvals=2+y01,cex=0.5,band=0.10) 26 | } 27 | 28 | \arguments{ 29 | \item{pchvals}{Point size in base-R graphics.} 30 | \item{trnxy}{Data matrix, Y last.} 31 | \item{xval}{If TRUE, use leaving-one-out method.} 32 | \item{y01}{Y vector (1s and 0s).} 33 | \item{regests}{Estimated regression function values.} 34 | \item{x}{X data frame or matrix.} 35 | \item{pairs}{Two-row matrix, column i of which is a pair of predictor 36 | variables to graph.} 37 | \item{cex}{Symbol size for plotting.} 38 | \item{band}{If \code{band} is non-NULL, only points within \code{band}, 39 | say 0.1, of est. 
P(Y = 1) are displayed, for a contour-like effect.} 40 | \item{yname}{Name of the Y column.} 41 | \item{k}{Number of nearest neighbors.} 42 | \item{econdprobs}{Estimated conditional class probabilities, given the 43 | predictors.} 44 | \item{wrongprob1}{Incorrect, data-provenanced, unconditional P(Y = 1).} 45 | \item{trueprob1}{Correct unconditional P(Y = 1).} 46 | } 47 | 48 | \details{ 49 | 50 | These functions aid classification in the multiclass setting. 51 | 52 | The function \code{boundaryplot} serves as a visualization technique, 53 | for the two-class setting. It draws the boundary between predicted Y = 54 | 1 and predicted Y = 0 data points in 2-dimensional feature space, as 55 | determined by the argument \code{regests}. Used to visually assess 56 | goodness of fit, typically running this function twice, say one for 57 | \code{glm} then for \code{kNN}. If there is much discrepancy and the 58 | analyst wishes to still use glm(), he/she may wish to add polynomial 59 | terms. 60 | 61 | The functions not listed above are largely deprecated, e.g. in favor of 62 | \code{qeLogit} and the other \code{qe}-series functions. 63 | 64 | } 65 | 66 | \examples{ 67 | 68 | \dontrun{ 69 | 70 | 71 | data(oliveoils) 72 | oo <- oliveoils[,-1] 73 | 74 | # toy example 75 | set.seed(9999) 76 | x <- runif(25) 77 | y <- sample(0:2,25,replace=TRUE) 78 | xd <- preprocessx(x,2,xval=FALSE) 79 | kout <- ovaknntrn(y,xd,m=3,k=2) 80 | kout$regest # row 2: 0.0,0.5,0.5 81 | predict(kout,predpts=matrix(c(0.81,0.55,0.15),ncol=1)) # 0,2,0or2 82 | yd <- factorToDummies(as.factor(y),'y',FALSE) 83 | kNN(x,yd,c(0.81,0.55,0.15),2) # predicts 0, 1or2, 2 84 | 85 | data(peDumms) # prog/engr data 86 | ped <- peDumms[,-33] 87 | ped <- as.matrix(ped) 88 | x <- ped[,-(23:28)] 89 | y <- ped[,23:28] 90 | knnout <- kNN(x,y,x,25,leave1out=TRUE) 91 | truey <- apply(y,1,which.max) - 1 92 | mean(knnout$ypreds == truey) # about 0.37 93 | xd <- preprocessx(x,25,xval=TRUE) 94 | kout <- knnest(y,xd,25) 95 | preds <- predict(kout,predpts=x) 96 | hats <- apply(preds,1,which.max) - 1 97 | mean(yhats == truey) # about 0.37 98 | 99 | data(peFactors) 100 | # discard the lower educ-level cases, which are rare 101 | edu <- peFactors$educ 102 | numedu <- as.numeric(edu) 103 | idxs <- numedu >= 12 104 | pef <- peFactors[idxs,] 105 | numedu <- numedu[idxs] 106 | pef$educ <- as.factor(numedu) 107 | pef1 <- pef[,c(1,3,5,7:9)] 108 | 109 | # ovalog 110 | ovaout <- ovalogtrn(pef1,"occ") 111 | preds <- predict(ovaout,predpts=pef1[,-3]) 112 | mean(preds == factorTo012etc(pef1$occ)) # about 0.39 113 | 114 | # avalog 115 | 116 | avaout <- avalogtrn(pef1,"occ") 117 | preds <- predict(avaout,predpts=pef1[,-3]) 118 | mean(preds == factorTo012etc(pef1$occ)) # about 0.39 119 | 120 | # knn 121 | 122 | knnout <- ovalogtrn(pef1,"occ",25) 123 | preds <- predict(knnout,predpts=pef1[,-3]) 124 | mean(preds == factorTo012etc(pef1$occ)) # about 0.43 125 | 126 | data(oliveoils) 127 | oo <- oliveoils 128 | oo <- oo[,-1] 129 | knnout <- ovaknntrn(oo,'Region',10) 130 | # predict a new case that is like oo1[1,] but with palmitic = 950 131 | newx <- oo[1,2:9,drop=FALSE] 132 | newx[,1] <- 950 133 | predict(knnout,predpts=newx) # predicts class 2, South 134 | 135 | } 136 | 137 | } 138 | 139 | \author{ 140 | Norm Matloff 141 | } 142 | 143 | -------------------------------------------------------------------------------- /man/lmac.Rd: -------------------------------------------------------------------------------- 1 | \name{lmac,makeNA,coef.lmac,vcov.lmac,pcac,loglinac,tbltofakedf} 2 
| \alias{lmac} 3 | \alias{pcac} 4 | \alias{coef.lmac} 5 | \alias{vcov.lmac} 6 | \alias{loglinac} 7 | \alias{tbltofakedf} 8 | \alias{makeNA} 9 | \alias{NAsTo0s} 10 | \alias{ZerosToNAs} 11 | 12 | \title{Available Cases Method for Missing Data} 13 | 14 | \description{ 15 | Various estimators that handle missing data via the Available Cases Method 16 | } 17 | 18 | \usage{ 19 | lmac(xy,nboot=0) 20 | makeNA(m,probna) 21 | NAsTo0s(x) 22 | ZerosToNAs(x,replaceVal=0) 23 | \method{coef}{lmac}(object,...) 24 | \method{vcov}{lmac}(object,...) 25 | pcac(indata,scale=FALSE) 26 | loglinac(x,margin) 27 | tbltofakedf(tbl) 28 | } 29 | 30 | \arguments{ 31 | \item{replaceVal}{Value to be replaced by NA.} 32 | \item{xy}{Matrix or data frame, X values in the first columns, Y 33 | in the last column.} 34 | \item{indata}{Matrix or data frame.} 35 | \item{x}{Matrix or data frame, one column per variable.} 36 | \item{nboot}{If positive, number of bootstrap samples to take.} 37 | \item{probna}{Probability that an element will be NA.} 38 | \item{scale}{If TRUE, call \code{cor} instead of \code{cov}.} 39 | \item{tbl}{An R table.} 40 | \item{m}{Number of synthetic NAs to insert.} 41 | \item{object}{Output from \code{lmac}.} 42 | \item{...}{Needed for consistency with generic function. Not used.} 43 | \item{margin}{A list of vectors specifying the model, as in 44 | \code{loglin}.} 45 | 46 | } 47 | 48 | \details{ 49 | 50 | The Available Cases (AC) approach applies to statistical methods that 51 | depend only on products of k of the variables, so that cases having 52 | non-NA values for those k variables can be used, as opposed to using 53 | only cases that are fully intact in all variables, the Complete Cases 54 | (CC) approach. In the case of linear regression, for instance, the 55 | estimated coefficients depend only on covariances between the 56 | variables (both predictors and response). This approach assumes thst 57 | the cases with missing values have the same distribution as the 58 | intact cases. 59 | 60 | The \code{lmac} function forms OLS estimates as with \code{lm}, but 61 | applying AC, in contrast to \code{lm}, which uses the CC method. 62 | 63 | The \code{pcac} function is an AC substitute for \code{prcomp}. The 64 | data is centered, corresponding to a fixed value of \code{center = 65 | TRUE} in \code{prcomp}. It is also scaled if \code{scale} is TRUE, 66 | corresponding \code{scale = TRUE} in \code{prcomp}. Due to AC, 67 | there is a small chance of negative eigenvalues, in which case 68 | \code{stop} will be called. 69 | 70 | The \code{loglinac} function is an AC substitute for \code{loglin}. 71 | The latter takes tables as input, but \code{loglinac} takes the raw 72 | data. If you have just the table, use \code{tbltofakedf} to 73 | regenerate a usable data frame. 74 | 75 | The \code{makeNA} function is used to insert random NA values into 76 | data, for testing purposes. 
77 | 78 | } 79 | 80 | \value{ 81 | 82 | For \code{lmac}, an object of class \code{lmac}, with components 83 | 84 | \itemize{ 85 | 86 | \item{coefficients}, as with \code{lm}; 87 | accessible directly or by calling \code{coef}, as with \code{lm} 88 | 89 | \item{fitted.values}, as with \code{lm} 90 | 91 | \item{residuals}, as with \code{lm} 92 | 93 | \item{r2}, (unadjusted) R-squared 94 | 95 | \item{cov}, for \code{nboot > 0} the estimated covariance matrix 96 | of the vector of estimated regression coefficients; accessible 97 | directly or by calling \code{vcov}, as with \code{lm} 98 | 99 | } 100 | 101 | For \code{pcac}, an R list, with components 102 | 103 | \itemize{ 104 | 105 | \item{sdev}, as with \code{prcomp} 106 | 107 | \item{rotation}, as with \code{prcomp} 108 | 109 | } 110 | 111 | For \code{loglinac}, an R list, with components 112 | 113 | \itemize{ 114 | 115 | \item{param}, estimated coefficients, as in \code{loglin} 116 | 117 | \item{fit}, estimated expected call counts, as in \code{loglin} 118 | 119 | } 120 | 121 | } 122 | 123 | \examples{ 124 | n <- 25000 125 | w <- matrix(rnorm(2*n),ncol=2) # x and epsilon 126 | x <- w[,1] 127 | y <- x + w[,2] 128 | # insert some missing values 129 | nmiss <- round(0.1*n) 130 | x[sample(1:n,nmiss)] <- NA 131 | nmiss <- round(0.2*n) 132 | y[sample(1:n,nmiss)] <- NA 133 | acout <- lmac(cbind(x,y)) 134 | coef(acout) # should be near pop. values 0 and 1 135 | } 136 | 137 | \author{ 138 | Norm Matloff 139 | } 140 | 141 | -------------------------------------------------------------------------------- /inst/ScalingInPCA.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Scaling in PCA 2 | 3 | Many resources on machine learning (ML) methodology recommend, or even 4 | state as crucial, that one *scale* (or *standardize*) one's data, i.e. 5 | divide each variable by its standard deviation (after subtracting the 6 | mean), before applying Principal Component Analysis (PCA). Here we will 7 | show why that can be problematic, and provide alternatives. 8 | 9 | ## Overview 10 | 11 | The recommendation to scale is common. Here are some examples: 12 | 13 | * R **prcomp()** man page 14 | 15 | They say "scaling is advisable": 16 | 17 | > scale.: a logical value indicating whether the variables should be 18 | > scaled to have unit variance before the analysis takes place. 19 | > The default is ‘FALSE’ for consistency with S, but in general 20 | > scaling is advisable. 21 | 22 | * [Scikit-Learn](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html): 23 | 24 | Actually, the mention of normal distributions is misleading and in 25 | any case not relevant, but again there is a rather imperative statement 26 | to scale: 27 | 28 | > Feature scaling through standardization (or Z-score normalization) can 29 | > be an important preprocessing step for many machine learning algorithms. 30 | > Standardization involves rescaling the features such that they have the 31 | > properties of a standard normal distribution with a mean of zero and a 32 | > standard deviation of one. 33 | 34 | * [DataCamp](https://www.datacamp.com/community/tutorials/pca-analysis-r) 35 | 36 | Again, their phrasing is rather imperative: 37 | 38 | > Note that the units used [in the **mtcars** dataset] vary and occupy 39 | > different scales...You will also set two arguments, center and scale, to 40 | > be TRUE. 
41 | 42 | * [caret](https://cran.r-project.org/package=caret), **preProcess** man 43 | page 44 | 45 | Scaling done unless you say no: 46 | 47 | > If PCA is requested but centering and scaling are not, the values will 48 | > still be centered and scaled. 49 | 50 | * [Visually Enforced](https://www.gastonsanchez.com/visually-enforced/how-to/2012/06/17/PCA-in-R/) 51 | 52 | The word "must" is used here: 53 | 54 | > Since most of the times the variables are measured in different scales, 55 | > the PCA must be performed with standardized data (mean = 0, variance = 56 | > 1). 57 | 58 | ## The perceived problem 59 | 60 | As the DataCamp statement notes, some data may be "large" while other 61 | data are "small." There is a concern that, without scaling, the large 62 | ones will artificially dominate. This is especially an issue in light 63 | of the variation in measurement systems -- should a variable measured in 64 | kilometers be given more weight than one measured in miles? 65 | 66 | ## Motivating counterexample 67 | 68 | Consider a setting with two independent variables, A and B, with means 69 | 100, and with Var(A) = 500 and Var(B) = 2. Let A' and B' denote these 70 | variables after centering and scaling. 71 | 72 | PCA is all about removing variables with small variance, as they are 73 | essentially constant. If we work with A and B, we would of course use 74 | only A. **But if we work with A' and B', we would use both of them, as 75 | they both have variance 1.0.** Scaling has seriously misled us here. 76 | 77 | ## Alternatives 78 | 79 | The real goal should be to make the variables *commensurate*. 80 | Standardizing to mean 0, variance 1 is not the only way one can do this. 81 | Consider the following alternatives. 82 | 83 | * Do nothing. In many data sets, the variables of interest are already 84 | commensurate. Consider survey data, say, with each survey question 85 | asking for a response on a scale of 1 to 5. No need to transform the 86 | data here, and worse, standardizing would have the distoritionary effect 87 | of exaggerating rare values in items with small variance. 88 | 89 | * Map each variable to the interval [0,1], i.e. t -> (t-m)/(M-m), where 90 | m and M are the minimum and maximum values of the given variable. 91 | This is typically better than standardizing, but it does have some 92 | problems. First, it is sensitive to outliers. This might be 93 | ameliorated with a modified form of the transformation, but a second 94 | problem is that new data -- new data in prediction applications, say -- 95 | may stray from this [0,1] world. 96 | 97 | * Instead of changing the *standard deviation* of a variable to 1.0, 98 | change its *mean* to 1.0. This addresses the miles-vs.-kilometers 99 | concern more directly, without inducing the distortions I described 100 | above. And if one is worried about outliers, then divide the variable 101 | by the median or other trimmed mean. 102 | 103 | 104 | -------------------------------------------------------------------------------- /R/Penrose.R: -------------------------------------------------------------------------------- 1 | 2 | # routines to explore effect of deliberate overfitting beyond 3 | # "interpolation," i.e. 
beyond "perfect fit" 4 | 5 | ################### penroseLM() ######################### 6 | 7 | # Penrose inverse version of lm(); a 1s col is added as in lm() 8 | 9 | # arguments: 10 | 11 | # d:data frame; must be numeric 12 | # yName: name of "Y" column 13 | 14 | # value: 15 | 16 | # object of class 'penroseLM',with beta-hat as 'bh' and colnames(x) 17 | 18 | penroseLM <- function(d,yName) 19 | { 20 | ycol <- which(names(d) == yName) 21 | x <- cbind(1,as.matrix(d[,-ycol])) 22 | xnms <- colnames(x) 23 | y <- d[,ycol] 24 | # MASS::ginv() does Penrose inverse 25 | res <- list(bh=MASS::ginv(x) %*% y, xnms=xnms) 26 | class(res) <- 'penroseLM' 27 | res 28 | } 29 | 30 | # arguments: 31 | 32 | # object: return value of penroseLM() 33 | # newx: data frame in the same format as x in penroseLM(); numeric 34 | 35 | predict.penroseLM <- function(object,...) 36 | { 37 | arglist <- list(...) 38 | newx <- arglist[[1]] 39 | 40 | if(names(newx) != object$xnms) stop('name mismatch') 41 | newx <- cbind(1,as.matrix(newx)) 42 | bh <- object$bh 43 | newx %*% bh 44 | } 45 | 46 | ################### penrosePoly() ######################### 47 | 48 | # polynomial regression with Penrose inverse; uses polyreg 49 | 50 | penrosePoly <- function(d,yName,deg,maxInteractDeg=deg) 51 | { 52 | requireNamespace('polyreg') 53 | ycol <- which(names(d) == yName) 54 | x <- as.matrix(d[,-ycol,drop=FALSE]) 55 | polyout <- polyreg::getPoly(x,deg=deg,maxInteractDeg=maxInteractDeg) 56 | xPoly <- polyout$xdata # polynomial version of x 57 | y <- d[,ycol] 58 | xy <- cbind(xPoly,y) 59 | res <- list(bh=penroseLM(xy,'y')$bh, 60 | deg=deg, 61 | maxInteractDeg=maxInteractDeg, 62 | modelFormula=polyout$modelFormula, 63 | XtestFormula=polyout$XtestFormula, 64 | retainedNames=polyout$retainedNames, 65 | standardize=FALSE 66 | ) 67 | class(res) <- 'penrosePoly' 68 | res 69 | } 70 | 71 | predict.penrosePoly <- function(object,...) 72 | { 73 | requireNamespace('polyreg') 74 | arglist <- list(...) 
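   # as in predict.penroseLM(), the single ... argument is taken to be
   # newx, the new data points to be predicted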
75 | newx <- arglist[[1]] 76 | 77 | if (nrow(newx) == 1) { 78 | # problem in getPoly() for case of a 1-row newx, reported to PM; 79 | # have this workaround for now 80 | oneRow <- TRUE 81 | newx <- rbind(newx,newx) 82 | } else oneRow <- FALSE 83 | polyout <- polyreg::getPoly(newx, 84 | deg=object$deg, 85 | maxInteractDeg = object$maxInteractDeg, 86 | modelFormula = object$modelFormula, 87 | retainedNames = object$retainedNames) 88 | xPoly <- polyout$xdata # polynomial version of newx 89 | xPoly <- as.matrix(xPoly) 90 | xPoly <- cbind(1,xPoly) 91 | bh <- object$bh 92 | res <- xPoly %*% bh 93 | if (oneRow) res <- res[1] 94 | res 95 | } 96 | 97 | predpnr <- predict.penrosePoly 98 | 99 | ################### ridgePoly() ######################### 100 | 101 | # according to Hastie et al, "properly tuned" ridge regression beats 102 | # mininum norm 103 | 104 | # polynomial regression with Penrose inverse; uses polyreg 105 | 106 | ridgePoly <- function(d,yName,deg,maxInteractDeg=deg) 107 | { 108 | requireNamespace('polyreg') 109 | if (!allNumeric(d)) stop('for now, X,Y must be numeric') 110 | ycol <- which(names(d) == yName) 111 | x <- as.matrix(d[,-ycol]) 112 | polyout <- polyreg::getPoly(x,deg=deg,maxInteractDeg=maxInteractDeg) 113 | xPoly <- polyout$xdata # polynomial version of x 114 | xPoly <- as.matrix(xPoly) 115 | y <- d[,ycol] 116 | cvgout <- glmnet::cv.glmnet(x=xPoly,y=y,alpha=0,family='gaussian') 117 | res <- list(cvgout=cvgout, 118 | deg=deg, 119 | maxInteractDeg=maxInteractDeg, 120 | modelFormula=polyout$modelFormula, 121 | XtestFormula=polyout$XtestFormula, 122 | retainedNames=polyout$retainedNames, 123 | standardize=FALSE 124 | ) 125 | class(res) <- 'ridgePoly' 126 | res 127 | } 128 | 129 | predict.ridgePoly <- function(object,...) 130 | { 131 | requireNamespace('polyreg') 132 | arglist <- list(...) 133 | newx <- arglist[[1]] 134 | 135 | # newx must be a matrix, with the original column names 136 | if (nrow(newx) == 1) { 137 | # problem in getPoly() for case of a 1-row newx, reported to PM; 138 | # have this workaround for now 139 | oneRow <- TRUE 140 | newx <- rbind(newx,newx) 141 | } else oneRow <- FALSE 142 | polyout <- polyreg::getPoly(newx, 143 | deg=object$deg, 144 | maxInteractDeg = object$maxInteractDeg, 145 | modelFormula = object$modelFormula, 146 | retainedNames = object$retainedNames) 147 | xPoly <- polyout$xdata # polynomial version of newx 148 | xPoly <- as.matrix(xPoly) 149 | glmObject <- object$cvgout 150 | res <- predict(glmObject,s=glmObject$lambda.min,newx=xPoly) 151 | # bh <- object$bh 152 | # res <- xPoly %*% bh 153 | if (oneRow) res <- res[1] 154 | res 155 | } 156 | 157 | -------------------------------------------------------------------------------- /inst/InterpretedR.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: R and Python as Interpreted Languages, and the Roles of Vectorization and Pointers 2 | 3 | One often hears statements like "R is an interpreted language, and thus 4 | is slow." The same is true for Python and Java. But what does that 5 | really mean? 6 | 7 | ## Example 8 | 9 | Consider this code snippet: 10 | 11 | ``` r 12 | # x, creating previously, is a vector of 100 elements 13 | tot <- 0 14 | for (i in 1:100) 15 | tot <- tot + x[i] 16 | ``` 17 | 18 | The *vectorized* alternative is 19 | 20 | ``` r 21 | tot <- sum(x) 22 | ``` 23 | 24 | The second version will indeed be faster than the first (though with a 25 | small vector like this, both will execute essentially instantaneously). 
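The gap becomes visible with a larger vector (a minimal sketch; exact timings vary by machine):

``` r
x <- rnorm(1e7)             # 10 million elements
system.time({               # explicit interpreted loop
   tot <- 0
   for (i in seq_along(x)) tot <- tot + x[i]
})
system.time(tot <- sum(x))  # vectorized; typically far faster
```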
26 | But why? 27 | 28 | ## Interpreted languages 29 | 30 | R does not run directly on your machine. What runs directly is another 31 | program, the *interpreter*. Like any program running on your machine, 32 | it is written in machine language, patterns of 0s and 1s that code 33 | primitive operations, e.g. ``Add these 2 numbers.'' In most cases, that 34 | machine language was generated from C/C++ source, translated to machine 35 | code by a *compiler*. The interpreter in essence simulates a fake 36 | machine, on which your R code "runs." The details are not important for 37 | us. 38 | 39 | The reason that first code snippet above will run slowly is that the 40 | compiler will have to do a lot of repetitive work. For instance, it 41 | will have to look up where in memory **tot** and **x** are--100 times! 42 | 43 | By contrast, in that second code snippet, the **sum()** function is 44 | actual machine code (again, generated from C by the compiler). The 45 | location of **x** is set just once, and the code marches through the 46 | vector without any lookup. That code will still repeat a summation 47 | operation 100 times, but without the time-consuming lookup. The 48 | situation is similar for **tot**. 49 | 50 | ## Vectorized code 51 | 52 | We say that the second snippet is *vectorized* code, because the entire 53 | vector is processed directly by machine code. As noted, that code will 54 | still loop around 100 times, so it is not the case that the code 55 | operates on the entire vector at once (there are special types 56 | of hardware, e.g. GPUs, that *can* do this), but it will be faster for 57 | the reasons cited above. Some R code is vectorizable, some not. 58 | 59 | ## Pointers 60 | 61 | A computer's memory is broken down into *bytes*, with a certain number 62 | of bytes forming a *word*. On today's machines, word size is typically 63 | 8 bytes. So, if your machine has, say 4 Gb of RAM, then it has 0.5 64 | billion words. Your code's numbers are typically stored one to a word, 65 | e.g. 100 consecutive words in the above code, while your text data is 66 | stored one to a byte. 67 | 68 | Each byte, and each word, has an ID, called an *address*. When your 69 | code refers to a vector, say **x** above, internally it refers to the 70 | address in memory at which the vector starts. Internally, that starting 71 | address is kept in a variable called a *pointer*. 72 | 73 | Now, here is a key point: Say the vector **u** is rather long, say 100 74 | million elements, and we execute 75 | 76 | ``` r 77 | v <- u 78 | ``` 79 | 80 | Will that operation be slow, due to the need to do all that copy of one 81 | vector to another? No! Since **u** and **v** are referenced by 82 | pointers, executing the above line of code merely means copying one 83 | pointer to another; they both point to the same place. 84 | 85 | But...what if we then execute 86 | 87 | ``` r 88 | v[88] <-3 89 | ``` 90 | 91 | We want **v** to change but **u** to NOT change. Now the interpreter 92 | must do some work. In preparing to make **v** separate from **u**, 93 | the interpreter must (a) find some unused part of memory at which to 94 | create the new vector, (b) copy all of **u** to that space, (c) point 95 | **v**'s pointer to that space, and (d) set the third word in that space 96 | to 3. 97 | 98 | So, here we see two lines of code, the first appearing to be slow but 99 | actually not slow, and the second looking innocuous and fast but 100 | actually slow. Writing fast R code does take some sophistication. 
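You can watch this happen on your own machine with **system.time()**.
Here is a minimal sketch; the vector size is arbitrary and the exact
timings will vary:

``` r
u <- numeric(10^8)        # 100 million zeros, roughly 0.8 GB
system.time(v <- u)       # essentially instantaneous: only a pointer is copied
system.time(v[88] <- 3)   # noticeably slower: R first copies all of u elsewhere
u[88]                     # still 0; u itself is unchanged
```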
101 | 102 | ## Every operation is a function 103 | 104 | This is not directly related to the above material, but worth mentioning 105 | in this context. 106 | 107 | You are probably well familiar with functions, e.g. **sum()** above, but 108 | may think of something like 109 | 110 | ``` r 111 | 3 + 8 112 | ``` 113 | 114 | as a different animal. Actually, the latter is also a function call! 115 | The name of the function is **`+`**! (Note the backticks.) For 116 | example: 117 | 118 | ``` r 119 | > 3 + 8 120 | [1] 11 121 | > `+`(3,8) 122 | [1] 11 123 | ``` 124 | 125 | So, addition is done via the **`+`()** function, with the addends as 126 | arguments. The R interpreter converts that first form to the second. 127 | 128 | Similarly, we have a function **`[`** for vector element access, 129 | a function **`$`** for list element access, and so on. 130 | 131 | -------------------------------------------------------------------------------- /inst/tests/QuickTests.R: -------------------------------------------------------------------------------- 1 | 2 | # built-in data on major league baseball players 3 | data(mlb) 4 | mlb <- mlb[,3:6] # position, height, weight, age 5 | 6 | # note: qeNeural has its own random number stream (in Python), 7 | # and thus will give different numbers for the same R seed, around 13.8 8 | qeCompare(mlb,'Weight', 9 | c('qeLin','qePolyLin','qeKNN','qeRF','qeLASSO','qeNeural'),25) 10 | # qeFtn meanAcc 11 | # 1 qeLin 13.30490 12 | # 2 qePolyLin 13.33584 13 | # 3 qeKNN 13.72708 14 | # 4 qeRF 13.46515 15 | # 5 qeLASSO 13.34612 16 | # 6 qeNeural 13.89695 17 | 18 | qeCompare(mlb,'Position', 19 | c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural','qeSVM','qeGBoost'),25) 20 | # qeFtn meanAcc 21 | # 1 qeLogit 0.6677228 22 | # 2 qePolyLog 0.6843564 23 | # 3 qeKNN 0.6819802 24 | # 4 qeRF 0.6780198 25 | # 5 qeNeural 0.6708911 26 | # 6 qeSVM 0.6542574 27 | # 7 qeGBoost 0.6657426 28 | 29 | 30 | data(peFactors) 31 | pef <- peFactors[,c(1,3,5,7:9)] 32 | qeCompare(pef,'occ',c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural'),25) 33 | # qeFtn meanAcc 34 | # 1 qeLogit 0.61444 35 | # 2 qePolyLog 0.61136 36 | # 3 qeKNN 0.62524 37 | # 4 qeRF 0.61520 38 | # 5 qeNeural 0.61204 39 | # UCI vertebrae dataset, column_3C 40 | # vert <- read.table('~/Datasets/Vertebrae/column_3C.dat',header=FALSE,stringsAsFactors=TRUE) 41 | # qeCompare(vert,'V7',c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural'),25) 42 | # qeFtn meanAcc 43 | # 1 qeLogit 0.1419355 44 | # 2 qePolyLog 0.1974194 45 | # 3 qeKNN 0.2193548 46 | # 4 qeRF 0.1625806 47 | # 5 qeNeural 0.3393548 48 | 49 | set.seed(9999) 50 | 51 | # fit models 52 | knnout <- qeKNN(mlb,'Weight',k=25) 53 | rfout <- qeRF(mlb,'Weight') 54 | 55 | # mean abs. pred. error on holdout set, in pounds 56 | knnout$testAcc 57 | # [1] 11.75644 58 | rfout$testAcc 59 | # [1] 12.6787 60 | 61 | # predict a new case 62 | newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 63 | predict(knnout,newx) 64 | [,1] 65 | # [1,] 204.04 66 | predict(rfout,newx) 67 | 11 68 | # 199.1714 69 | 70 | set.seed(9999) 71 | # how about some other ML methods? 72 | lassout <- qeLASSO(mlb,'Weight') 73 | lassout$testAcc 74 | # [1] 12.31019 75 | # poly reg, degree 3 76 | polyout <- qePolyLin(mlb,'Weight',3) 77 | polyout$testAcc 78 | # [1] 13.83444 79 | nnout <- qeNeural(mlb,'Weight') 80 | # ... 
81 | nnout$testAcc 82 | # [1] 10.23094 83 | # try some nondefault hyperparams 84 | nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 85 | nnout$testAcc 86 | # [1] 13.40559 87 | 88 | # predict player position, 6 categories 89 | knnout <- qeKNN(mlb,'Position',k=25) 90 | rfout <- qeRF(mlb,'Position') 91 | knnout$testAcc 92 | # [1] 0.7524752 93 | rfout$testAcc 94 | # [1] 0.6138614 95 | table(mlb$Pos) / sum(table(mlb$Pos)) 96 | # Catcher First_Baseman Outfielder Relief_Pitcher 97 | # 0.07487685 0.05418719 0.19113300 0.31034483 98 | # Second_Baseman Shortstop Starting_Pitcher Third_Baseman 99 | # 0.05714286 0.05123153 0.21674877 0.04433498 100 | 101 | # kNN worse than always guessing Relief_Pitcher, RF about the same 102 | z <- qePolyLog(mlb,'Position',holdout=NULL) 103 | predict(z,mlb[8,-1]) 104 | # $predClasses 105 | # [1] "Outfielder" 106 | # $probs 107 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman Shortstop 108 | # [1,] 0.125 0.125 0.125 0.125 0.125 0.125 109 | # Starting_Pitcher Third_Baseman 110 | # [1,] 0.125 0.125 111 | z <- qePolyLog(mlb[,c(1,3)],'Position',holdout=NULL) 112 | predict(z,mlb[8,3]) 113 | 114 | set.seed(9999) 115 | lgout <- qeLogit(mlb,'Position') 116 | lgout$testAcc 117 | # [1] 0.6732673 118 | newx <- data.frame(Height=73.5,Age=26,Weight=200) 119 | predict(lgout,newx) 120 | # $predClasses 121 | # [1] "Relief_Pitcher" 122 | # $probs 123 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman 124 | # [1,] 0.06527784 0.05201025 0.214516 0.3336662 0.03421254 125 | # Shortstop Starting_Pitcher Third_Baseman 126 | # [1,] 0.03345139 0.2252583 0.04160745 127 | 128 | z <- qePolyLog(mlb,'Position',holdout=NULL) 129 | predict(z,mlb[8,-1]) 130 | # $predClasses 131 | # [1] "Outfielder" 132 | # $probs 133 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman Shortstop 134 | # [1,] 0.173676 0.04955253 0.1418191 0.06851684 0.04072947 0.2907195 135 | # Starting_Pitcher Third_Baseman 136 | # [1,] 0.07216886 0.1628177 137 | 138 | # check via qeLogit() 139 | mlb1 <- mlb[,c(1,3)] # Position, Weight only 140 | z <- qePolyLog(mlb1,'Position',holdout=NULL) 141 | predict(z,mlb1[8,-1]) 142 | # $predClasses 143 | # [1] "Relief_Pitcher" 144 | # $probs 145 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman 146 | # [1,] 0.09858535 0.05010766 0.03951525 0.03506279 0.05183345 147 | # Shortstop Starting_Pitcher Third_Baseman 148 | # [1,] 0.2148096 0.2074912 0.3025947 149 | mlb2 <- mlb1 150 | mlb2$wt2 <- mlb2$Weight^2 151 | z <- qeLogit(mlb2,'Position',holdout=NULL) 152 | predict(z,mlb2[8,-1]) 153 | # same as above 154 | 155 | # what about having only 2 classes? 156 | catcher <- toSubFactor(mlb$Position,'Catcher') 157 | mlb3 <- mlb 158 | mlb3$Position <- catcher 159 | predict(z,mlb3[8,-1]) 160 | # $predClasses 161 | # [1] "zzzOther" 162 | # $probs 163 | # Catcher zzzOther 164 | # [1,] 0.1533529 0.8466471 165 | 166 | # day2 167 | d2 <- day2[,-(13:14)] 168 | z <- pcaQE(0.6,d2,'tot','qeKNN',k=25,holdout=NULL) 169 | newx <- d2[8,-13] 170 | predict(z,newx) 171 | # [,1] 172 | # [1,] 1440.44 173 | 174 | -------------------------------------------------------------------------------- /inst/PoissonReg.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Poisson regression 2 | 3 | One of the most commonly used statistical methods is the *general linear 4 | model*, implemented in R as the **glm()** function. 
The most common 5 | usage of that function is for *logistic regression*, but it's also 6 | popular for *Poisson regression* (PR), the subject of this entry in our 7 | Clearing the Confusion series. PR is often used as a regression model 8 | in which the response variable Y consists of counts, typically in the 9 | one- or two-digit range. 10 | 11 | This is not a tutorial on Poisson regression (PR). It is assumed that 12 | the reader already has some familiarity with the model, and the 13 | treatment here is somewhat critical. There are two main themes: 14 | 15 | * Unlike the linear and logistic cases, there is no theoretical or 16 | modeling justification for PR. 17 | 18 | * PR changes the nature of the β coefficients in major ways that 19 | may not be desirable. 20 | 21 | ## Notation 22 | 23 | Y: the response variable 24 | 25 | X1, X2,...,Xp: the predictor 26 | variables 27 | 28 | X: (X1, X2,...,Xp) 29 | 30 | n: the sample size, i.e. number of data points 31 | 32 | ## The linear and logistic models: theoretical justifications 33 | 34 | It will be helpful to first take a brief look at the theory behind the 35 | assumptions of standard linear and logistic models. 36 | 37 | **linear model:** The classic linear model assumes that: mean Y 38 | given X is a linear combination of the Xi; the conditional 39 | distribution of Y given X is Gaussian; and the conditional variance of Y 40 | given X is constant in X (homoscedasticity). 41 | 42 | Key point: *All the assumptions of this model hold if (X,Y) has a 43 | multivariate normal distribution.* 44 | 45 | In other words, the multivariate normal model implies the classic linear 46 | model. 47 | 48 | **logistic model:** For binary Y, the logistic model is 49 | 50 | P(Y = 1 | X) = 1 / [1 + exp{-( 51 | β0 + 52 | β1 X1 + 53 | ... 54 | βp Xp 55 | )}] 56 | 57 | Key point: *This assumption holds if X | Y = i is multivariate normal 58 | with covariance matrix independent of i.* 59 | 60 | Those assumptions, by the way, are exactly those of Fisher linear 61 | discriminant analysis. In other words, LDA settings imply the logistic 62 | model. 63 | 64 | ----------------- 65 | 66 | Of course, models are always approximations, and the linear and logistic 67 | models are no exception. But multivariate data is indeed often roughly 68 | mound-shaped, i.e. multivariate Gaussian-like, making the above 69 | theoretical models reasonable choices for consideration. 70 | 71 | ## Reasons cited for using "exponential" PR 72 | 73 | When Y has the form of counts, a Poisson model naturally comes to mind. 74 | However, unlike the linear and logistic cases, *there is no theoretical 75 | justification for the standard PR model*, 76 | 77 | mean Y = 78 | exp[β0 + 79 | β1 X1 + 80 | ... 81 | βp Xp] 82 | 83 | Let's call this the Exponential Poisson model (EP). 84 | 85 | Since most parametric regression analyses use linear models, a more 86 | natural model would be the standard linear one, 87 | 88 | mean Y = 89 | β0 + 90 | β1 X1 + 91 | ... 92 | βp Xp 93 | 94 | Let's call this the Linear Poisson model (LP). 95 | 96 | Advocates of EP are uncomfortable with LP. Under the linear model, mean 97 | Y could be negative in some instances, contrary to its being a mean of 98 | counts. Thus they use **exp()** to force the mean to be nonnegative. 99 | 100 | ## A closer look 101 | 102 | A fundamental problem, often overlooked, is this: 103 | 104 | ----------------- 105 | 106 | With use of EP instead of LP, the predictor effects βi 107 | change from **additive** to **multiplicative**. 
108 | 109 | ----------------- 110 | 111 | Say X2 is age. Under LP, 1 extra year of age adds 112 | β2 to mean Y. Under EP, 1 extra year of age 113 | *multiplies* mean Y by exp(β2). 114 | 115 | In some applications, a multiplicative model is appropriate. But users 116 | should be aware of this major difference in models, and thus this major 117 | difference in interpretations of the coefficients. 118 | 119 | It must be noted that factor effects are not additive in logistic models 120 | either. However, the "data is often mound-shaped" argument at least 121 | gives a theoretical basis for considering a logistic model. EP has no 122 | such basis, and if the application at hand does not have a qualitative 123 | reason to assume multiplicativity, EP may not be justified. 124 | 125 | ## Issues with assumptions in LP 126 | 127 | Count data, at least for small mean, is not approximately normal, and in 128 | most cases it is not homoscedastic either. However, neither of these is 129 | a major problem. 130 | 131 | For large n, the Central Limit Theorem (applied in large-sample theory) 132 | shows that non-normality of the distribution of Y given X is not 133 | relevant. For small n, Gaussian linear model theory is not reliable, 134 | since no variable in the real world is normally distributed. One can 135 | and should still do inference, of course, but not take it so literally. 136 | 137 | As to heteroscedastic variance of Y given X, one can use the *sandwich 138 | estimator*. See for instance the **car** and **sandwich** packages in 139 | CRAN. 140 | 141 | ## Recommendations 142 | 143 | One can of course try both models, LP and EP, doing model fit assessment 144 | if the goal is Description or doing cross-validated assessment of 145 | predictive ability if the goal is Prediction. But again, in the 146 | Description case, one must take care in interpeting the coefficients of 147 | the two models. 148 | -------------------------------------------------------------------------------- /man/FineTuning.Rd: -------------------------------------------------------------------------------- 1 | \name{fineTuning,knnFineTune,fineTuningMult} 2 | \alias{fineTuning} 3 | \alias{fineTuningMult} 4 | \alias{knnFineTune} 5 | \alias{fineTuningPar} 6 | \alias{plot.tuner} 7 | 8 | \title{Grid Search Plus More} 9 | 10 | \description{ 11 | Adds various extra features to grid search for specified tuning 12 | parameter/hyperparameter combinations: There is a plot() function, using 13 | parallel coordinates graphs to show trends among the different 14 | combinations; and Bonferroni confidence intervals are computed to avoid 15 | p-hacking. An experimental smoothing facility is also included. 16 | } 17 | 18 | \usage{ 19 | fineTuning(dataset,pars,regCall,nCombs=NULL,specCombs=NULL,nTst=500, 20 | nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE, 21 | showProgress=TRUE,...) 22 | fineTuningMult(dataset,pars,regCall,nCombs=NULL, 23 | nTst=500,nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE, 24 | showProgress=TRUE,outDim=1,...) 25 | \method{plot}{tuner}(x,...) 26 | knnFineTune(data,yName,k,expandVars,ws,classif=FALSE,seed=9999) 27 | fineTuningPar(cls,dataset,pars,regCall,nCombs=NULL,specCombs=NULL, 28 | nTst=500,nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE) 29 | } 30 | 31 | \arguments{ 32 | \item{...}{Arguments to be passed on by \code{fineTuning} or 33 | \code{plot.tuner}.} 34 | \item{x}{Output object from \code{fineTuning}.} 35 | \item{cls}{A \code{parallel} cluster.} 36 | \item{dataset}{Data frame etc. 
containing the data to be analyzed.} 37 | \item{data}{The data to be analyzed.} 38 | \item{yName}{Quoted name of "Y" in the column names of \code{data}.} 39 | \item{expandVars}{Indices of columns in \code{data} to be weighted in 40 | distance calculations.} 41 | \item{ws}{Weights to be used for \code{expandVars}.} 42 | \item{classif}{Set to TRUE for classification problems.} 43 | \item{seed}{Seed for random number generation.} 44 | \item{pars}{R list, showing the desired tuning parameter values.} 45 | \item{regCall}{Function to be called at each parameter combination, 46 | performing the model fit etc.} 47 | \item{nCombs}{Number of parameter combinations to run. If Null, all 48 | will be run}. 49 | \item{nTst}{Number of data points to be in each holdout set.} 50 | \item{nXval}{Number of holdout sets/folds to be run for a 51 | given data partition and parameter combination.} 52 | \item{k}{Nearest-neighbor smoothing parameter.} 53 | \item{up}{If TRUE, display results in ascending order of performance 54 | value.} 55 | \item{dispOrderSmoothed}{Display in order of smoothed results.} 56 | \item{showProgress}{If TRUE, print each output line as it becomes ready.} 57 | \item{specCombs}{A data frame in which the user specifies 58 | hyperparameter parameter combinations to evaluate.} 59 | \item{outDim}{Number of components in the value returned by \code{theCall}.} 60 | 61 | } 62 | 63 | \details{ 64 | 65 | The user specifies the values for each tuning parameter in 66 | \code{pars}. This leads to a number of possible combinations of the 67 | parameters. In many cases, there are more combinations than the user 68 | wishes to try, so \code{nCombs} of them will be chosen at random. 69 | 70 | For each combination, the function will run the analysis specified by 71 | the user in \code{regCall}. The latter must have the call form 72 | 73 | \code{ftnName(dtrn,dtst,cmbi} 74 | 75 | Again, note that it is \code{fineTuning} that calls this function. It 76 | will provide the training and test sets \code{dtrn} and \code{dtst}, as 77 | well as \code{cmbi} ("combination i"), the particular parameter 78 | combination to be run at this moment. 79 | 80 | Each chosen combination is run in \code{nXval} folds. All specified 81 | combinations are run fully, as opposed to a directional "hill descent" 82 | search that hopes it might eliminate poor combinations early in the process. 83 | 84 | The function \code{knnFineTune} is a wrapper for \code{fineTuning} for 85 | k-NN problems. 86 | 87 | The function \code{plot.tuner} draws a parallel coordinates plot to 88 | visualize the grid. The argument \code{x} is the output of 89 | \code{fineTuning}. Arguments to specify in the ellipsis are: 90 | \code{col} is the column to be plotted; 91 | \code{disp} is the number to display, with \code{0}, \code{-m} and 92 | \code{+m} meaning cases with the \code{m} smallest 'smoothed' values, all 93 | cases and the \code{m} largest values of 'smoothed', respectively; 94 | \code{jit} avoids plotting coincident lines by adding jitter in the 95 | amount \code{jit * range(x) * runif(n,-0.5,0.5)}. 96 | 97 | 98 | } 99 | 100 | \value{ 101 | Object of class **''tuner'**. Contains the grid results, including 102 | upper bounds of approximate one-sided 95% confidence intervals, both 103 | univariate and Bonferroni-Dunn (adjusted for the 104 | number of parameter combinations). 105 | } 106 | 107 | \examples{ 108 | 109 | # mlb data set, predict weight using k-NN, try various values of k 110 | 111 | tc <- function(dtrn,dtst,cmbi,...) 
112 | { 113 | knnout <- kNN(dtrn[,-10],dtrn[,10],dtst[,-10],as.integer(cmbi[1])) 114 | preds <- knnout$regests 115 | mean(abs(preds - dtst[,10])) 116 | } 117 | 118 | data(mlb) 119 | mlb <- mlb[,3:6] 120 | mlb.d <- factorsToDummies(mlb) 121 | fineTuning(mlb.d,list(k=c(5,25)),tc,nTst=100,nXval=2) 122 | 123 | tc <- function(dtrn,dtst,cmbi,...) 124 | { 125 | knnout <- kNN(dtrn[,-10],dtrn[,10],dtst[,-10],as.integer(cmbi[1])) 126 | preds <- knnout$regests 127 | mean(abs(preds - dtst[,10])) 128 | } 129 | 130 | fineTuningMult(mlb.d,list(k=c(5,25)),tc,nTst=100,nXval=2) 131 | 132 | \dontrun{ 133 | library(qeML) 134 | data(svcensus) 135 | tc1 <- function(dtrn,dtst,cmbi,...) 136 | { 137 | knnout <- qeKNN(dtrn,'wageinc',as.integer(cmbi[1]),holdout=NULL) 138 | preds <- predict(knnout,dtst[,-4]) 139 | mape <- mean(abs(preds - dtst[,4])) 140 | bigprobs75 <- mean(preds > 75000) 141 | c(mape,bigprobs75) 142 | } 143 | 144 | fineTuningMult(svcensus,list(k = c(10,25)),tc1,outDim=2) 145 | } 146 | 147 | } 148 | 149 | \author{ 150 | Norm Matloff 151 | } 152 | 153 | -------------------------------------------------------------------------------- /man/factorsDummies.Rd: -------------------------------------------------------------------------------- 1 | \name{factorsToDummies} 2 | \alias{factorToDummies} 3 | \alias{factorsToDummies} 4 | \alias{dummiesToFactor} 5 | \alias{charsToFactors} 6 | \alias{factorTo012etc} 7 | \alias{getDFclasses} 8 | \alias{hasCharacters} 9 | \alias{hasFactors} 10 | \alias{toAllNumeric} 11 | \alias{toSubFactor} 12 | \alias{toSuperFactor} 13 | \alias{toAllNumeric} 14 | \alias{discretize} 15 | \alias{dummiesToInt} 16 | \alias{xyDataframeToMatrix} 17 | 18 | \title{Factor Conversion Utilities} 19 | 20 | \description{ 21 | Utilities from converting back and forth between factors and dummy 22 | variables. 23 | } 24 | 25 | \usage{ 26 | xyDataframeToMatrix(xy) 27 | dummiesToInt(dms,inclLast=FALSE) 28 | factorToDummies(f,fname,omitLast=FALSE,factorInfo=NULL) 29 | factorsToDummies(dfr,omitLast=FALSE,factorsInfo=NULL,dfOut=FALSE) 30 | dummiesToFactor(dms,inclLast=FALSE) 31 | charsToFactors(dtaf) 32 | factorTo012etc(f,earlierLevels = NULL) 33 | discretize(x,endpts) 34 | getDFclasses(dframe) 35 | hasCharacters(dfr) 36 | hasFactors(x) 37 | toAllNumeric(w,factorsInfo=NULL) 38 | toSubFactor(f,saveLevels,lumpedLevel="zzzOther") 39 | toSuperFactor(inFactor,superLevels) 40 | } 41 | 42 | \arguments{ 43 | \item{dfOut}{If TRUE, return a data frame, otherwise a matrix.} 44 | \item{dms}{Matrix or data frame of dummy columns.} 45 | \item{inclLast}{When forming a factor from dummies, include the last 46 | dummy as a level if this is TRUE.} 47 | \item{xy}{A data frame mentioned for prediction, "Y" in last column.} 48 | \item{saveLevels}{In collapsing a factor, which levels to retain.} 49 | \item{lumpedLevel}{Name of new level to be created from levels not retained.} 50 | \item{x}{A numeric vector, except in \code{hasFactors}, where it is a 51 | data frame.} 52 | \item{endpts}{Vector to be used as \code{breaks} in call to 53 | \code{cut}. 
To avoid NAs, range of the vector must cover the 54 | range of the input vector.} 55 | \item{f}{A factor.} 56 | \item{inFactor}{Original factor, to be extended.} 57 | \item{superLevels}{New levels to be added to the original factor.} 58 | \item{earlierLevels}{Previous levels found for this factor.} 59 | \item{fname}{A factor name.} 60 | \item{dfr}{A data frame.} 61 | \item{w}{A data frame.} 62 | \item{dframe}{A data frame, for which we wish to find the column classes.} 63 | \item{omitLast}{If TRUE, then generate only k-1 dummies from k factor 64 | levels.} 65 | \item{factorsInfo}{Attribute from output of \code{factorsToDummies}.} 66 | \item{factorInfo}{Attribute from output of \code{factorToDummies}.} 67 | \item{dtaf}{A data frame.} 68 | } 69 | 70 | \details{ 71 | 72 | Many R users prefer to express categorical data as R factors, or often 73 | work with data that is of this type to begin with. On the other hand, 74 | many regression packages, e.g. \pkg{lars}, disallow factors. These 75 | utilities facilitate conversion from one form to another. 76 | 77 | Here is an overview of the roles of the various functions: 78 | 79 | \itemize{ 80 | 81 | \item \code{factorToDummies}: Convert one factor to dummies, yielding a 82 | matrix of dummies corresponding to that factor. 83 | 84 | \item \code{factorsToDummies}: Convert all factors to dummies, yielding 85 | a matrix of dummies, corresponding to all factors in the input data 86 | frame. 87 | 88 | \item \code{dummiesToFactor}: Convert a set of related dummies to a 89 | factor. 90 | 91 | \item \code{factorTo012etc}: Convert a factor to a numeric code, 92 | starting at 0. 93 | 94 | \item \code{dummiesToInt}: Convert a related set of dummies to a numeric code, 95 | starting at 0. 96 | 97 | \item \code{charsToFactors}: Convert all character columns in a data 98 | frame to factors. 99 | 100 | \item \code{toAllNumeric}: Convert all factors in a data frame to 101 | dummies, yielding a new version of the data frame, including its 102 | original nonfactor columns. 103 | 104 | \item \code{toSubFactor}: Coalesce some levels of a factor, yielding a 105 | new factor. 106 | 107 | \item \code{toSuperFactor}: Add levels to a factor. Typically used in 108 | prediction contexts, in which a factor in a data point to be predicted 109 | does not have all the levels of the same factor in the training set. 110 | 111 | \\item \code{xyDataframeToMatrix}: Given a data frame to be used in 112 | a training set, with "Y" a factor in the last column, change to all 113 | numeric, with dummies in place of all "X" factors and in place of the 114 | "Y" factor. 115 | 116 | } 117 | 118 | The optional argument \code{factorsInfo} is intended for use in prediction 119 | contexts. Typically a set of new cases will not have all levels of the 120 | factor in the training set. Without this argument, only an incomplete 121 | set of dummies would be generated for the set of new cases. 122 | 123 | 124 | A key point about changing factors to dummies is that, for later 125 | prediction after fitting a model in our training set, one needs to use 126 | the same transformations. Say a factor has levels 'abc', 'de' and 'f' 127 | (and \code{omitLast = FALSE}). If we later have a set of say two new 128 | cases to predict, and their values for this factor are 'de' and 'f', we 129 | would generate dummies for them but not for 'abc', incompatible with the 130 | three dummies used in the training set. 
131 | 132 | Thus the factor names and levels are saved in attributes, and can be 133 | used as input: The relations are as follows: 134 | 135 | \itemize{ 136 | 137 | \item \code{factorsToDummies} calls \code{factorToDummies} on each 138 | factor it finds in its input data frame 139 | 140 | \item \code{factorToDummies} outputs and later inputs \code{factorsInfo} 141 | 142 | \item \code{factorsToDummies} outputs and later inputs \code{factorsInfo} 143 | 144 | } 145 | 146 | Other functions: 147 | 148 | \itemize{ 149 | 150 | \item \code{getDFclasses}: Return a vector of the classes of the columns 151 | of a data frame. 152 | 153 | \item \code{discretize}: Partition range of a vector into (not 154 | necessarily equal-length) intervals, and construct a factor from the 155 | labels of the intervals that the input elements fall into. 156 | 157 | \item \code{hasCharacters, hasFactors}: Logical scalars, TRUE if the 158 | input data frame has any character or factor columns. 159 | 160 | } 161 | 162 | 163 | } 164 | 165 | \value{The function \code{factorToDummies} returns a matrix of dummy 166 | variables, while \code{factorsToDummies} returns a new version of the 167 | input data frame, in which each factor is replaced by columns of 168 | dummies. The function \code{factorToDummies} is similar, but changes 169 | character vectors to factors. 170 | } 171 | 172 | \examples{ 173 | x <- factor(c('abc','de','f','de')) 174 | xd <- factorToDummies(x,'x') 175 | xd 176 | # x.abc x.de 177 | # [1,] 1 0 178 | # [2,] 0 1 179 | # [3,] 0 0 180 | # [4,] 0 1 181 | # attr(,"factorInfo") 182 | # attr(,"factorInfo")$fname 183 | # [1] "x" 184 | # 185 | # attr(,"factorInfo")$omitLast 186 | # [1] TRUE 187 | # 188 | # attr(,"factorInfo")$fullLvls 189 | # [1] "abc" "de" "f" 190 | w <- factor(c('de','abc','abc')) 191 | wd <- factorToDummies(w,'x',factorInfo=attr(xd,'factorInfo')) 192 | wd 193 | # x.abc x.de 194 | # [1,] 0 1 195 | # [2,] 1 0 196 | # [3,] 1 0 197 | # attr(,"factorInfo") 198 | # attr(,"factorInfo")$fname 199 | # [1] "x" 200 | # 201 | # attr(,"factorInfo")$omitLast 202 | # [1] TRUE 203 | # 204 | # attr(,"factorInfo")$fullLvls 205 | # [1] "abc" "de" "f" 206 | 207 | } 208 | 209 | \author{ 210 | Norm Matloff 211 | } 212 | 213 | -------------------------------------------------------------------------------- /inst/DstrFit.md: -------------------------------------------------------------------------------- 1 | 2 | # Fitting Continuous Parametric Distriibutions 3 | 4 | Say one has some data and wishes to find a parametric distribution family. 5 | We address two questions here: 6 | 7 | * Why is this desirable? 8 | 9 | * How might it be done? 10 | 11 | ## Why fit a parametric distribution? 12 | 13 | Many statistical and probability models assume there are some parametric 14 | distributions driving the various variables. Even in more basic 15 | settings, having a parametric model for our data allows us to compactly 16 | describe our data, with a small number of parameters sufficing. 17 | 18 | ## How can the fitting be done? 19 | 20 | In statistics, we treat our data as a sample from a parent population, 21 | in which the variable of interest, say X, has unknown density 22 | fX(t). We estimate that density from our data. 
23 | 24 | ### Running example: Pima diabetes data 25 | 26 | This is a widely-used dataset, available for instance from the **mlbench** 27 | package: 28 | 29 | ``` r 30 | > library(mlbench) 31 | > data(PimaIndiansDiabetes2) 32 | > pima <- PimaIndiansDiabetes2 # shorter to type 33 | > head(pima) # always look at your data 34 | pregnant glucose pressure triceps insulin mass pedigree age diabetes 35 | 1 6 148 72 35 NA 33.6 0.627 50 pos 36 | 2 1 85 66 29 NA 26.6 0.351 31 neg 37 | 3 8 183 64 NA NA 23.3 0.672 32 pos 38 | 4 1 89 66 23 94 28.1 0.167 21 neg 39 | 5 0 137 40 35 168 43.1 2.288 33 pos 40 | 6 5 116 74 NA NA 25.6 0.201 30 neg 41 | > bmi <- pima$mass 42 | > bmi <- na.exclude(bmi) # exclude any row with NAs 43 | ``` 44 | ### Visual inspection 45 | 46 | So, let's plot the data. We'll use R's basic histogram function, 47 | **hist()**. A more advanced alternative is **density()**, which plots a 48 | smooth curve. In calling **hist(bmi,freq=FALSE)** (that second 49 | argument means we want area = 1.0), we produce this: 50 | 51 | ![alt text](BMIhist.png) 52 | 53 | Remember, this is a sample estimate of fX, so it can be used 54 | for deciding whether to use a given parametric model. 55 | 56 | We see a rather steep rise from 0, quickly reaching a peak, then a 57 | gradual tapering off toward 0. This suggests that the gamma 58 | distribution family may work well. 59 | 60 | There is of course the question as to how *well* this data is fit by the 61 | gamma family. We will return to this later. 62 | 63 | ### Estimating parameter values 64 | 65 | Once we decide to use a certain parametric family, we must decide which 66 | *member* of that family to use. In other words, what parameter values? 67 | 68 | The two classic ways to estimate the parameters are Maximum Likelihood 69 | Estimation (MLE) and the Method of Moments (MM). I'm partial to MM, 70 | and will use that here. 71 | 72 | The population kth moment of X is defined to be 73 | E(Xk). It can be estimated by its sample analog 74 | 75 | Mk = 76 | (1/n) Σi=1n Xik 77 | 78 | where our data are X1,...,Xn. (The two quantities 79 | are analogous because E(Xk) is the average of Xk 80 | in the population, while Mk is the average of Xk 81 | in the sample.) So M1 is simply our sample mean, "X-bar." 82 | 83 | One may also use *central* moments, e.g. Var(X) and s2. 84 | Note that s2 = M2 - (M1)2. 85 | 86 | The idea of MM is to set the population moments equal to their sample 87 | analogs. Since the former are functions of the parameters, we can solve 88 | for the parameters, which serve actually as our estimated parameters. 89 | 90 | If we have q parameters, we form equations corresponding to the first q 91 | moments. For the gamma family with paraemeters r and λ, we use 92 | the first two moments. For convenience, we'll use variance 93 | rather than E(X2). 94 | 95 | For the gamma family, EX = r/λ and Var(X) = r/λ2. 96 | So our equations are 97 | 98 | M1 = rest / λest 99 | 100 | s2 = rest / λest2 101 | 102 | Luckily, these are easy to solve. We divide the first by the second, yielding 103 | 104 | λest = M1 / s2 105 | 106 | and 107 | 108 | rest = M12 / s2 109 | 110 | Let's superimpose the fitted gamma density onto the histogram: 111 | 112 | ``` r 113 | > curve(dgamma(x,rest,lambest),0,70,add=TRUE) 114 | ``` 115 | 116 | (The function **curve()** plots a function **x**, which in this case has 117 | range (070); **add=TRUE** means superimpose this new graph onto the old 118 | one.) 
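(For completeness, here is a minimal sketch of how **rest** and
**lambest** in the call above might be computed from **bmi**, using the
Method of Moments formulas just derived; **var()** supplies the sample
variance.)

``` r
# Method of Moments estimates of the gamma parameters
m1 <- mean(bmi)       # sample first moment M1
s2 <- var(bmi)        # sample variance
lambest <- m1 / s2    # lambda estimate = M1 / s^2
rest <- m1^2 / s2     # r estimate = M1^2 / s^2
```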
119 | 120 | ![alt text](BMIfitted.png) 121 | 122 | Our fitted parametric density estimate is rather close to the one that 123 | is model-free, so the parametric model seems pretty good. 124 | 125 | Now let's try comparing CDFs. Just as a histogram is 126 | a model-free estimate of fX, a model-free estimate of 127 | FX is the *empirical CDF*: 128 | 129 | Fest(t) = proportion of Xi that are ≤ t. 130 | 131 | The R function **ecdf** computes this and sets it up for plotting. 132 | (Actually, **ecdf** returns a function of class **'ecdf'**, so calling 133 | **plot()** on the return value invokes **plot.ecdf()**.) Let's go ahead: 134 | 135 | ``` r 136 | > plot(ecdf(bmi),pch=19,cex=0.1) 137 | > curve(pgamma(x,ch,lh),0,70,add=TRUE) # range of data is (0,70) 138 | ``` 139 | 140 | ![alt text](BMIfitwell.png) 141 | 142 | The ECDF and the fitted CDF are almost identical, wonderful. We were 143 | lucky here; in most real applications, we do not achieve such a close 144 | fit, even though the fit is usable. 145 | 146 | **Related software:** 147 | 148 | The **regtools** package includes a function **mm()** for computing 149 | Method of Moments estimators, and base R has the **mle()** function for 150 | MLE. 151 | 152 | ### Assessing fit 153 | 154 | Well then, how well did our gamma model fit? 155 | 156 | First, note that there are several possible sources of discrepancies between 157 | the histogram and the fitted gamma: 158 | 159 | * The true fX is not gamma. No model is ever perfectly 160 | right. Keep in mind the famous George Box quote, "All models are 161 | false, but some are useful." 162 | 163 | * We have just a sample of values from the population, so even if our 164 | gamma model were exactly correct for the true fX, our 165 | sample estimate would not form a perfect match. The larger the sample, 166 | the more likely we will have a close fit, but never exact. 167 | 168 | * Again due to our finite sample, we have nonzero widths for the 169 | histogram intervals. 170 | 171 | So, assessing fit is a matter of attempting to decide how much of the 172 | discrepancy is due to having a finite sample, and how much is due to 173 | model error. 174 | 175 | There are formal methods of assessment, known as *goodness of fit 176 | tests*, but these days hypothesis testing and p-values are frowned upon 177 | for any type of statistical situation, for good reason. Testing is 178 | especially irrelevant in assessing model fit. They don't tell us 179 | whether our fit is "good enough" for our purposes. 180 | 181 | If you feel you must do formal assessment, I suggest forming a 182 | *Kolmogorov-Smirnov confidence band*. We will not pursue that here. 183 | 184 | -------------------------------------------------------------------------------- /inst/ChoosingKinKFoldCV.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Choosing K in K-Fold Cross Validation 2 | 3 | **N. Matloff, UC Davis** 4 | 5 | In model selection, a key goal is the avoidance of overfitting. 6 | Typically that is done by running the various models on a *training* 7 | dataset, then validating them on *test* data. The two datasets are 8 | usually obtaining by one of various kinds of partitioning of one's 9 | original data. This process is known as *cross-validation* (CV). 10 | 11 | The most commonly used form of this is *K-fold cross-validation*. Here 12 | K is the number of partitioned subsets. Note that K must be chosen by 13 | the user. 
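For readers who prefer code to prose, here is a minimal base-R sketch of
the procedure, anticipating the fuller description in the section "What
is K-fold CV?" below. The names **fitFn** and **predFn** are
placeholders for whatever fitting and prediction routines are being
compared, and mean absolute prediction error is just one possible
accuracy measure:

``` r
# minimal sketch of K-fold cross-validation for one candidate model;
# fitFn(trainData) fits the model, predFn(fit, testData) predicts,
# and the response is assumed to be in a column named 'y'
kFoldCV <- function(data, K, fitFn, predFn) {
   folds <- sample(rep(1:K, length.out = nrow(data)))  # random fold labels
   errs <- numeric(K)
   for (k in 1:K) {
      trn <- data[folds != k, , drop = FALSE]
      tst <- data[folds == k, , drop = FALSE]
      fit <- fitFn(trn)
      preds <- predFn(fit, tst)
      errs[k] <- mean(abs(preds - tst$y))   # mean absolute prediction error
   }
   mean(errs)   # average error over the K folds
}
# e.g. kFoldCV(myData, 5, function(d) lm(y ~ ., d), predict)
```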
14 | 15 | We also note in passing that in machine learning circles, it's common to 16 | partition one's data into *three* sets: One chooses one's model by 17 | fitting several to the training set, often meaning choosing values of 18 | several tuning parameters, finally selecting the one with the best 19 | performance on the test set. The third set, the *validation* set, is 20 | then used to get a realistic evaluation of the performance of the final 21 | model, as even that test set stage will induce optimistic bias. 22 | 23 | ## Goals of this document 24 | 25 | 1. Explain the real motivations behind K-fold CV, and implications for 26 | choosing K. 27 | 28 | 2. Explain the severe limitations of the related theoretical work. 29 | 30 | 3. There is no really good solution, but we will make suggestions for 31 | practical alternatives to CV. 32 | 33 | ## Notation 34 | 35 | Say we are using a vector of predictor variables/features X to predict 36 | Y. The latter may be continuous, categorical or whatever. Let n denote 37 | our number of data points, and let p be the number of predictors. 38 | 39 | If say we are predicting human weight from height and age, with a 40 | dataset consisting of 1000 people, then p = 2 and n = 1000. 41 | 42 | ## What is K-fold CV? 43 | 44 | Let n denote the number of data points. In the simplest form, we 45 | randomly split our data into two groups of n/2 points each. We fit each 46 | of the candidate models to Group 1. We then temporarily pretend 47 | we don't know the Y values in Group 2, and do the following: 48 | 49 | For each model, we take the fitted model from Group 1, and apply it on 50 | the X values in Group 2 to predict the Y values in that group. Since 51 | we know the actual Y values, we can then compare how the various models 52 | fared. We choose the one that did the best. 53 | 54 | Or we could split the data into three groups of n/3 data 55 | points each, say Groups 1, 2 and 3. For each model, we would fit that 56 | model on the 2n/3 data in Groups 1 and 2, and predict Group 3. We would 57 | next fit to Groups 1 and 3, predicting Group 2, then finally predict 58 | Group 1 from Grups 2 and 3. We would then select the model that had the 59 | best overall accuracy in this process. 60 | 61 | K-fold CV refers to this approach, in which we partition the data into K 62 | groups. In the descriptions above, we first explained K = 2, then K = 3. An 63 | important special case is K = n, sometimes called the "leaving one out" method. 64 | 65 | ## How we might choose K: a first try 66 | 67 | Why burden the user with choosing K? She may already have many other 68 | tuning paramers to worry about, e.g. node size in random forests. Here 69 | is the tradeoff: 70 | 71 | 1. Consider K = 2, perhaps the most "natural" choice of K. 72 | A problem is that our results won't be very representative. After 73 | all, in the end, in order to use our final model in predicting future cases, 74 | we will want to use all n points in our dataset. But 2-fold CV will 75 | only find the best model for n/2 amount of data. In view of the fact 76 | that, the larger our dataset, the more detailed a model we can form, K=2 77 | is not really providing what we need. 78 | 79 | 2. Now consider the other extreme, K = n. During CV, we will be 80 | fitting models to data subsets of size n-1. But since the best model for 81 | n-1 amount of data will essentially be the same as for n amount 82 | (though possibly not the case if p is large), this choice of K seems best. But 83 | there are two problems. 
84 | 85 | - Now we are facing a seriously large amount of computation -- for each 86 | of our candidate models, we do the computing n times, once for each 87 | fold. 88 | 89 | - Some theoretical work has indicated that this will not work for large 90 | n anyway, i.e. the probability of choosing the best model will 91 | actually decrease as n grows. 92 | 93 | Thus a "good" value of K would seem to be somewhere between 2 and n. 94 | Well, then, where? 95 | 96 | ## Role of the theory 97 | 98 | There has been much theoretical math done in answering the question of 99 | how one should choose K, beginning with Shao in 1993. A nice, updated 100 | account is in the book by Clark, Fokoue and Zhang (2009). See also the 101 | recent paper by Lei (2019). 102 | 103 | There has has been theoretical work aimed at deciding how large 104 | p can be, relative to n, for statistical estimators to have desired 105 | properties. The major work on this issue continues to be that of 106 | Portnoy (1988); see for instance Anirban DasGupta (2008). 107 | 108 | Though such work is impressive math, it is of rather little value in 109 | practice. Here's why: 110 | 111 | - The mathematical conditions assumed by the theorems are impossible to 112 | verify in practice, and indeed rather meaningless. 113 | 114 | - Even more important, consider a linear regression setting, so we have 115 | p+1 parameters. The theory typically assumes that most of the true 116 | regression coefficients are 0, with model selection amounting to 117 | determining which are non-0. This is not how things work in the real 118 | world. First, some true coefficients may be non-0 but small. Second, 119 | it is not necesarily true that we should use all variables with non-0 120 | coefficients, even if we knew which ones they are; there still may be 121 | too many of them to avoid overfitting. variables may result in 122 | overfitting. We thus may need to eliminate some, even with non-0 123 | values, a very different setting than what is covered by the theory. 124 | 125 | ## So, what CAN be done? 126 | 127 | Unfortunately, **there is no magic solution here**. But a reasonable 128 | approach is to limit model complexity (measured by p) in the first place. 129 | 130 | Here is the central issue: We do CV because of the optimistic bias that 131 | occurs when we assess a model by fitting and predicting on the same 132 | dataset. This is the motivation for partitioning. But if p << n, the 133 | amount of bias is negligible. In such situations, there is no need for 134 | CV. 135 | 136 | A good rule of thumb is to keep p < sqrt(n). This too is based on 137 | theory, but at least with rather minimal assumptions. Portnoy called 138 | this a "safe" strategy, and indeed I've found it to be conservative in 139 | practice, but one may consider that a virtue here. 140 | 141 | So, a reasonable approach to the CV question would be to keep p < 142 | sqrt(n), obviating the need for CV in the first place. We then would 143 | choose the richest model, i.e. the one that consists of all p candidate 144 | predictors. 145 | 146 | But the old saying, "Easier said than done," does apply. If our number 147 | of candidate features is a sizable portion of n, or even larger than n, 148 | we still must do some kind of preliminary dimension reduction to attain 149 | p < sqrt(n) before we begin model fitting. Here are a few possible 150 | approaches: 151 | 152 | - Apply PCA to the original candidate features, then use the first 153 | sqrt(n) principal components. 
154 | 155 | - Do some kind of forward selection in linear regression, analysis, 156 | limiting the number of steps to sqrt(n). 157 | 158 | So for example in the PCA approach, say we have 100 candidate 159 | predictors. We would run PCA, then fit our model to the first 10 160 | components, and that would be our final model. 161 | 162 | Again, none of these is a fully satisfying solution. For instance, PCA 163 | has its own problems if p >> n, and anyway it is always possible that a 164 | small PC can have a large regression coefficient (Zumel, 2016). But 165 | they are reasonable solutions worth trying in what remains to be one of 166 | the top knotty problems in statistics. 167 | 168 | ## References 169 | 170 | Bertrand Clarke, Ernest Fokoue, Hao Helen Zhang. *Principles and Theory 171 | for Data Mining and Machine Learning*, Springer, 2009. 172 | 173 | Anirban DasGupta. *Asymptotic Theory of Statistics and Probability*, 174 | Springer, 2008 175 | 176 | Jing Lei. Cross-Validation with Confidence, arXiv:1703.07904, 2017 177 | 178 | Stephen Portnoy. Asymptotic Behavior of Likelihood Methods for 179 | Exponential Families when the Number of Parameters Tends to Infinity, 180 | *Ann. Statist.*, Volume 16, Number 1 (1988), 356-366. 181 | 182 | Jun Shao. Linear Model Selection by Cross-Validation. *Journal of the 183 | American Statistical Association*, 88(422):486–494, 1993 184 | 185 | Nina Zumel blog, https://ninazumel.com/tag/principal-components-analysis/, 186 | 2016 187 | 188 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | author: "Norm Matloff" 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | output: rmarkdown::html_vignette 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regression and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 
49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 
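Returning to the Eickert-White correction mentioned above, here is a
hedged sketch for the linear case, using the car and lmtest packages;
the data frame pe, the object lmfit and the variable names are
placeholders standing in for the wrangled data described earlier.

<pre>
> # Eickert-White (sandwich) standard errors for the linear model
> lmfit <- lm(wageinc ~ age + I(age^2) + wkswrkd + I(wkswrkd^2) + ms + phd, data=pe)
> vc <- car::hccm(lmfit)               # heteroscedasticity-consistent covariance matrix
> lmtest::coeftest(lmfit, vcov. = vc)  # inference using the corrected standard errors
</pre>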
123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE: OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 10 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 10 estimated 148 | probabilities. Let i be the digit whose model yields the largest 149 | probability; we would then guess the digit in the image to be i. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that the classes might be linearly separable (in the statistical 156 | sense) in pairs but not otherwise. My book counters by positing that such 157 | a situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the number of points in each class is the same, 167 | or at least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 |
178 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 
179 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 
180 | > mean(ypred == lrtest[,1]) 
181 | [1] 0.7193333 
182 | 
183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 |
188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)])
189 | > ypred <- avalogpred(26,alogout,lrtest[,-1])
190 | > mean(ypred == lrtest[,1])
191 | [1] 0.8355
192 | 
193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it's better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 |
205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)])
206 | > ypred <- ovalogpred(ologout,lrtest[,-1])
207 | > mean(ypred == lrtest[,1])
208 | [1] 0.8086667
209 | 
210 | 211 | Ah, much better, though still not quite as good as AVA. 212 | 213 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd.save: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | author: "Norm Matloff" 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | output: rmarkdown::html_vignette 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regression and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 
78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE: OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 26 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 26 estimated 148 | probabilities. 
Let imax be the class that yields the largest 149 | probability; we would then guess the digit for the image to be imax. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that the classes might be linearly separable (in the statistical sense) in 156 | pairs but not otherwise. My book counters by positing that such a 157 | situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the numbers of points in each class are the same, 167 | or at least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 |
178 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 
179 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 
180 | > mean(ypred == lrtest[,1]) 
181 | [1] 0.7193333 
182 | 
183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 |
188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)])
189 | > ypred <- avalogpred(26,alogout,lrtest[,-1])
190 | > mean(ypred == lrtest[,1])
191 | [1] 0.8355
192 | 
193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it's better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 |
205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)])
206 | > ypred <- ovalogpred(ologout,lrtest[,-1])
207 | > mean(ypred == lrtest[,1])
208 | [1] 0.8086667
209 | 
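The vignette notes that the construction of the squared-term columns is "not shown." A minimal hypothetical sketch of that step, assuming (from the calls above) that the class label sits in column 1 and the 16 numeric predictors in columns 2:17 of both lrtrn and lrtest; the author's actual wrangling code may differ:

```
# Hypothetical data prep (not the vignette's actual code): append squared
# terms so that the label stays in column 1 and columns 2:33 hold the 16
# original predictors followed by their 16 squares.
addSquares <- function(xy) {
  x  <- xy[, 2:17]                      # the 16 numeric predictors
  x2 <- x^2                             # squared terms
  colnames(x2) <- paste0(colnames(x), '.sq')
  cbind(xy[, 1, drop = FALSE], x, x2)   # label, then 32 predictor columns
}
lrtrn  <- addSquares(lrtrn)
lrtest <- addSquares(lrtest)
```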
210 | 211 | Ah, much better, though still not quite as good as AVA. 212 | 213 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd~: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | \usepackage[utf8]{inputenc} 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regresison and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 
78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE; OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 26 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 26 estimated 148 | probabilities. 
Let imax be the image that yields the largest 149 | probability; we would then guess the digit for the image to be 'i'. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that might be linearly separable (in the statistical sense) in 156 | pairs but not otherwise. My book counters by positing that such a 157 | situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the numbers of points in each class is the same, 167 | or least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 | ```{r} 178 | &>; ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 179 | &>; ypred <- ovalogpred(ologout,lrtest[,-1]) 180 | &>; mean(ypred == lrtest[,1]) 181 | [1] 0.7193333 182 | ``` 183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 | ```{r} 188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 189 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 190 | > mean(ypred == lrtest[,1]) 191 | [1] 0.8355 192 | ``` 193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it’s better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 | ```{r} 205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 206 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 207 | > mean(ypred == lrtest[,1]) 208 | [1] 0.8086667 209 | ``` 210 | 211 | Ah, much better, though still not quite as good as AVA. 
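The OVA scheme described above, one logit model per class with prediction via the largest estimated probability, can be sketched in a few lines of base R. This is only an illustration of the idea, not the internals of ovalogtrn()/ovalogpred(), and it uses the built-in iris data purely as a stand-in:

```
# One-vs-All logistic regression, illustrative sketch only.
ovaFit <- function(x, y) {
  # one glm per class: class cl vs. all other classes
  # (perfect-separation warnings from glm are possible and harmless here)
  lapply(levels(y), function(cl)
    glm((y == cl) ~ ., data = x, family = binomial))
}
ovaPredict <- function(fits, newx, classes) {
  probs <- sapply(fits, predict, newdata = newx, type = 'response')
  classes[max.col(probs)]   # class with the largest estimated probability
}
x <- iris[, 1:4]; y <- iris$Species
fits <- ovaFit(x, y)
mean(ovaPredict(fits, x, levels(y)) == y)   # training-set accuracy
```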
212 | 213 | -------------------------------------------------------------------------------- /R/AC.R: -------------------------------------------------------------------------------- 1 | 2 | # Missing Values routines; also see polyanNA package 3 | 4 | ###################################################################### 5 | ###################################################################### 6 | 7 | # code to implement the Available Cases method (also called Pairwise 8 | # Complete) for handling missing data 9 | 10 | ######################## linear regression ########################## 11 | 12 | # arguments: 13 | 14 | # xy: data, with predictors in the first columns and the 15 | # response variable in the last column 16 | # nboot: if nonzero, this requests bootstrapped computation of the 17 | # estimated covariance matrix of the estimated vector of 18 | # regression coefficients 19 | 20 | # value: an object of class 'lmac', with components 21 | # 22 | # coefficients: estimated regression coefficients 23 | # fitted.values: est. regression ftn. values at 24 | # complete cases (but with full coefs.) 25 | # residuals: residuals at complete cases (but with full coefs.) 26 | # r2: R-squared 27 | # cov: optional est. covariance matrix of the coefs. 28 | 29 | lmac <- function(xy,nboot=0) { 30 | p1 <- ncol(xy) 31 | p <- p1 - 1 32 | tmp <- cov(xy,use='pairwise.complete.obs') 33 | upu <- tmp[1:p,1:p] 34 | upv <- tmp[1:p,p+1] 35 | bhat <- solve(upu,upv) 36 | lmacout <- list() 37 | class(lmacout) <- 'lmac' 38 | # bhat0 <- mean(y) - colMeans(x) %*% bhat 39 | bhat0 <- colMeans(xy,na.rm=TRUE) %*% c(-bhat,1) 40 | bhat <- c(bhat0,bhat) 41 | lmacout$coefficients <- bhat 42 | xycc <- na.omit(xy) 43 | yhat <- cbind(1,xycc[,-p1]) %*% bhat 44 | lmacout$fitted.values <- yhat 45 | lmacout$residuals <- xycc[,p1] - yhat 46 | lmacout$r2 <- (cor(yhat,xycc[,p1]))^2 47 | if (nboot > 0) { 48 | n <- nrow(xy) 49 | bootonce <- function() { 50 | idxs <- sample(1:n,n,replace=TRUE) 51 | lmac(xy[idxs,],nboot=0)$coefficients 52 | } 53 | bootout <- replicate(nboot,bootonce()) 54 | lmacout$cov<- cov(t(bootout)) 55 | } 56 | lmacout 57 | } 58 | 59 | coef.lmac <- function(object,...) { 60 | object$coefficients 61 | } 62 | 63 | vcov.lmac <- function(object,...) 
{ 64 | object$cov 65 | } 66 | 67 | ############################# PCA ############################### 68 | 69 | # arguments: 70 | # 71 | # indata: data frame or matrix 72 | # 73 | # value: list with components 'values' and 'vectors', as with eigen() 74 | 75 | pcac <- function(indata,scale=FALSE) { 76 | covcor <- if(scale) cor else cov 77 | cvr <- covcor(indata,use='pairwise.complete.obs') 78 | tmp <- eigen(cvr) 79 | res <- list() 80 | if (any(tmp$values < 0)) 81 | stop('at least one negative eigenvalue') 82 | res$sdev <- sqrt(tmp$values) 83 | res$rotation <- tmp$vectors 84 | res 85 | } 86 | 87 | ###################### log-linear model` ########################## 88 | 89 | # log-linear model; at present, handles only the 3-factor casea 90 | # 91 | # arguments: 92 | # 93 | # x: data frame/matrix, one row per observation; use tbltofakedf() 94 | # if data is in table form 95 | # margin: a list of vectors specifying the model, 96 | # as in loglin() 97 | # 98 | # value: $param and $fit components in the value emitted from R's loglin() 99 | 100 | loglinac <- function(x,margin) { 101 | # find lengths of the elements in the model, to determine what 102 | # situtation we are in 103 | termlengths <- Map(length,margin) 104 | n1 <- sum(termlengths == 1) # singletons 105 | n2 <- sum(termlengths == 2) # 2-way interactions 106 | # mdlf() ("model function") will find the right cell means 107 | # for the specified 'margin' 108 | # fully independent? 109 | if (n1 == 3) mdlf <- mindep else 110 | # one var. independent of the other 2? 111 | if (n2 == 1) mdlf <- mxindyz else 112 | # 2 vars. conditionally independent, given the 3rd? 113 | if (n2 == 2) mdlf <-myzcondindx else 114 | # case of all possible 2-way interactions not implemented, for 115 | # lack of a closed-form solution 116 | stop('case of all 2-way terms not implemented') 117 | # need an appropriate shell, with the right dimensions, labels etc.; 118 | # the contents here are irrelevant and will be overwritten 119 | x <- as.data.frame(x) 120 | tbl <- table(x) 121 | tbl <- mdlf(x,margin,tbl,termlengths) 122 | loglin(tbl,margin,param=TRUE,fit=TRUE) 123 | } 124 | 125 | # fully independent case 126 | mindep <- function(x,margin,tbl,termlengths) { 127 | nc <- ncol(x) # currently must be 3 128 | probs <- list() 129 | # find number of distinct values found in each variable, and the 130 | # estimated marginal probabilities of each value 131 | nvals <- vector(length=nc) 132 | for (i in 1:nc) { 133 | tmp <- table(x[,i]) 134 | probs[[i]] <- tmp / sum(tmp) 135 | nvals[i] <- length(tmp) 136 | } 137 | # now find estimated cell probabilities 138 | for (i in 1:nvals[1]) 139 | for (j in 1:nvals[2]) 140 | for (k in 1:nvals[3]) { 141 | tbl[i,j,k] <- 142 | probs[[1]][i] * 143 | probs[[2]][j] * 144 | probs[[3]][k] 145 | } 146 | # convert to estimated expected cell counts 147 | tbl <- nrow(x) * tbl 148 | } 149 | 150 | # case of 1 variable, X, being independent of the other 2, Y and Z 151 | mxindyz <- function(x,margin,tbl,termlengths) { 152 | # which ones are Y and Z? 153 | iyz <- margin[[1]] 154 | nc <- ncol(x) # 3 155 | # which variable is X? 
156 | ix <- setdiff((1:nc),iyz) 157 | # find number of distinct values found in each variable, and the 158 | # estimated marginal probabilities of each value 159 | probs <- list() 160 | nvals <- vector(length=nc) 161 | nvals[1] <- length(table(x[,ix])) 162 | nvals[2] <- length(table(x[,iyz[1]])) 163 | nvals[3] <- length(table(x[,iyz[2]])) 164 | tmp <- table(x[,ix]) 165 | probs[[1]] <- tmp / sum(tmp) 166 | tmp <- table(x[,iyz]) 167 | probs[[2]] <- tmp / sum(tmp) 168 | for (i in 1:nvals[1]) 169 | for (j in 1:nvals[2]) 170 | for (k in 1:nvals[3]) { 171 | if (ix == 1) { 172 | tbl[i,j,k] <- 173 | probs[[1]][i] * 174 | probs[[2]][j,k] 175 | } else if (ix == 2) { 176 | tbl[i,j,k] <- 177 | probs[[1]][j] * 178 | probs[[2]][i,k] 179 | } else { # ix = 3 180 | tbl[i,j,k] <- 181 | probs[[1]][k] * 182 | probs[[2]][i,j] 183 | } 184 | } 185 | tbl <- nrow(x) * tbl 186 | } 187 | 188 | # case of 2 variables being conditionally independent, given the 3rd 189 | myzcondindx <- function(x,margin,tbl,termlengths) { 190 | # which variable is X? 191 | ix <- intersect(margin[[1]],margin[[2]]) 192 | # which ones are Y and Z? 193 | iyz <- setdiff(union(margin[[1]],margin[[2]]),ix) 194 | iy <- iyz[1] 195 | iz <- iyz[2] 196 | # easier to keep track of all if iy < iz 197 | if (iy > iz) { 198 | tmp <- iz 199 | iz <- iy 200 | iy <- tmp 201 | } 202 | nc <- ncol(x) # currently 3 203 | # find number of distinct values found in each variable, and the 204 | # estimated marginal probabilities of each value 205 | probs <- list() 206 | nvals <- vector(length=nc) 207 | # nvals[1] <- length(table(x[,ix])) 208 | # nvals[2] <- length(table(x[,iy])) 209 | # nvals[3] <- length(table(x[,iz])) 210 | nvals[ix] <- length(table(x[,ix])) 211 | nvals[iy] <- length(table(x[,iy])) 212 | nvals[iz] <- length(table(x[,iz])) 213 | tmp <- table(x[,ix]) 214 | probs[[1]] <- tmp / sum(tmp) 215 | tmp <- table(x[,c(ix,iy)]) 216 | probs[[2]] <- tmp / sum(tmp) 217 | tmp <- table(x[,c(ix,iz)]) 218 | probs[[3]] <- tmp / sum(tmp) 219 | for (i in 1:nvals[1]) 220 | for (j in 1:nvals[2]) 221 | for (k in 1:nvals[3]) { 222 | if (ix == 1) { 223 | tbl[i,j,k] <- 224 | probs[[3]][i,k] * 225 | probs[[2]][i,j] / 226 | probs[[1]][i] 227 | } else if (ix == 2) { 228 | tbl[i,j,k] <- 229 | probs[[3]][j,k] * 230 | probs[[2]][j,i] / 231 | probs[[1]][j] 232 | 233 | } else { # ix == 3 234 | tbl[i,j,k] <- 235 | probs[[3]][k,j] * 236 | probs[[2]][k,i] / 237 | probs[[1]][k] 238 | } 239 | } 240 | tbl <- nrow(x) * tbl 241 | } 242 | 243 | # converts an R table to a fake data frame; the number of rows will be 244 | # the number of cases in the table, i.e. sum(tbl), and the number of 245 | # columns will be the dimension of the table, i.e. length(dim(tbl)); 246 | # if a cell has frequency k, it will appear k times in the output 247 | tbltofakedf <- function(tbl) { 248 | adf <- as.data.frame(tbl) 249 | nc <- ncol(adf) 250 | onecell <- function(adfrow) { 251 | freq <- as.numeric(adfrow[nc]) 252 | if (freq == 0) return(NULL) 253 | remainingrow <- adfrow[-nc] 254 | matrix(rep(remainingrow,freq),byrow=TRUE,nrow=freq) 255 | } 256 | m <- Reduce(rbind,apply(adf,1,onecell)) 257 | as.data.frame(m) 258 | } 259 | 260 | ###################################################################### 261 | ###################################################################### 262 | 263 | ############################# misc. 
############################### 264 | 265 | # for testing purposes; randomly replacing each element of matrix m by 266 | 267 | makeNA <- function(m,probna) { 268 | if (!is.matrix(m)) stop('m must be a matrix') 269 | n <- length(m) 270 | nmiss <- rbinom(1,n,probna) 271 | naidxs <- sample(1:n,nmiss,replace=FALSE) 272 | m[naidxs] <- NA 273 | m 274 | } 275 | 276 | # replace NAs by 0s 277 | 278 | NAsTo0s <- function(x) 279 | { 280 | x[is.na(x)] <- 0 281 | x 282 | } 283 | 284 | # replace 0s (or other) by NAs 285 | 286 | ZerosToNAs <- function(x,replaceVal=0) 287 | { 288 | x[x == replaceVal] <- NA 289 | x 290 | } 291 | 292 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rnw: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | 4 | \setlength{\oddsidemargin}{0in} 5 | \setlength{\evensidemargin}{0in} 6 | \setlength{\topmargin}{0.0in} 7 | \setlength{\headheight}{0in} 8 | \setlength{\headsep}{0in} 9 | \setlength{\textwidth}{6.5in} 10 | \setlength{\textheight}{9.0in} 11 | \setlength{\parindent}{0in} 12 | \setlength{\parskip}{0.1in} 13 | 14 | \usepackage{listings} 15 | 16 | \usepackage{graphicx} 17 | 18 | % library(knitr) 19 | %\VignetteIndexEntry{Partools} 20 | 21 | \begin{document} 22 | 23 | \title{regtools: Novel Tools for Linear, Nonlinear and 24 | Nonparametric Regression} 25 | 26 | \author{Norm Matloff} 27 | 28 | \date{November 6, 2016} 29 | 30 | \maketitle 31 | These tools are associated with my forthcoming book, {\it From Linear Models 32 | to Machine Learning: Modern Statistical Regression and Classification}, 33 | N. Matloff, CRC, 2017. 34 | 35 | {\it The tools are useful in general, independently of the book.} 36 | 37 | \section{FEATURES:}\label{features} 38 | 39 | \begin{itemize} 40 | \item 41 | Nonparametric regression for general dimensions in predictor and 42 | response variables, using k-Nearest Neighbors. Local-linear option. 43 | Allows for user-specified smoothing method. Allows for accelerated 44 | exploration of multiple values of k at once. Tool to aid in choosing 45 | k. 46 | \item 47 | Innovative tools for assessing fit in linear and nonlinear parametric 48 | models, via nonparametric methods. Model evaluation, examination of 49 | quadratic effects, investigation of nonhomogeneity of variance. 50 | \item 51 | Tools for multiclass classification, parametric and nonparametric. One 52 | vs.~All and All vs.~All. Novel adjustment for artificially balanced 53 | data. 54 | \item 55 | Linear regression, PCA and log-linear model estimation in missing-data 56 | setting, via the Available Cases method. 57 | \item 58 | Nicer implementation of ridge regression, with more meaningful scaling 59 | and better plotting. 60 | \item 61 | Extension to nonlinear parametric regression with of Eickert-White 62 | technique to handle heteroscedasticity. 63 | \item 64 | Misc. tools, e.g.~Method of Moments estimation (including for 65 | nonregression settings). 66 | \end{itemize} 67 | 68 | \section{EXAMPLE: MODEL FIT 69 | ASSESSMENT}\label{example-model-fit-assessment} 70 | 71 | Let's take a look at the data set prgeng, some Census data for 72 | California engineers and programmers in the year 2000. The response 73 | variable in this example is wage income, and the predictors are age, 74 | number of weeks worked, and dummy variables for MS and PhD degrees. 75 | (Some data wrangling was performed first; type ?knnest for the details.) 
76 | 77 | The fit assessment techniques in regtools gauge the fit of parametric 78 | models by comparing to nonparametric ones. Since the latter are free of 79 | model bias, they are very useful in assessing the parametric models. 80 | 81 | The function nonparvsxplot() plots the nonparametric fits against each 82 | predictor variable, for instance to explore nonlinear effects. Here is 83 | the plot for wage versus (scaled) age: 84 | 85 | \includegraphics[width=3.75in]{wagevsage.png} 86 | 87 | Of course, the effects of the other predictors don't show up here, but 88 | there does seem to be a quadratic effect. The same was true for the 89 | predictor measuring the number of weeks worked (slightly concave up, not 90 | shown here). In our linear parametric model, then, we will include 91 | squared terms for these two predictors. 92 | 93 | So, after fitting the linear model, run parvsnonparplot(), which plots 94 | the fit of the parametric model against the nonparametric one. Here is 95 | the result: 96 | 97 | \includegraphics[width=4.25in]{parvsnonpar.png} 98 | 99 | There is quite a bit suggested in this picture: 100 | 101 | \begin{itemize} 102 | \item 103 | There seems to be some overfitting near the low end, and underfitting 104 | at the high end. 105 | \item 106 | The outliers, meaning points far from the fitted linear model, are 107 | almost all below the linear fit. 108 | \item 109 | There are intriguing ``sreaks'' or ``tails'' of points, suggesting the 110 | possible existence of important subpopulations. 111 | \item 112 | There appear to be a number of people with 0 wage income. Depending on 113 | the goals of our analysis, we might consider removing them. 114 | \end{itemize} 115 | 116 | Finally, let's check the classical assumption of homoscedasticity, 117 | meaning that the conditional variance of Y given X is constant. The 118 | function nonparvarplot() plots the estimated conditional variance 119 | against the estimated conditional mean, both computed 120 | nonparametrically: 121 | 122 | \includegraphics[width=3.75in]{varvsmean.png} 123 | 124 | Wow, a hockey stick! Though there is a mild rise in coefficient of 125 | determination, i.e. standard deviation relative to the mean, up to about 126 | \$80K, the slope increases sharply after that. 127 | 128 | What to do? As long as our linear regression model assumption holds, 129 | violation of the homoscedasticity assumption won't invalidate our 130 | estimates; they still will be statistically consistent. But the standard 131 | errors we compute, and thus the statistical inference we perform, will 132 | be affected. This is correctible using the Eickert-White procedure, 133 | which for linear models is available in the car package, included in 134 | regtools. Our package also extends this to nonlinear parametric models, 135 | in our function nlshc() (the validity of this extension is shown in the 136 | book). 137 | 138 | Of course, the ``hockey stick'' form is another indication that we 139 | should further investigate the model itself. It may well be useful to 140 | fit two separate linear models, one for incomes below \$80K and the 141 | other for the higher incomes. For a more formal approach to this, we 142 | might consider changepoint methods, such as in the CRAN package chngpt. 143 | 144 | What is different: 145 | 146 | Note carefully that the above graph is unaffected by the validity of the 147 | parametric model; it is based purely on nonparametric analysis. 
This is 148 | in contrast to classic regression fit methods, most of which are based 149 | on examination of residuals of the fitted model. 150 | 151 | \section{EXAMPLE; OVA VS. AVA IN MULTICLASS 152 | PROBLEMS}\label{example-ova-vs.-ava-in-multiclass-problems} 153 | 154 | A very popular prediction method in 2-class problems is to use logistic 155 | (logit) regression. In analyzing click-through patterns of Web users, 156 | for instance, we have 2 classes, Click and Nonclick. We might fit a 157 | logistic model for Click, given user Web history, demographics and so 158 | on. Note that logit actually models probabilities, e.g.~the probability 159 | of Click given the predictor variables. 160 | 161 | But the situation is much less simple in multiclass settings. Suppose 162 | our application is recognition of hand-written digits (a famous machine 163 | learning example). The predictor variables are pixel patterns in images. 164 | There are two schools of thought on this: 165 | 166 | \begin{itemize} 167 | \item 168 | One vs.~All (OVA): We would run 26 logistic regression models, one for 169 | predicting `0' vs.~non-`0', one for `1' vs.~non-`1', and so on. For a 170 | particular image, we would thus obtain 26 estimated probabilities. Let 171 | imax be the image that yields the largest probability; we would then 172 | guess the digit for the image to be `i'. 173 | \item 174 | All vs.~All (AVA): Here we would run C(10,2) = 45 logit analyses, one 175 | for each pair of digits. There would be one for `0' vs. `1', one for 176 | `0' vs. `2', etc., all the way up through `8' vs. `9'. Many in the 177 | machine learning literature recommend AVA over OVA, on the grounds 178 | that might be linearly separable (in the statistical sense) in pairs 179 | but not otherwise. My book counters by positing that such a situation 180 | could be remedied under OVA by adding quadratic terms to the logit 181 | models. 182 | \end{itemize} 183 | 184 | At any rate, the regtools package gives you a choice, OVA or AVA, for 185 | both parametric and nonparametric methods. For example, avalogtrn() and 186 | avalogpred() do training and prediction operations for logit with AVA. 187 | 188 | Another feature concerns adjustment of class probabilities. In many 189 | multiclass data sets, the numbers of points in each class is the same, 190 | or least not reflective of the population class probabilities. In 191 | regtools, the user can specify estimates of the latter, for logit and 192 | nonparametric methods. 193 | 194 | So, let's look at an example, using the UCI Letter Recognition data set, 195 | another image recognition example. Again, the code below was preceded by 196 | some data wrangling, which changed the letter data from character to 197 | numeric, and which divided the data set into training and test sets. 198 | Here is the OVA run: 199 | 200 | \begin{lstlisting} 201 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 202 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 203 | > mean(ypred == lrtest[,1]) 204 | [1] 0.7193333 205 | \end{lstlisting} 206 | 207 | So, we get about a 72\% rate of correct classification. Now let's try 208 | AVA: 209 | 210 | \begin{lstlisting} 211 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 212 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 213 | > mean(ypred == lrtest[,1]) 214 | [1] 0.8355 215 | \end{lstlisting} 216 | 217 | AVA did considerably better, 84\%. So, apparently AVA fixed a poor 218 | model. But of course, it's better to make a good model in the first 219 | place. 
Based on our previous observation that the boundaries may be 220 | better approximated by curves than lines, let's try a quadratic model. 221 | 222 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 223 | = 120 possible interaction terms. Inclusion of all such variables would 224 | probably produce too rich a model for the 14000 points in our training 225 | set. We'll settle for adding just the squared terms (not shown): 226 | 227 | \begin{lstlisting} 228 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 229 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 230 | > mean(ypred == lrtest[,1]) 231 | [1] 0.8086667 232 | \end{lstlisting} 233 | 234 | Ah, much better, though still not quite as good as AVA. 235 | 236 | \end{document} 237 | ` 238 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rnw.save: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | 4 | \setlength{\oddsidemargin}{0in} 5 | \setlength{\evensidemargin}{0in} 6 | \setlength{\topmargin}{0.0in} 7 | \setlength{\headheight}{0in} 8 | \setlength{\headsep}{0in} 9 | \setlength{\textwidth}{6.5in} 10 | \setlength{\textheight}{9.0in} 11 | \setlength{\parindent}{0in} 12 | \setlength{\parskip}{0.1in} 13 | 14 | \usepackage{listings} 15 | 16 | \usepackage{graphicx} 17 | 18 | % library(knitr) 19 | %\VignetteIndexEntry{Partools} 20 | 21 | \begin{document} 22 | 23 | \title{regtools: Novel Tools for Linear, Nonlinear and 24 | Nonparametric Regression} 25 | 26 | \author{Norm Matloff} 27 | 28 | \date{November 6, 2016} 29 | 30 | \maketitle 31 | These tools are associated with my forthcoming book, {\it From Linear Models 32 | to Machine Learning: Modern Statistical Regression and Classification}, 33 | N. Matloff, CRC, 2017. 34 | 35 | {\it The tools are useful in general, independently of the book.} 36 | 37 | \section{FEATURES:}\label{features} 38 | 39 | \begin{itemize} 40 | \item 41 | Nonparametric regression for general dimensions in predictor and 42 | response variables, using k-Nearest Neighbors. Local-linear option. 43 | Allows for user-specified smoothing method. Allows for accelerated 44 | exploration of multiple values of k at once. Tool to aid in choosing 45 | k. 46 | \item 47 | Innovative tools for assessing fit in linear and nonlinear parametric 48 | models, via nonparametric methods. Model evaluation, examination of 49 | quadratic effects, investigation of nonhomogeneity of variance. 50 | \item 51 | Tools for multiclass classification, parametric and nonparametric. One 52 | vs.~All and All vs.~All. Novel adjustment for artificially balanced 53 | data. 54 | \item 55 | Linear regression, PCA and log-linear model estimation in missing-data 56 | setting, via the Available Cases method. 57 | \item 58 | Nicer implementation of ridge regression, with more meaningful scaling 59 | and better plotting. 60 | \item 61 | Extension to nonlinear parametric regression with of Eickert-White 62 | technique to handle heteroscedasticity. 63 | \item 64 | Misc. tools, e.g.~Method of Moments estimation (including for 65 | nonregression settings). 66 | \end{itemize} 67 | 68 | \section{EXAMPLE: MODEL FIT 69 | ASSESSMENT}\label{example-model-fit-assessment} 70 | 71 | Let's take a look at the data set prgeng, some Census data for 72 | California engineers and programmers in the year 2000. 
The response 73 | variable in this example is wage income, and the predictors are age, 74 | number of weeks worked, and dummy variables for MS and PhD degrees. 75 | (Some data wrangling was performed first; type ?knnest for the details.) 76 | 77 | The fit assessment techniques in regtools gauge the fit of parametric 78 | models by comparing to nonparametric ones. Since the latter are free of 79 | model bias, they are very useful in assessing the parametric models. 80 | 81 | The function nonparvsxplot() plots the nonparametric fits against each 82 | predictor variable, for instance to explore nonlinear effects. Here is 83 | the plot for wage versus (scaled) age: 84 | 85 | \includegraphics[width=3.75in]{wagevsage.png} 86 | 87 | Of course, the effects of the other predictors don't show up here, but 88 | there does seem to be a quadratic effect. The same was true for the 89 | predictor measuring the number of weeks worked (slightly concave up, not 90 | shown here). In our linear parametric model, then, we will include 91 | squared terms for these two predictors. 92 | 93 | So, after fitting the linear model, run parvsnonparplot(), which plots 94 | the fit of the parametric model against the nonparametric one. Here is 95 | the result: 96 | 97 | \includegraphics[width=4.25in]{parvsnonpar.png} 98 | 99 | There is quite a bit suggested in this picture: 100 | 101 | \begin{itemize} 102 | \item 103 | There seems to be some overfitting near the low end, and underfitting 104 | at the high end. 105 | \item 106 | The outliers, meaning points far from the fitted linear model, are 107 | almost all below the linear fit. 108 | \item 109 | There are intriguing ``sreaks'' or ``tails'' of points, suggesting the 110 | possible existence of important subpopulations. 111 | \item 112 | There appear to be a number of people with 0 wage income. Depending on 113 | the goals of our analysis, we might consider removing them. 114 | \end{itemize} 115 | 116 | Finally, let's check the classical assumption of homoscedasticity, 117 | meaning that the conditional variance of Y given X is constant. The 118 | function nonparvarplot() plots the estimated conditional variance 119 | against the estimated conditional mean, both computed 120 | nonparametrically: 121 | 122 | \includegraphics[width=3.75in]{varvsmean.png} 123 | 124 | Wow, a hockey stick! Though there is a mild rise in coefficient of 125 | determination, i.e. standard deviation relative to the mean, up to about 126 | \$80K, the slope increases sharply after that. 127 | 128 | What to do? As long as our linear regression model assumption holds, 129 | violation of the homoscedasticity assumption won't invalidate our 130 | estimates; they still will be statistically consistent. But the standard 131 | errors we compute, and thus the statistical inference we perform, will 132 | be affected. This is correctible using the Eickert-White procedure, 133 | which for linear models is available in the car package, included in 134 | regtools. Our package also extends this to nonlinear parametric models, 135 | in our function nlshc() (the validity of this extension is shown in the 136 | book). 137 | 138 | Of course, the ``hockey stick'' form is another indication that we 139 | should further investigate the model itself. It may well be useful to 140 | fit two separate linear models, one for incomes below \$80K and the 141 | other for the higher incomes. For a more formal approach to this, we 142 | might consider changepoint methods, such as in the CRAN package chngpt. 
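As a concrete, hedged illustration of the heteroscedasticity correction discussed above, shown here for a linear model via car::hccm() (the mlb data set is used only for convenience, and nlshc() itself is not demonstrated):

\begin{lstlisting}
# Illustrative sketch: classical vs. heteroscedasticity-robust standard
# errors for a linear model.
library(regtools)   # for data(mlb)
library(car)
data(mlb)
lmout <- lm(Weight ~ Height + Age, data = mlb)
sqrt(diag(vcov(lmout)))   # classical standard errors
sqrt(diag(hccm(lmout)))   # heteroscedasticity-robust standard errors
\end{lstlisting}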
143 | 144 | What is different: 145 | 146 | Note carefully that the above graph is unaffected by the validity of the 147 | parametric model; it is based purely on nonparametric analysis. This is 148 | in contrast to classic regression fit methods, most of which are based 149 | on examination of residuals of the fitted model. 150 | 151 | \section{EXAMPLE; OVA VS. AVA IN MULTICLASS 152 | PROBLEMS}\label{example-ova-vs.-ava-in-multiclass-problems} 153 | 154 | A very popular prediction method in 2-class problems is to use logistic 155 | (logit) regression. In analyzing click-through patterns of Web users, 156 | for instance, we have 2 classes, Click and Nonclick. We might fit a 157 | logistic model for Click, given user Web history, demographics and so 158 | on. Note that logit actually models probabilities, e.g.~the probability 159 | of Click given the predictor variables. 160 | 161 | But the situation is much less simple in multiclass settings. Suppose 162 | our application is recognition of hand-written digits (a famous machine 163 | learning example). The predictor variables are pixel patterns in images. 164 | There are two schools of thought on this: 165 | 166 | \begin{itemize} 167 | \item 168 | One vs.~All (OVA): We would run 26 logistic regression models, one for 169 | predicting `0' vs.~non-`0', one for `1' vs.~non-`1', and so on. For a 170 | particular image, we would thus obtain 26 estimated probabilities. Let 171 | imax be the image that yields the largest probability; we would then 172 | guess the digit for the image to be `i'. 173 | \item 174 | All vs.~All (AVA): Here we would run C(10,2) = 45 logit analyses, one 175 | for each pair of digits. There would be one for `0' vs. `1', one for 176 | `0' vs. `2', etc., all the way up through `8' vs. `9'. Many in the 177 | machine learning literature recommend AVA over OVA, on the grounds 178 | that might be linearly separable (in the statistical sense) in pairs 179 | but not otherwise. My book counters by positing that such a situation 180 | could be remedied under OVA by adding quadratic terms to the logit 181 | models. 182 | \end{itemize} 183 | 184 | At any rate, the regtools package gives you a choice, OVA or AVA, for 185 | both parametric and nonparametric methods. For example, avalogtrn() and 186 | avalogpred() do training and prediction operations for logit with AVA. 187 | 188 | Another feature concerns adjustment of class probabilities. In many 189 | multiclass data sets, the numbers of points in each class is the same, 190 | or least not reflective of the population class probabilities. In 191 | regtools, the user can specify estimates of the latter, for logit and 192 | nonparametric methods. 193 | 194 | So, let's look at an example, using the UCI Letter Recognition data set, 195 | another image recognition example. Again, the code below was preceded by 196 | some data wrangling, which changed the letter data from character to 197 | numeric, and which divided the data set into training and test sets. 198 | Here is the OVA run: 199 | 200 | \begin{lstlisting} 201 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 202 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 203 | > mean(ypred == lrtest[,1]) 204 | [1] 0.7193333 205 | \end{lstlisting} 206 | 207 | So, we get about a 72\% rate of correct classification. 
Now let's try 208 | AVA: 209 | 210 | \begin{lstlisting} 211 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 212 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 213 | > mean(ypred == lrtest[,1]) 214 | [1] 0.8355 215 | \end{lstlisting} 216 | 217 | AVA did considerably better, 84\%. So, apparently AVA fixed a poor 218 | model. But of course, it's better to make a good model in the first 219 | place. Based on our previous observation that the boundaries may be 220 | better approximated by curves than lines, let's try a quadratic model. 221 | 222 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 223 | = 120 possible interaction terms. Inclusion of all such variables would 224 | probably produce too rich a model for the 14000 points in our training 225 | set. We'll settle for adding just the squared terms (not shown): 226 | 227 | \begin{lstlisting} 228 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 229 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 230 | > mean(ypred == lrtest[,1]) 231 | [1] 0.8086667 232 | \end{lstlisting} 233 | 234 | Ah, much better, though still not quite as good as AVA. 235 | 236 | \end{document} 237 | ` 238 | -------------------------------------------------------------------------------- /man/Quick.Rd~: -------------------------------------------------------------------------------- 1 | \name{qe-Series Wrappers} 2 | \alias{qeLogit} 3 | \alias{qeLin} 4 | \alias{qeKNN} 5 | \alias{qeRF} 6 | \alias{qeSVM} 7 | \alias{qeGBoost} 8 | \alias{qeNeural} 9 | \alias{qeLASSO} 10 | \alias{qePolyLin} 11 | \alias{qePolyLog} 12 | \alias{qeIso} 13 | \alias{qeCompare} 14 | \alias{predict.qeLogit} 15 | \alias{predict.qeLin} 16 | \alias{predict.qeKNN} 17 | \alias{predict.qeRF} 18 | \alias{predict.qeSVM} 19 | \alias{predict.qeGBoost} 20 | \alias{predict.qeNeural} 21 | \alias{predict.qeLASSO} 22 | \alias{predict.qePolyLin} 23 | \alias{predict.qePolyLog} 24 | \alias{predict.qeIso} 25 | \alias{plot.RF} 26 | \alias{plot.LASSO} 27 | 28 | \title{Quick-Explore Regression/Classification Wrappers} 29 | 30 | \description{ 31 | Quick access to machine learning methods, with a very simple 32 | interface. Intended for convenient initial exploration of a dataset, 33 | both to gauge the predictive effectiveness of a model and to do simple 34 | prediction of new cases. Just one call needed to fit, no preliminary 35 | setup of model etc. The simplicity also makes the series useful 36 | for teaching. For advanced work, analysts may prefer to use 37 | the methods directly, in order to utilize specialized options. 
38 | } 39 | 40 | \usage{ 41 | qeLogit(data,yName,holdout=floor(min(1000,0.1*nrow(data)))) 42 | qeLin(data,yName,holdout=floor(min(1000,0.1*nrow(data)))) 43 | qeKNN(data,yName,k,scaleX=TRUE,holdout=floor(min(1000,0.1*nrow(data)))) 44 | qeRF(data,yName,nTree,minNodeSize,holdout=floor(min(1000,0.1*nrow(data)))) 45 | qeSVM(data,yName,gamma=1.0,cost=1.0,kernel='radial',degree=2, 46 | holdout=floor(min(1000,0.1*nrow(data)))) 47 | qeGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1, 48 | holdout=floor(min(1000,0.1*nrow(data)))) 49 | qeNeural(data,yName,hidden=c(100,100),nEpoch=30, 50 | holdout=floor(min(1000,0.1*nrow(data)))) 51 | qeLASSO(data,yName,alpha=1,holdout=floor(min(1000,0.1*nrow(data)))) 52 | qePolyLin(data,yName,deg=2,maxInteractDeg = deg, 53 | holdout=floor(min(1000,0.1*nrow(data)))) 54 | qePolyLog(data,yName,deg=2,maxInteractDeg = deg, 55 | holdout=floor(min(1000,0.1*nrow(data)))) 56 | qeCompare(data,yName,qeFtnList,nReps,opts=NULL,seed=9999) 57 | \method{predict}{qeLogit}(object,newx) 58 | \method{predict}{qeLin}(object,newx) 59 | \method{predict}{qeKNN}(object,newx,newxK=1) 60 | \method{predict}{qeRF}(object,newx) 61 | \method{predict}{qeSVM}(object,newx,k=25) 62 | \method{predict}{qeGBoost}(object,newx) 63 | \method{predict}{qeNeural}(object,newx) 64 | \method{predict}{qeLASSO}(object,newx) 65 | \method{predict}{qePoly}(object,newx) 66 | \method{plot}{qeLASSO}(object,newx) 67 | \method{plot}{qeSVM}(object,newx,k=25) 68 | \method{plot}{qeRF}(object,newx) 69 | } 70 | 71 | \arguments{ 72 | \item{data}{Dataframe, training set. Classification case is signaled 73 | via labels column being an R factor.} 74 | \item{yName}{Name of the class labels column.} 75 | \item{holdout}{If not NULL, form a holdout set of the specified size. 76 | After fitting to the remaining data, evaluate accuracy on the test set.} 77 | \item{k}{Number of nearest neighbors. In functions other than 78 | \code{qeKNN} for which this is an argument, it is the number of 79 | neighbors to use in finding conditional probabilities via 80 | \code{knnCalib}.} 81 | \item{scaleX}{Scale the features.} 82 | \item{nTree}{Number of trees.} 83 | \item{minNodeSize}{Minimum number of data points in a tree node.} 84 | \item{learnRate}{Learning rate.} 85 | \item{hidden}{Vector of units per hidden layer. Fractional values 86 | indicated dropout proportions.} 87 | \item{nEpoch}{Number of iterations in neural net.} 88 | \item{alpha}{1 for LASSO, 2 for ridge.} 89 | \item{gamma}{Scale parameter in \code{e1071::svm}.} 90 | \item{cost}{Cost parameter in \code{e1071::svm}.} 91 | \item{kernel}{One of 'linear','radial','polynomial' and 'sigmoid'.} 92 | \item{degree}{Degree of SVM polynomial kernel, if any.} 93 | \item{qeFtnList}{Character vector of \code{qe*} names.} 94 | \item{nReps}{Number of holdout sets to generate.} 95 | \item{opts}{R list of optional arguments for none, some or all of th 96 | functions in \code{qeFtnList}.} 97 | \item{seed}{Seed for random number generation.} 98 | } 99 | 100 | \details{ 101 | 102 | As noted, these functions are intended for quick, first-level analysis 103 | of regression or multiclass classification problems. Emphasis here is 104 | on convenience and simplicity. Currently k-NN, SVM, random forests, 105 | gradient boosting, linear model, LASSO and polynomial regression are 106 | offered. 
107 | 108 | The idea is that, given a new dataset, the analyst can quickly and 109 | easily try fitting a number of models in succession, say first k-NN, 110 | then random forests: 111 | 112 | \preformatted{ 113 | # built-in data on major league baseball players 114 | > data(mlb) 115 | > mlb <- mlb[,3:6] # position, height, weight, age 116 | 117 | # fit models 118 | > knnout <- qeKNN(mlb,'Weight',k=25) 119 | > rfout <- qeRF(mlb,'Weight') 120 | 121 | # mean abs. pred. error on holdout set, in pounds 122 | > knnout$testAcc 123 | [1] 11.75644 124 | > rfout$testAcc 125 | [1] 12.6787 126 | 127 | # predict a new case 128 | > newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 129 | > predict(knnout,newx) 130 | [,1] 131 | [1,] 204.04 132 | > predict(rfout,newx) 133 | 11 134 | 199.1714 135 | 136 | # how about some other ML methods? 137 | > lassout <- qeLASSO(mlb,'Weight') 138 | > lassout$testAcc 139 | [1] 14.23122 140 | # poly reg, degree 3 141 | > polyout <- qePolyLin(mlb,'Weight',3) 142 | > polyout$testAcc 143 | [1] 13.55613 144 | > nnout <- qeNeural(mlb,'Weight') 145 | # ... 146 | > nnout$testAcc 147 | [1] 12.2537 148 | # try some nondefault hyperparams 149 | > nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 150 | > nnout$testAcc 151 | [1] 15.17982 152 | 153 | } 154 | 155 | The optional \code{holdout} argument triggers formation of a holdout set 156 | and the corresponding cross-validation evaluation of predictive power. 157 | Note that if a holdout is formed, the return value will consist of the 158 | fit on the training set, not on the full original dataset. 159 | 160 | In most cases, the full basket of options in the wrapped function is not 161 | reflected, and second-level analysis should use the relevant packages 162 | directly. 163 | 164 | The \code{qe*} functions do model fit. Each of them has a 165 | \code{predict} method, and some also have a \code{plot} method. 166 | Arguments for \code{qe*} are at least: \code{data} and \code{yName}; 167 | arguments for \code{predict} are at least: \code{object}, the return 168 | value from \code{qe*}, and \code{newx}, a data frame of points to be 169 | predicted. In some cases, there are additional algorithm-specific 170 | parameters; default values are provided. 171 | 172 | An additional benefit is that the \code{predict} functions work 173 | correctly on new cases with R factors. The proper levels are assigned 174 | to the new cases. (Of course, if a new case has a level not in the 175 | original data, nothing can be done.) 176 | 177 | The function \code{qeLin} handles classification problems as 178 | multivariate-outcome linea models. If one's goal is prediction, it can 179 | be much faster than \code{qeLogit}, often with comparable accuracy. 180 | 181 | The \code{qePolyLin} function does polynomial regression of the indicated 182 | degree. In the above example degree 3 means all terms through degree 3, 183 | e.g. \code{Height * Age^2}. Dummy variables are handled properly, e.g. 184 | no powers of a dummy are generatd. The logistic polynomial regression version 185 | is \code{qePolyLog}. 186 | 187 | The \code{qeCompare} function does quick-and-easy cross-validated 188 | comparisons among the \code{qe*} functions. The same holdout sets are 189 | generated and used by all the functions. Default values of 190 | hyperparameters of those functions can be set via \code{opts}. 191 | 192 | The \code{qeIso} function is intended mainly for use as a smoothing 193 | method in calibration actions. 
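As a further illustrative sketch (not run here; the calls follow the usage above and output is omitted), the classification wrappers can be tried in the same way:

\preformatted{
# illustrative only; see the examples below for actual output
data(peFactors)
pef <- peFactors[,c(1,3,5,7:9)]
logitout <- qeLogit(pef,'occ')        # multiclass logistic regression
logitout$testAcc                      # holdout misclassification proportion
plogout <- qePolyLog(pef,'occ',deg=2) # quadratic logistic model
plogout$testAcc
}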
194 | 195 | } 196 | 197 | \value{ 198 | 199 | The value returned by \code{qe*} functions depends on the algorithm, but 200 | with some commonality, e.g. \code{classif}, a logical value indicating 201 | whether the problem was of classification type. 202 | 203 | If a holdout set was requested, an additional returned component will be 204 | \code{testAcc}, the accuracy on the holdout set. This will be Mean 205 | Absolute Prediction Error in the regression case, and proportion of 206 | misclassified cases in the classification case. 207 | 208 | The value returned by the \code{predict} functions is an 209 | R list with components as follows: 210 | 211 | Classification case: 212 | 213 | \itemize{ 214 | 215 | \item \code{predClasses}: R factor instance of predicted class labels 216 | 217 | \item \code{probs}: vector/matrix of class probabilities; in the 2-class 218 | case, a vector, the probabilities of Y = 1 219 | 220 | } 221 | 222 | Regression case: vector of predicted values 223 | 224 | } 225 | 226 | \examples{ 227 | 228 | # see also 'details' above 229 | 230 | \dontrun{ 231 | 232 | data(peFactors) 233 | pef <- peFactors[,c(1,3,5,7:9)] 234 | # most people in the dataset have at least a Bachelor's degree; so let's 235 | # just consider Master's (code 14) and PhD (code 16) as special 236 | pef$educ <- toSubFactor(pef$educ,c('14','16')) 237 | 238 | # predict occupation; 6 classes, 100, 101, 102, 106, 140, 141, using SVM 239 | svmout <- qeSVM(pef,'occ',holdout=NULL) 240 | # as example of prediction, take the 8th case, but change the gender and 241 | # age to female and 25; note that by setting k to non-null, we are 242 | # requesting that conditional probabilities be calculated, via 243 | # knnCalib(), here using 25 nearest neighbors 244 | newx <- pef[8,-3] 245 | newx$sex <- '2' 246 | newx$age <- 25 247 | predict(svmout,newx,k=25) 248 | # $predClasses 249 | # 8 250 | # 100 251 | # Levels: 100 101 102 106 140 141 252 | # $dvals 253 | # 102/101 102/100 102/141 102/140 102/106 101/100 101/141 254 | # 8 -0.7774038 -0.5132022 0.9997894 1.003251 0.999688 -0.4023077 1.000419 255 | # 101/140 101/106 100/141 100/140 100/106 141/140 141/106 140/106 256 | # 8 1.000474 0.9997371 1.000088 1.000026 1.000126 0.9460703 -0.4974625 -1.035721 257 | # 258 | # $probs 259 | # 100 101 102 106 140 141 260 | # [1,] 0.24 0.52 0.12 0.08 0 0.04 261 | # 262 | # so, occupation code 100 is predicted, with a 0.36 conditional 263 | # probability 264 | 265 | # if holdout evaluation is desired as well, say 1000 cases, seed 9999: 266 | > svmout <- qeSVM(pef,'occ',holdout=c(1000,9999)) 267 | > svmout$testAcc 268 | [1] 0.622 # 62% error rate (overall rate for 6 classes) 269 | 270 | # linear 271 | # lm() doesn't like numeric factor levels, so prepend an 'a' 272 | pef$occ <- prepend('a',pef$occ) 273 | lmout <- qeLin(pef,'occ') 274 | predict(lmout,pef[1,-3]) # occ 100, prob 0.3316 275 | lmout <- qeLin(pef,'wageinc') 276 | predict(lmout,pef[1,-5]) # 70857.79 277 | 278 | qeCompare(mlb,'Weight',c('qeLin','qeKNN','qeRF'),25) 279 | # qeFtn meanAcc 280 | # 1 qeLin 13.30490 281 | # 2 qeKNN 13.72708 282 | # 3 qeRF 13.46515 283 | qeCompare(mlb,'Weight',c('qeLin','qeKNN','qeRF'),25, 284 | list(qeKNN='k=5',qeRF='nTree = 100, minNodeSize = 15')) 285 | # qeFtn meanAcc 286 | # 1 qeLin 13.30490 287 | # 2 qeKNN 14.34051 288 | # 3 qeRF 13.02334 289 | 290 | 291 | } 292 | 293 | } 294 | 295 | \author{ 296 | Norm Matloff 297 | } 298 | 299 | --------------------------------------------------------------------------------