├── inst ├── SVM.png ├── WtVsHt.png ├── tests │ ├── README │ ├── KNNTests │ ├── QuickTests.R~ │ └── QuickTests.R ├── BMIfitted.png ├── BMIhist.png ├── RpartVert.png ├── BMIfitwell.png ├── VertebraeNN.png ├── vert1hidden.png ├── images │ ├── ROC45.png │ ├── PrgEngFit.png │ ├── PrgengVar.png │ ├── ParVsNonpar.png │ ├── RedSurroundsBlack.png │ └── README ├── ObjFtnPlusTangent.png ├── vn.save │ └── vignettes │ │ ├── parvsnonpar.png │ │ ├── varvsmean.png │ │ ├── wagevsage.png │ │ ├── regtools.Rmd │ │ ├── regtools.Rmd.save │ │ ├── regtools.Rmd~ │ │ ├── regtools.Rnw │ │ └── regtools.Rnw.save ├── sdss2020 │ ├── BestK.R │ ├── z │ ├── WeightedDistCensus.R │ ├── PrgEng.R │ ├── Pima.R │ ├── WeightedDistDiab.R │ ├── MahalanobisSong.R │ ├── FineTune.R~ │ ├── ExpandGraph.R │ ├── FineTune.R │ ├── LocLinStudy.R │ └── LocLinStudy.R~ ├── README.ClearingConfusion ├── RecSysLinModels.md ├── ScalingInPCA.md ├── InterpretedR.md ├── PoissonReg.md ├── DstrFit.md └── ChoosingKinKFoldCV.md ├── data ├── day.RData ├── day1.RData ├── day2.RData ├── mlb.txt.gz ├── pef.RData ├── english.RData ├── mlens.RData ├── peDumms.RData ├── phoneme.RData ├── prgeng.RData ├── quizzes.RData ├── yell10k.RData ├── SwissRoll.RData ├── newadult.RData ├── peFactors.RData ├── quizDocs.RData ├── weatherTS.RData ├── oliveoils.txt.gz ├── courseRecords.RData ├── falldetection.RData └── ltrfreqs.txt ├── vignettes ├── PrgengVar.png └── ParVsNonpar.png ├── man ├── yell10k.Rd ├── falldetection.Rd ├── phoneme.Rd ├── weatherTS.Rd ├── ltrfreqs.Rd ├── SwissRoll.Rd ├── oliveoils.Rd ├── mlb.Rd ├── english.Rd ├── mlens.Rd ├── currency.Rd ├── courseRecords.Rd ├── day.Rd ├── newadult.Rd ├── quizDocs.Rd ├── nlshc.Rd ├── Penrose.Rd ├── ridgelm.Rd ├── Graphics.Rd ├── unscale.Rd ├── mm.Rd ├── prgeng.Rd ├── textToXY.Rd ├── TS.Rd ├── regtools-package.Rd ├── misc.Rd ├── krsFit.Rd ├── ovalogtrn.Rd ├── lmac.Rd ├── FineTuning.Rd ├── factorsDummies.Rd └── Quick.Rd~ ├── R ├── z.R ├── onAttach.R ├── FormulaWrappers.R ├── Nonlin.R ├── Ridge.R ├── DimRed.R ├── Text.R ├── MM.R ├── TS.R ├── Penrose.R └── AC.R ├── .gitignore ├── DESCRIPTION └── NAMESPACE /inst/SVM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/SVM.png -------------------------------------------------------------------------------- /data/day.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day.RData -------------------------------------------------------------------------------- /data/day1.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day1.RData -------------------------------------------------------------------------------- /data/day2.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/day2.RData -------------------------------------------------------------------------------- /data/mlb.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/mlb.txt.gz -------------------------------------------------------------------------------- /data/pef.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/pef.RData 
-------------------------------------------------------------------------------- /inst/WtVsHt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/WtVsHt.png -------------------------------------------------------------------------------- /inst/tests/README: -------------------------------------------------------------------------------- 1 | 2 | The file xTests.R consists of tests for ../R/x.R. 3 | 4 | -------------------------------------------------------------------------------- /data/english.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/english.RData -------------------------------------------------------------------------------- /data/mlens.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/mlens.RData -------------------------------------------------------------------------------- /data/peDumms.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/peDumms.RData -------------------------------------------------------------------------------- /data/phoneme.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/phoneme.RData -------------------------------------------------------------------------------- /data/prgeng.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/prgeng.RData -------------------------------------------------------------------------------- /data/quizzes.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/quizzes.RData -------------------------------------------------------------------------------- /data/yell10k.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/yell10k.RData -------------------------------------------------------------------------------- /inst/BMIfitted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIfitted.png -------------------------------------------------------------------------------- /inst/BMIhist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIhist.png -------------------------------------------------------------------------------- /inst/RpartVert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/RpartVert.png -------------------------------------------------------------------------------- /data/SwissRoll.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/SwissRoll.RData -------------------------------------------------------------------------------- /data/newadult.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/matloff/regtools/HEAD/data/newadult.RData -------------------------------------------------------------------------------- /data/peFactors.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/peFactors.RData -------------------------------------------------------------------------------- /data/quizDocs.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/quizDocs.RData -------------------------------------------------------------------------------- /data/weatherTS.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/weatherTS.RData -------------------------------------------------------------------------------- /inst/BMIfitwell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/BMIfitwell.png -------------------------------------------------------------------------------- /inst/VertebraeNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/VertebraeNN.png -------------------------------------------------------------------------------- /inst/vert1hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vert1hidden.png -------------------------------------------------------------------------------- /data/oliveoils.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/oliveoils.txt.gz -------------------------------------------------------------------------------- /inst/images/ROC45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/ROC45.png -------------------------------------------------------------------------------- /vignettes/PrgengVar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/vignettes/PrgengVar.png -------------------------------------------------------------------------------- /data/courseRecords.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/courseRecords.RData -------------------------------------------------------------------------------- /data/falldetection.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/data/falldetection.RData -------------------------------------------------------------------------------- /inst/images/PrgEngFit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/PrgEngFit.png -------------------------------------------------------------------------------- /inst/images/PrgengVar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/PrgengVar.png 
-------------------------------------------------------------------------------- /vignettes/ParVsNonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/vignettes/ParVsNonpar.png -------------------------------------------------------------------------------- /inst/ObjFtnPlusTangent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/ObjFtnPlusTangent.png -------------------------------------------------------------------------------- /inst/images/ParVsNonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/ParVsNonpar.png -------------------------------------------------------------------------------- /inst/images/RedSurroundsBlack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/images/RedSurroundsBlack.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/parvsnonpar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/parvsnonpar.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/varvsmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/varvsmean.png -------------------------------------------------------------------------------- /inst/vn.save/vignettes/wagevsage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matloff/regtools/HEAD/inst/vn.save/vignettes/wagevsage.png -------------------------------------------------------------------------------- /man/yell10k.Rd: -------------------------------------------------------------------------------- 1 | \name{yell10k} 2 | \alias{yell10k} 3 | 4 | \title{ 5 | New York Taxi Data 6 | } 7 | 8 | \description{ 9 | From public data on New York City taxi trips. 
10 | } 11 | 12 | -------------------------------------------------------------------------------- /R/z.R: -------------------------------------------------------------------------------- 1 | 2 | library(mvtnorm) 3 | n <- 500 4 | cv <- rbind(c(1,0.2),c(0.2,1)) 5 | xy <- NULL 6 | for (i in 1:3) 7 | xy <- rbind(xy,rmvnorm(n,mean=rep(i*0.5,2),sigma=cv)) 8 | xy <- cbind(xy,rep(0:2,each=n)) -------------------------------------------------------------------------------- /inst/sdss2020/BestK.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | data(peDumms) 4 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 5 | x <- ped[,-10] 6 | y <- ped[,10] 7 | bkpp <- bestKperPoint(x,y,50) 8 | plot(density(bkpp)) 9 | 10 | 11 | -------------------------------------------------------------------------------- /inst/sdss2020/z: -------------------------------------------------------------------------------- 1 | 2 | tiff('LocLin.tiff',width=6,height=6,unit='in',res=800) 3 | plottingSim() 4 | dev.off() 5 | system('convert LocLin.tiff LocLinTiff.jpg') 6 | system('mv LocLin* ~/Research/SDSSknn') 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /inst/sdss2020/WeightedDistCensus.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | data(peDumms) 4 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 5 | x <- ped[,-10] 6 | y <- ped[,10] 7 | plotExpVars(x,y,x,y,25,c(1,2,8,9,10),1.5,lossFtn='MAPE',ylim=c(23000,25000)) 8 | 9 | -------------------------------------------------------------------------------- /man/falldetection.Rd: -------------------------------------------------------------------------------- 1 | \name{falldetection} 2 | \alias{falldetection} 3 | 4 | \title{ 5 | Fall Detection Data 6 | } 7 | 8 | \description{ 9 | Detecting falls in the elderly via physiological measurements. 10 | Obtained from Kaggle. 11 | } 12 | 13 | -------------------------------------------------------------------------------- /R/onAttach.R: -------------------------------------------------------------------------------- 1 | 2 | .onAttach <- function(libname, pkgname) { 3 | packageStartupMessage( 4 | '\n\n\n\n\n*********************\n\n\n\nLatest version of regtools at GitHub.com/matloff\n\n\nType ?regtools to see function list by category\n\n\n\n') 5 | } 6 | 7 | -------------------------------------------------------------------------------- /man/phoneme.Rd: -------------------------------------------------------------------------------- 1 | \name{phoneme} 2 | \alias{phoneme} 3 | 4 | \title{ 5 | Phoneme Data 6 | } 7 | 8 | \description{ 9 | Phoneme detection, 2 types. Features are from harmonic analysis of the 10 | voice. From OpenML, \url{https://www.openml.org/d/1489}.
11 | } 12 | 13 | -------------------------------------------------------------------------------- /inst/sdss2020/PrgEng.R: -------------------------------------------------------------------------------- 1 | 2 | prgengExpVars <- function() 3 | { 4 | data(peDumms) 5 | ped <- peDumms[,c(1,20,22,24:29,31,32)] 6 | x <- ped[,-10] 7 | y <- ped[,10] 8 | plotExpVars(x,y,x,y,25,c(1,2,8,9,10),1.5,lossFtn='MAPE',ylim=c(23500,25000)) 9 | } 10 | 11 | -------------------------------------------------------------------------------- /man/weatherTS.Rd: -------------------------------------------------------------------------------- 1 | \name{weatherTS} 2 | \alias{weatherTS} 3 | 4 | \title{ 5 | Weather Time Series 6 | } 7 | 8 | \description{ 9 | Various measurements on weather variables collected by NASA. Downloaded 10 | via \code{nasapower}; see that package for documentation. 11 | } 12 | 13 | -------------------------------------------------------------------------------- /man/ltrfreqs.Rd: -------------------------------------------------------------------------------- 1 | \name{ltrfreqs} 2 | \alias{ltrfreqs} 3 | 4 | \title{ 5 | Letter Frequencies 6 | } 7 | 8 | \description{ 9 | 10 | This data consists of capital letter frequencies obtained at 11 | http://www.math.cornell.edu/~mec/2003-2004/cryptography/subs/frequencies.html 12 | 13 | } 14 | 15 | -------------------------------------------------------------------------------- /man/SwissRoll.Rd: -------------------------------------------------------------------------------- 1 | \name{SwissRoll} 2 | \alias{SwissRoll} 3 | \alias{sw} 4 | 5 | \title{ 6 | Swiss Roll 7 | } 8 | 9 | \description{ 10 | See \url{http://people.cs.uchicago.edu/~dinoj/manifold/swissroll.html} 11 | for this version of Swiss Roll. 12 | 13 | Running \code{data(SwissRoll)} produces an object \code{sw}. 14 | 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /man/oliveoils.Rd: -------------------------------------------------------------------------------- 1 | \name{oliveoils} 2 | \alias{oliveoils} 3 | 4 | \title{ 5 | Italian olive oils data set. 6 | } 7 | 8 | \description{ 9 | Italian olive oils data set, as used in \emph{Graphics of Large 10 | Datasets: Visualizing a Million}, by Antony Unwin, Martin Theus and 11 | Heike Hofmann, Springer, 2006. Included here with permission of Dr. 12 | Martin Theus. 13 | } 14 | 15 | -------------------------------------------------------------------------------- /man/mlb.Rd: -------------------------------------------------------------------------------- 1 | \name{mlb} 2 | \alias{mlb} 3 | 4 | \title{ 5 | Major League Baseball player data set. 6 | } 7 | 8 | \description{ 9 | Heights, weights, ages etc. of major league baseball players. A new 10 | variable has been added, consolidating positions into Infielders, 11 | Outfielders, Catchers and Pitchers. 12 | 13 | Included here with the permission of the UCLA Statistics Department. 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/english.Rd: -------------------------------------------------------------------------------- 1 | \name{english} 2 | \alias{english} 3 | 4 | \title{ 5 | English vocabulary data 6 | } 7 | 8 | \description{ 9 | 10 | The Stanford WordBank data on vocabulary acquisition in young children. 11 | The file consists of about 5500 rows. (There are many NA values, though, 12 | and only about 2800 complete cases.)
Variables are age, birth order, 13 | sex, mother's education and vocabulary size. 14 | 15 | } 16 | 17 | -------------------------------------------------------------------------------- /inst/images/README: -------------------------------------------------------------------------------- 1 | 2 | RedSurroundsBlack.png: 3 | 4 | Predict gender from various features, then plot predicted values against 2 5 | of them, age and income. Female spots surrounded! 6 | 7 | data(peDumms) 8 | pe <- peDumms 9 | pe <- pe[,c(1,20:29,31,32)] 10 | kout <- kNN(pe[,-11],pe[,11],kmax=25,classif=T) 11 | preds <- kout$ypreds + 1 12 | plot(pe$age,pe$wageinc,col=preds,pch=16) 13 | 14 | 15 | -------------------------------------------------------------------------------- /inst/sdss2020/Pima.R: -------------------------------------------------------------------------------- 1 | 2 | pimaExpVars <- function() 3 | { 4 | library(mlbench) 5 | data(PimaIndiansDiabetes2) 6 | diab <- PimaIndiansDiabetes2 7 | db <- diab[setdiff(names(diab),c('triceps','insulin'))] 8 | db <- db[complete.cases(db),] 9 | x <- as.matrix(db[,-7]) 10 | y <- as.numeric(db[,7] == 'pos') 11 | plotExpVars(x,y,x,y,25,1:6,1.5,'probIncorrectClass',c(0.2,0.35),leave1out=TRUE) 12 | } 13 | 14 | 15 | -------------------------------------------------------------------------------- /man/mlens.Rd: -------------------------------------------------------------------------------- 1 | \name{mlens} 2 | \alias{mlens} 3 | 4 | \title{ 5 | MovieLens User Summary Data 6 | } 7 | 8 | \description{ 9 | The MovieLens dataset, \url{https://grouplens.org/}, 10 | is a standard example in the recommender systems literature. Here we 11 | give demographic data for each user, plus the mean rating and number of 12 | ratings. One may explore, for instance, the relation between ratings 13 | and age. 14 | } 15 | 16 | -------------------------------------------------------------------------------- /man/currency.Rd: -------------------------------------------------------------------------------- 1 | \name{currency} 2 | \alias{currency} 3 | 4 | \title{ 5 | Pre-Euro Era Currency Fluctuations 6 | } 7 | 8 | \description{ 9 | From Wai Mun Fong and Sam Ouliaris, "Spectral Tests of the Martingale 10 | Hypothesis for Exchange Rates", Journal of Applied Econometrics, Vol. 11 | 10, No. 3, 1995, pp. 255-271. Weekly exchange rates against US dollar, 12 | over the period 7 August 1974 to 29 March 1989. 13 | } 14 | 15 | -------------------------------------------------------------------------------- /R/FormulaWrappers.R: -------------------------------------------------------------------------------- 1 | 2 | # basic idea: some popular packages for regression, classification and 3 | # machine learning do not accommodate specifying Y and X via an R 4 | # formula, e.g.
weight ~ height+age; this file contains wrappers to 5 | # allow this 6 | 7 | # also allowed will be factor-valued X 8 | 9 | # note that the generic predict() functions must also be wrappers 10 | 11 | # the suffix 'W' will be used to indicate "wrapper" 12 | 13 | -------------------------------------------------------------------------------- /inst/sdss2020/WeightedDistDiab.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | library(mlbench) 4 | data(PimaIndiansDiabetes2) 5 | diab <- PimaIndiansDiabetes2 6 | db <- diab[setdiff(names(diab),c('triceps','insulin'))] 7 | db <- db[complete.cases(db),] 8 | head(db) 9 | x <- as.matrix(db[,-7]) 10 | y <- as.numeric(db[,7] == 'pos') 11 | set.seed(9999) 12 | tstidxs <- sample(1:nrow(x),100) 13 | xtst <- x[tstidxs,] 14 | ytst <- y[tstidxs] 15 | xtrn <- x[-tstidxs,] 16 | ytrn <- y[-tstidxs] 17 | plotExpVars(xtrn, ytrn, xtst,ytst,5,1:6,1.5,'propMisclass',c(0.5,0.9)) 18 | 19 | 20 | -------------------------------------------------------------------------------- /man/courseRecords.Rd: -------------------------------------------------------------------------------- 1 | \name{courseRecords} 2 | \alias{courseRecords} 3 | 4 | \title{ 5 | Records from several offerings of a certain course. 6 | } 7 | 8 | \description{ 9 | The data are in the form of an R list. Each element of the list 10 | corresponds to one offering of the course. Fields are: Class level; 11 | major (two different computer science majors, LCSI in Letters and 12 | Science and ECSE in engineering); quiz grade average (scale of 4.0, A+ 13 | counting as 4.3); homework grade average (same scale); and course letter 14 | grade. 15 | } 16 | 17 | 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac system file 2 | .DS_Store 3 | 4 | # History files 5 | .Rhistory 6 | .Rapp.history 7 | 8 | # Session Data files 9 | .RData 10 | 11 | # User-specific files 12 | .Ruserdata 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # knitr and R markdown default cache directories 24 | *_cache/ 25 | /cache/ 26 | 27 | # Temporary files created by R markdown 28 | *.utf8.md 29 | *.knit.md 30 | 31 | # R Environment Variables 32 | .Renviron 33 | 34 | # pkgdown site 35 | docs/ 36 | 37 | # translation temp files 38 | po/*~ 39 | -------------------------------------------------------------------------------- /data/ltrfreqs.txt: -------------------------------------------------------------------------------- 1 | 'ltr' 'percent' 2 | E 12.02 3 | T 9.10 4 | A 8.12 5 | O 7.68 6 | I 7.31 7 | N 6.95 8 | S 6.28 9 | R 6.02 10 | H 5.92 11 | D 4.32 12 | L 3.98 13 | U 2.88 14 | C 2.71 15 | M 2.61 16 | F 2.30 17 | Y 2.11 18 | W 2.09 19 | G 2.03 20 | P 1.82 21 | B 1.49 22 | V 1.11 23 | K 0.69 24 | X 0.17 25 | Q 0.11 26 | J 0.10 27 | Z 0.07 28 | -------------------------------------------------------------------------------- /inst/README.ClearingConfusion: -------------------------------------------------------------------------------- 1 | 2 | A number of the files here comprise our Clearing the Confusion series, 3 | clarifying many topics that are widely misunderstood in statistics. 4 | Currently we have: 5 | 6 | ChoosingKinKFoldCV.md: Choosing the number of folds in k-fold 7 | cross-validation.
8 | 9 | NoPVals.md: Why p-values should not be used. 10 | 11 | ScalingInPCA.md: Arguing that use of scaling prior to PCA is problematic. 12 | 13 | PoissonReg.md: Why Poisson regression should be used only in some 14 | restrictive settings. 15 | 16 | UnbalancedClasses.md: Don't artificially balance data in classification 17 | problems. 18 | 19 | -------------------------------------------------------------------------------- /inst/sdss2020/MahalanobisSong.R: -------------------------------------------------------------------------------- 1 | 2 | library(regtools) 3 | load('YearData.save') # obtain separately, data frame 'yr' 4 | yr <- yr[,seq(2,91,5)] 5 | idxs <- sample(1:nrow(yr),100000) 6 | yr1 <- yr[idxs,] 7 | idxs <- sample(1:nrow(yr1),5000) 8 | trn <- yr1[-idxs,] 9 | tst <- yr1[idxs,] 10 | xtrn <- trn[,-1] 11 | ytrn <- trn[,1] 12 | xtst <- tst[,-1] 13 | ytst <- tst[,1] 14 | knnout <- kNN(xtrn,ytrn,xtst,25) 15 | mhd <- knnout$mhdists 16 | far <- which(mhd > 150) 17 | xn <- xtst[far,] 18 | yn <- ytst[far] 19 | preds <- kNN(xtrn,ytrn,xn,25)$regests 20 | mean(abs(preds - yn)) 21 | preds <- kNN(xtrn,ytrn,xn,25,smoothingFtn=loclin)$regests 22 | mean(abs(preds - yn)) 23 | 24 | -------------------------------------------------------------------------------- /man/day.Rd: -------------------------------------------------------------------------------- 1 | \name{day,day1} 2 | \alias{day} 3 | \alias{day1} 4 | \alias{day2} 5 | 6 | \title{ 7 | Bike sharing data. 8 | } 9 | 10 | \description{ 11 | This is the Bike Sharing dataset (day records only) from the UC Irvine 12 | Machine Learning Dataset Repository. Included here with 13 | permission of Dr. Hadi Fanaee. 14 | 15 | The \code{day} data is as on UCI; \code{day1} is modified so that the 16 | numeric weather variables are on their original scale. 17 | 18 | The \code{day2} data is the same as \code{day1}, except that \code{dteday} 19 | has been removed, and \code{season}, \code{mnth}, \code{weekday} and 20 | \code{weathersit} have been converted to R factors. 21 | 22 | See \url{https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset} 23 | for details.
24 | } 25 | 26 | 27 | -------------------------------------------------------------------------------- /inst/sdss2020/FineTune.R~: -------------------------------------------------------------------------------- 1 | 2 | # apply regtools:::fineTuning() to choosing the expansion factors in 3 | # kNN() 4 | 5 | knnFineTune <- function(data,yName,k,expandVars,ws,classif=FALSE, 6 | seed=9999) 7 | { 8 | if (classif) stop('not ready for classification problems') 9 | ycol <- which(names(data) == yName) 10 | 11 | theCall <- function(dtrn,dtst,cmbi) { 12 | x <- dtrn[,-ycol] 13 | y <- dtrn[,ycol] 14 | newx <- dtst[,-ycol] 15 | newy <- dtst[,ycol] 16 | knnout <- kNN(x,y,newx,k,expandVars=expandVars,expandVals=cmbi) 17 | mean(abs(knnout$regests - newy)) 18 | } 19 | 20 | wcols <- paste0('w',1:length(expandVars),'=ws',collapse=',') 21 | wcols <- paste0('list(',wcols,')') 22 | fineTuning(dataset=data,pars=eval(parse(text=wcols)),regCall=theCall, 23 | nXval=10) 24 | 25 | } 26 | 27 | fT <- fineTuning 28 | 29 | -------------------------------------------------------------------------------- /inst/tests/KNNTests: -------------------------------------------------------------------------------- 1 | 2 | data(mlb) 3 | mlb <- mlb[,c(4,6,5)] # height, age, weight 4 | # fit, then predict 75", age 21, and 72", age 32 5 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25) 6 | knnout$regests 7 | # [1] 201.84 195.72 8 | 9 | # fit now, predict later 10 | knnout <- kNN(mlb[,1:2],mlb[,3],NULL,25) 11 | predict(knnout,c(70,28)) 12 | # [1] 185.24 13 | 14 | # set saveNhbrs to TRUE to avoid re-doing the same computation 15 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25) 16 | knnout$regests 17 | # [1] 201.84 195.72 18 | # what about k = 20?; first, the direct way 19 | knnout <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),20) 20 | knnout$regests 21 | # [1] 202.05 196.05 22 | # now the computation-reusing way 23 | knnout25 <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),25, 24 | saveNhbrs=TRUE) 25 | knnout20 <- kNN(mlb[,1:2],mlb[,3],rbind(c(75,21),c(72,32)),20, 26 | savedNhbrs=knnout25$nhbrs) 27 | knnout20$regests 28 | 29 | -------------------------------------------------------------------------------- /man/newadult.Rd: -------------------------------------------------------------------------------- 1 | \name{newadult} 2 | \alias{newadult} 3 | \alias{newAdult} 4 | 5 | \title{ 6 | UCI adult income data set, adapted 7 | } 8 | 9 | \description{ 10 | This data set is adapted from 11 | the Adult data from the UCI Machine Learning Repository, 12 | which was in turn adapted from Census data on adult incomes and other 13 | demographic variables. The UCI data is used here with permission 14 | from Ronny Kohavi. 15 | 16 | The variables are: 17 | 18 | \itemize{ 19 | \item \code{gt50}, which converts the original \code{>50K} variable 20 | to an indicator variable; 1 for income greater than $50,000, else 0 21 | 22 | \item \code{edu}, which converts a set of education levels to 23 | approximate number of years of schooling 24 | 25 | \item \code{age} 26 | 27 | \item \code{gender}, 1 for male, 0 for female 28 | 29 | \item \code{mar}, 1 for married, 0 for single 30 | } 31 | 32 | Note that the education variable is now numeric. 
33 | 34 | } 35 | 36 | -------------------------------------------------------------------------------- /R/Nonlin.R: -------------------------------------------------------------------------------- 1 | 2 | # uses output of R's nls() to get an asymptotic covariance 3 | # matrix in general heteroscedastic case 4 | 5 | # arguments: 6 | # 7 | # nlsout: object of type 'nls' 8 | # 9 | # value: approximate covariance matrix for the 10 | # estimated parameter vector 11 | 12 | nlshc <- function(nlsout,type='HC') { 13 | # notation: g(t,b) is the regression model, 14 | # where t is the vector of variables for a 15 | # given observation; b is the estimated parameter 16 | # vector; x is the matrix of predictor values 17 | b <- coef(nlsout) 18 | m <- nlsout$m 19 | # y - g: 20 | resid <- m$resid() 21 | # row i of hmat will be deriv of g(x[i,],b) 22 | # with respect to b 23 | hmat <- m$gradient() 24 | # calculate the artificial "x" and "y" of 25 | # the algorithm 26 | xhm <- hmat 27 | yresidhm <- resid + hmat %*% b 28 | # -1 means no constant term in the model 29 | lmout <- lm(yresidhm ~ xhm - 1) 30 | # vcovHC(lmout); was getting NAs for some data sets 31 | sandwich::vcovHC(lmout,type) 32 | } 33 | -------------------------------------------------------------------------------- /inst/tests/QuickTests.R~: -------------------------------------------------------------------------------- 1 | 2 | # built-in data on major league baseball players 3 | data(mlb) 4 | mlb <- mlb[,3:6] # position, height, weight, age 5 | 6 | set.seed(9999) 7 | 8 | # fit models 9 | knnout <- qeKNN(mlb,'Weight',k=25) 10 | rfout <- qeRF(mlb,'Weight') 11 | 12 | # mean abs. pred. error on holdout set, in pounds 13 | knnout$testAcc 14 | # [1] 11.75644 15 | rfout$testAcc 16 | # [1] 12.6787 17 | 18 | # predict a new case 19 | newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 20 | predict(knnout,newx) 21 | [,1] 22 | # [1,] 204.04 23 | predict(rfout,newx) 24 | 11 25 | # 199.1714 26 | 27 | # how about some other ML methods? 28 | lassout <- qeLASSO(mlb,'Weight') 29 | lassout$testAcc 30 | # [1] 14.23122 31 | # poly reg, degree 3 32 | polyout <- qePoly(mlb,'Weight',3) 33 | polyout$testAcc 34 | # [1] 12.69412 35 | nnout <- qeNeural(mlb,'Weight') 36 | # ... 37 | nnout$testAcc 38 | # [1] 12.03419 39 | # try some nondefault hyperparams 40 | nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 41 | nnout$testAcc 42 | # [1] 15.8038 43 | 44 | -------------------------------------------------------------------------------- /man/quizDocs.Rd: -------------------------------------------------------------------------------- 1 | \name{quizDocs} 2 | \alias{quizDocs} 3 | \alias{quizzes} 4 | 5 | \title{ 6 | Course quiz documents 7 | } 8 | 9 | \description{ 10 | 11 | This data is suitable for NLP analysis. It consists of all the quizzes 12 | I've given in undergraduate courses, 143 quizzes in all. 13 | 14 | It is available in two forms. First, \code{quizzes} is a data.frame, 15 | 143 rows and 2 columns. Row i consists of a single character vector 16 | comprising the entire quiz i, followed by the course name (as an R 17 | factor). The second form is an R list, 143 elements. Each list element 18 | is a character vector, one vector element per line of the quiz. 19 | 20 | The original documents were LaTeX files. They have been run through the 21 | \code{detex} utility to remove most LaTeX commands, as well as removing 22 | the LaTeX preambles separately. 
23 | 24 | The names of the list elements are the course names, as follows: 25 | 26 | ECS 50: a course in machine organization 27 | 28 | ECS 132: an undergraduate course in probabilistic modeling 29 | 30 | ECS 145: a course in scripting languages (Python, R) 31 | 32 | ECS 158: an undergraduate course in parallel computation 33 | 34 | ECS 256: a graduate course in probabilistic modeling 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /inst/sdss2020/ExpandGraph.R: -------------------------------------------------------------------------------- 1 | 2 | # explore use of the ExpandVars arg 3 | 4 | # arguments: 5 | 6 | # xtrn: vector or matrix for "X" portion of training data 7 | # ytrn: vector or matrix for "Y" portion of training data; matrix 8 | # case is for vector "Y", i.e. multiclass 9 | # xtst,ytst: test data analogs of xtrn, ytrn 10 | # k: number of nearest neighbors 11 | # eVar: column number of the predictor to be expanded 12 | # maxEVal: maximum expansion 13 | # loss: name of the loss function; internal offerings are 'MAPE' and 'propMisclass' 14 | # incr: expansion value increment 15 | 16 | # value: 17 | 18 | # smoothed loss, evaluated at expansion values from incr to maxEVal, in increments of incr 19 | 20 | exploreExpVars <- 21 | function(xtrn,ytrn,xtst,ytst,k,eVar,maxEVal,loss,incr=0.05) 22 | { 23 | dfr <- data.frame(NULL,NULL) 24 | for (w in seq(incr,maxEVal,incr)) { 25 | preds <- kNN(xtrn,ytrn,xtst,k,expandVars=eVar,expandVals=w) 26 | dfr <- rbind(dfr,c(w,get(loss)(preds$regests,ytst))) 27 | 28 | 29 | 30 | } 31 | names(dfr) <- c('w',loss) 32 | frmla <- as.formula(paste0(loss, ' ~ w')) 33 | lwout <- loess(frmla,data=dfr) 34 | lwout$fitted 35 | } 36 | 37 | 38 | # plot accuracy of applying one or more instances of the ExpandVars arg 39 | 40 | 41 | -------------------------------------------------------------------------------- /inst/sdss2020/FineTune.R: -------------------------------------------------------------------------------- 1 | 2 | # apply regtools:::fineTuning() to choosing the expansion factors in 3 | # kNN() 4 | 5 | # arguments: 6 | 7 | # data: data frame (or matrix with col names), including both "X" and "Y" 8 | # yName: name of the data column containing "Y" 9 | # expandVars: indices of the column numbers 10 | 11 | knnFineTune <- function(data,yName,k,expandVars,ws,classif=FALSE, 12 | seed=9999) 13 | { 14 | if (classif) stop('not ready for classification problems') 15 | 16 | ycol <- which(names(data) == yName) 17 | # may need to shift some of expandVars over, once "Y" is removed 18 | if (ycol < ncol(data)) { 19 | topvars <- which(expandVars > ycol) 20 | if (length(topvars) > 0) { 21 | expandVars[topvars] <- expandVars[topvars] - 1 22 | } 23 | } 24 | expandNms <- colnames(data[,-ycol])[expandVars] 25 | 26 | theCall <- function(dtrn,dtst,cmbi) { 27 | x <- dtrn[,-ycol] 28 | y <- dtrn[,ycol] 29 | newx <- dtst[,-ycol] 30 | newy <- dtst[,ycol] 31 | cmbi <- as.numeric(cmbi) 32 | knnout <- kNN(x,y,newx,k,expandVars=expandVars,expandVals=cmbi) 33 | mean(abs(knnout$regests - newy)) 34 | } 35 | 36 | # wcols <- paste0('w',1:length(expandVars),'=ws',collapse=',') 37 | # wcols <- paste0('list(',wcols,')') 38 | wcols <- paste0(expandNms,'=ws',collapse=',') 39 | wcols <- paste0('list(',wcols,')') 40 | fineTuning(dataset=data,pars=eval(parse(text=wcols)),regCall=theCall, 41 | nXval=10) 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /R/Ridge.R:
-------------------------------------------------------------------------------- 1 | 2 | # ridge regression, similar to MASS::lm.ridge() 3 | 4 | # X will be scaled and centered, using scale(); est. beta vector obtained by 5 | # solving 6 | 7 | # [(X'X)/n + lambda I] b = X'Y/n 8 | 9 | # to make choice of lambda easier, since (X'X)/n will have 1s on diag 10 | 11 | # arguments: 12 | 13 | # xy: data matrix, "Y" in last column 14 | 15 | # lambda: set of lambda values to try 16 | 17 | # value: object of class 'rlm', with components 18 | 19 | # bhats: matrix of est reg coefs, one col for each lambda value; if 20 | # mapback is TRUE, these coefs will be mapped back to the 21 | # original predictors' scale 22 | # lambda: copy of the input lambda 23 | 24 | ridgelm <- function(xy,lambda=seq(0.01,1.00,0.01),mapback=TRUE) { 25 | p <- ncol(xy) - 1; n <- nrow(xy) 26 | x <- xy[,1:p] 27 | y <- xy[,p+1] 28 | x <- scale(x); y <- y - mean(y) 29 | tx <- t(x) 30 | xpx <- tx %*% x / n 31 | xpy <- tx %*% y / n 32 | mapftn <- function(lambval) 33 | qr.solve(xpx + lambval*diag(p),xpy) 34 | tmp <- Map(mapftn,lambda) 35 | tmp <- Reduce(cbind,tmp) 36 | if (mapback) { 37 | sds <- attr(x,'scaled:scale') 38 | for (i in 1:p) tmp[i,] <- tmp[i,] / sds[i] 39 | } 40 | result <- list(bhats=tmp,lambda=lambda) 41 | class(result) <- 'rlm' 42 | result 43 | } 44 | 45 | plot.rlm <- function(x,y,...) { 46 | lamb <- x$lambda 47 | bhs <- t(x$bhats) 48 | matplot(lamb,bhs,type='l',pch='.',xlab='lambda',ylab='beta-hat') 49 | } 50 | 51 | # print.rlm <- function(x,...) print(t(x$bhats)) 52 | 53 | -------------------------------------------------------------------------------- /man/nlshc.Rd: -------------------------------------------------------------------------------- 1 | \name{nlshc} 2 | \alias{nlshc} 3 | 4 | \title{Heteroscedastic Nonlinear Regression} 5 | 6 | \description{ 7 | Extension of \code{nls} to the heteroscedastic case. 8 | } 9 | 10 | \usage{ 11 | nlshc(nlsout,type='HC') 12 | } 13 | 14 | \arguments{ 15 | \item{nlsout}{Object of type 'nls'.} 16 | \item{type}{Eickert-White algorithm to use. See documentation for 17 | \code{vcovHC} in the \pkg{sandwich} package.} 18 | } 19 | 20 | \details{ 21 | Calls \code{nls} but then forms a different estimated covariance 22 | matrix for the estimated regression coefficients, applying the 23 | Eickert-White technique to handle heteroscedasticity. This then 24 | gives valid statistical inference in that setting. 25 | 26 | Some users may prefer to use \code{nlsLM} of the package 27 | \pkg{minpack.lm} instead of \code{nls}. This is fine, as both 28 | functions return objects of class 'nls'. 29 | } 30 | 31 | \value{ 32 | Estimated covariance matrix 33 | } 34 | 35 | \examples{ 36 | # simulate data from a setting in which mean Y is 37 | # 1 / (b1 * X1 + b2 * X2) 38 | n <- 250 39 | b <- 1:2 40 | x <- matrix(rexp(2*n),ncol=2) 41 | meany <- 1 / (x \%*\% b) # reg ftn 42 | y <- meany + (runif(n) - 0.5) * meany # heterosced epsilon 43 | xy <- cbind(x,y) 44 | xy <- data.frame(xy) 45 | # see nls() docs 46 | nlout <- nls(X3 ~ 1 / (b1*X1+b2*X2), 47 | data=xy,start=list(b1 = 1,b2=1)) 48 | nlshc(nlout) 49 | } 50 | 51 | \references{ Zeileis A (2006), Object-Oriented Computation of Sandwich 52 | Estimators. \emph{Journal of Statistical Software}, \bold{16}(9), 53 | 1--16, \url{https://www.jstatsoft.org/v16/i09/}.
} 54 | 55 | \author{ 56 | Norm Matloff 57 | } 58 | 59 | -------------------------------------------------------------------------------- /man/Penrose.Rd: -------------------------------------------------------------------------------- 1 | \name{Penrose Linear} 2 | \alias{penroseLM} 3 | \alias{ridgePoly} 4 | \alias{penrosePoly} 5 | \alias{predict.penroseLM} 6 | \alias{predict.penrosePoly} 7 | 8 | \title{Penrose-Inverse Linear Models and Polynomial Regression} 9 | 10 | \description{ 11 | 12 | Provides minimum-norm solutions to linear models, identical to OLS in 13 | standard situations, but allowing exploration of overfitting in the 14 | overparameterized case. Also provides a wrapper for the polynomial 15 | case. 16 | } 17 | 18 | \usage{ 19 | penroseLM(d,yName) 20 | penrosePoly(d,yName,deg,maxInteractDeg=deg) 21 | ridgePoly(d,yName,deg,maxInteractDeg=deg) 22 | \method{predict}{penroseLM}(object,...) 23 | \method{predict}{penrosePoly}(object,...) 24 | 25 | } 26 | 27 | \arguments{ 28 | \item{...}{Arguments for the \code{predict} functions.} 29 | \item{d}{Dataframe, training set.} 30 | \item{yName}{Name of the class labels column.} 31 | \item{deg}{Polynomial degree.} 32 | \item{maxInteractDeg}{Maximum degree of interaction terms.} 33 | \item{object}{A value returned by \code{penroseLM} or 34 | \code{penrosePoly}.} 35 | } 36 | 37 | \details{ 38 | 39 | First, provides a convenient wrapper to the \pkg{polyreg} package for 40 | polynomial regression. (See \code{qePoly} here for an even higher-level 41 | wrapper.) Note that this computes true polynomials, with 42 | cross-product/interaction terms rather than just powers, and that dummy 43 | variables are handled properly (to NOT compute powers). 44 | 45 | Second, provides a tool for exploring the "double descent" phenomenon, 46 | in which prediction error may improve upon fitting past the 47 | interpolation point. 48 | 49 | } 50 | 51 | \author{ 52 | Norm Matloff 53 | } 54 | 55 | -------------------------------------------------------------------------------- /man/ridgelm.Rd: -------------------------------------------------------------------------------- 1 | \name{ridgelm,plot.rlm} 2 | \alias{ridgelm} 3 | \alias{plot.rlm} 4 | 5 | \title{Ridge Regression} 6 | 7 | \description{Similar to \code{lm.ridge} in the \code{MASS} package included 8 | with R, but with a different kind of scaling and a little nicer 9 | plotting. 10 | } 11 | 12 | \usage{ 13 | ridgelm(xy,lambda = seq(0.01,1,0.01),mapback=TRUE) 14 | \method{plot}{rlm}(x,y,...) 15 | } 16 | 17 | \arguments{ 18 | \item{xy}{Data, response variable in the last column.} 19 | \item{lambda}{Vector of desired values for the ridge parameter.} 20 | \item{mapback}{If TRUE, the scaling that had been applied to the 21 | original data will be mapped back to the original scale, so that the 22 | estimated regression coefficients are now on the scale of the original 23 | data.} 24 | \item{x}{Object of type 'rlm', output of \code{ridgelm}.} 25 | \item{y}{Needed for consistency with the generic. Not used.} 26 | \item{...}{Needed for consistency with the generic. Not used.} 27 | } 28 | 29 | \details{ 30 | 31 | Centers and scales the predictors X, and centers the response 32 | variable Y. Computes X'X and then solves [(X'X)/n + lambda I]b = 33 | X'Y/n for b. The 1/n factors are important, making the diagonal 34 | elements of (X'X)/n all 1s and thus facilitating choices for the 35 | lambdas in a manner independent of the data.
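For instance, a minimal sketch, using the \code{mtcars} data that ships with R (not part of this package):

\preformatted{
xy <- as.matrix(mtcars[,c(3,6,1)])  # disp, wt, then mpg as the response
rout <- ridgelm(xy)                 # default lambda grid 0.01, 0.02, ..., 1.00
plot(rout)                          # ridge traces, via plot.rlm as noted below
}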
36 | 37 | Calling \code{plot} on the output of \code{ridgelm} dispatches to 38 | \code{plot.rlm}, thus displaying the ridge traces. 39 | 40 | } 41 | 42 | \value{ 43 | 44 | The function \code{ridgelm} returns an object of class 'rlm', with 45 | components \code{bhats}, the estimated beta vectors, one column per 46 | lambda value, and \code{lambda}, a copy of the input. 47 | 48 | } 49 | 50 | \author{ 51 | Norm Matloff 52 | } 53 | 54 | -------------------------------------------------------------------------------- /man/Graphics.Rd: -------------------------------------------------------------------------------- 1 | \name{xyzPlot} 2 | \alias{xyzPlot} 3 | 4 | \title{Misc. Graphics} 5 | 6 | \description{ 7 | Graphics utilities. 8 | } 9 | 10 | \usage{ 11 | xyzPlot(xyz,clrs=NULL,cexText=1.0,xlim=NULL,ylim=NULL, 12 | xlab=NULL,ylab=NULL,legendPos=NULL,plotType='l') 13 | } 14 | 15 | \arguments{ 16 | \item{xyz}{A matrix or data frame of at least 3 columns, the first 17 | 3 serving as 'x', 'y' and 'z' coordinates of points to be plotted. 18 | Grouping, if any, is specified in column 4, in which case \code{xyz} 19 | must be a data frame.} 20 | \item{clrs}{Colors to be used in the grouped case.} 21 | \item{cexText}{Text size, proportional to standard.} 22 | \item{xlim}{As in \code{plot}.} 23 | \item{ylim}{As in \code{plot}.} 24 | \item{xlab}{As in \code{plot}.} 25 | \item{ylab}{As in \code{plot}.} 26 | \item{legendPos}{As in \code{legend}.} 27 | \item{plotType}{Coded 'l' for lines, 'p' for points.} 28 | } 29 | 30 | \details{ 31 | A way to display 3-dimensional data in 2 dimensions. For each plotted 32 | point (x,y), a z value is written in text over the point. A grouping 33 | variable is also allowed, with different colors used to plot different 34 | groups. 35 | 36 | A group (including the entire data in the case of one group) can be 37 | displayed either as a polygonal line, or just as a point cloud. The 38 | user should experiment with different argument settings to get the most 39 | visually impactful plot. 40 | } 41 | 42 | \examples{ 43 | 44 | \dontrun{ 45 | 46 | xyzPlot(mtcars[,c(3,6,1)],plotType='l',cexText=0.75) 47 | xyzPlot(mtcars[,c(3,6,1)],plotType='p',cexText=0.75) 48 | xyzPlot(mtcars[,c(3,6,1)],plotType='l',cexText=0.75) 49 | xyzPlot(mtcars[,c(3,6,1,2)],clrs=c('red','darkgreen','blue'),plotType='l',cexText=0.75) 50 | 51 | } 52 | 53 | } 54 | 55 | \author{ 56 | Norm Matloff 57 | } 58 | 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: regtools 2 | Version: 1.7.3 3 | Title: Regression and Classification Tools 4 | Authors@R: c(person("Norm", "Matloff", email = "matloff@cs.ucdavis.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9179-6785")), 5 | person("Robin", "Yancey", email = "reyancey@ucdavis.edu", role = c("aut")), 6 | person("Bochao", "Xin", email = "xinbochao97@outlook.com",role = c("ctb")), 7 | person("Kenneth", "Lee", email = "honlee@ucdavis.edu", role = c("ctb")), 8 | person("Rongkui", "Han", email = "rkbhan@ucdavis.edu", role = c("ctb"))) 9 | Maintainer: Norm Matloff <matloff@cs.ucdavis.edu> 10 | Depends: R (>= 3.5.0),FNN,gtools 11 | Imports: R.utils,mvtnorm,sandwich,MASS,car,data.table,glmnet,rje,text2vec, 12 | polyreg 13 | Suggests: knitr, rmarkdown, OpenImageR, cdparcoord, keras, magick, partools 14 | VignetteBuilder: knitr 15 | License: GPL (>= 2) 16 | Description: Tools for linear, nonlinear and nonparametric regression 17 | and classification.
Novel graphical methods for assessment 18 | of parametric models using nonparametric methods. One 19 | vs. All and All vs. All multiclass classification, optional 20 | class probabilities adjustment. Nonparametric regression 21 | (k-NN) for general dimension, local-linear option. Nonlinear 22 | regression with Eickert-White method for dealing with 23 | heteroscedasticity. Utilities for converting time series 24 | to rectangular form. Utilities for conversion between 25 | factors and indicator variables. Some code related to 26 | "Statistical Regression and Classification: from Linear 27 | Models to Machine Learning", N. Matloff, 2017, CRC, 28 | ISBN 9781498710916. 29 | URL: https://github.com/matloff/regtools 30 | BugReports: https://github.com/matloff/regtools/issues 31 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | 2 | import(FNN,mvtnorm) 3 | 4 | import("graphics") 5 | import("stats") 6 | import("utils") 7 | import("grDevices") 8 | import('MASS') 9 | import('glmnet') 10 | import('rje') 11 | import('text2vec') 12 | importFrom("data.table",'setDT','setkey') 13 | importFrom('car','vif') 14 | importFrom("sandwich","vcovHC") 15 | importFrom('gtools','defmacro') 16 | 17 | S3method(coef,lmac) 18 | S3method(vcov,lmac) 19 | S3method(plot,rlm) 20 | S3method(predict,knn) 21 | S3method(predict,kNN) 22 | S3method(predict,kNNallK) 23 | S3method(plot,kmin) 24 | S3method(predict,ovaknn) 25 | S3method(plot,tuner) 26 | S3method(predict,krsFit) 27 | S3method(predict,penroseLM) 28 | S3method(predict,penrosePoly) 29 | S3method(predict,ridgePoly) 30 | 31 | export(knnest,preprocessx,meany,mediany,vary,loclin, loclogit, predict.knn,nlshc, 32 | kmin,lmac,coef.lmac,vcov.lmac,pcac,loglinac,tbltofakedf, 33 | makeNA,ZerosToNAs,NAsTo0s, 34 | ovalogtrn,avalogtrn,avalogpred, 35 | doPCA,ridgePoly,replicMeans,PCAwithFactors, 36 | knntrn,ovaknntrn,predict.ovaknn,classadjust, 37 | knnFineTune,kNN,findOverallLoss,MAPE,probIncorrectClass,kNNxv,propMisclass, 38 | plotExpVars,exploreExpVars,multCols,kNNallK, 39 | bestKperPoint, parvsnonparplot,nonparvsxplot,nonparvarplot, 40 | ridgelm,plot.rlm, boundaryplot, 41 | l2,l1, 42 | mm,unscale,mmscale,constCols,catDFRow, 43 | confusion,factorTo012etc,allNumeric, 44 | penroseLM,predict.penroseLM,penrosePoly,predict.penrosePoly,ridgePoly, 45 | ulist,getNamedArgs,discretize, 46 | toAllNumeric,stopBrowser, 47 | factorsToDummies,factorToDummies,dummiesToFactor,hasFactors,dummiesToInt, 48 | xyDataframeToMatrix,dummiesToInt,charsToFactors,hasCharacters, 49 | stdErrPred,getDFclasses, 50 | toSuperFactor,toSubFactor, 51 | prToFile,pythonBlankSplit,xyzPlot, 52 | fineTuning,fineTuningPar,partTrnTst,fineTuningMult, 53 | krsFit,krsFitImg,diagNeural, 54 | TStoX,TStoXmv,textToXY,textToXYpred) 55 | -------------------------------------------------------------------------------- /man/unscale.Rd: -------------------------------------------------------------------------------- 1 | \name{unscale} 2 | \alias{unscale} 3 | \alias{mmscale} 4 | \alias{catDFRow} 5 | \alias{constCols} 6 | \alias{allNumeric} 7 | 8 | \title{Miscellaneous Utilities} 9 | 10 | \description{ 11 | Utilities. 
12 | } 13 | 14 | \usage{ 15 | unscale(scaledx,ctrs=NULL,sds=NULL) 16 | mmscale(m,scalePars=NULL,p=NULL) 17 | catDFRow(dfRow) 18 | constCols(d) 19 | allNumeric(lst) 20 | } 21 | 22 | \arguments{ 23 | \item{scaledx}{A matrix.} 24 | \item{m}{A matrix.} 25 | \item{ctrs}{Take the original means to be \code{ctrs}} 26 | \item{lst}{An R list.} 27 | \item{sds}{Take the original standard deviations to be \code{sds}} 28 | \item{dfRow}{A row in a data frame.} 29 | \item{d}{A data frame or matrix.} 30 | \item{scalePars}{If not NULL, a 2-row matrix, with column \code{i} storing 31 | the min and max values to be used in scaling column \code{i} of \code{m}. 32 | Typically, one has previously called \code{mmscale} on a dataset and 33 | saved the resulting scale parameters, and we wish to use those 34 | same scale parameters on new data.} 35 | \item{p}{If \code{m} is a vector, this specifies the 36 | number of columns it should have as a matrix. The code will try to take 37 | care of this by itself if \code{p} is left at NULL.} 38 | } 39 | 40 | \value{ 41 | The function \code{unscale} returns the original object to which 42 | \code{scale} had been applied. Or, the attributes \code{ctrs} and 43 | \code{sds} can be specified by the user. 44 | } 45 | 46 | \details{ 47 | 48 | The function \code{mmscale} is meant as a better-behaved alternative to 49 | \code{scale}. Using minimum and maximum values, it maps variables to 50 | [0,1], thus avoiding the problems arising from very small standard 51 | deviations in \code{scale}. 52 | 53 | The function \code{catDFRow} nicely prints a row of a data frame. 54 | 55 | The function \code{constCols} determines which columns of a data frame 56 | or matrix are constant, if any. 57 | } 58 | 59 | \author{ 60 | Norm Matloff 61 | } 62 | 63 | -------------------------------------------------------------------------------- /R/DimRed.R: -------------------------------------------------------------------------------- 1 | 2 | # under construction 3 | 4 | ## # uniform wrapper for various dimension reduction methods, including 5 | ## # predict() functions 6 | ## 7 | ## # no centering/scaling is done; user may do separately 8 | ## 9 | ## # example 10 | ## 11 | ## # tg <- ToothGrowth 12 | ## # tg$supp <- as.numeric(tg$supp) 13 | ## # tg <- as.matrix(tg) 14 | ## # tgsvd <- dimRed(tg,method='svd',2) # 2 PCs out of a possible 3 15 | ## # newx <- c(8.8,1,0.5) 16 | ## # dimRedNewX(tgsvd,newx) # -8.860902 0.4095568, new coordinates 17 | ## # tg1 <- reduceComps(tgsvd,1) # go down to just 1 PC 18 | ## # dimRedNewX(tg1,newx) # -8.860902 19 | ## 20 | ## dimRed <- function(dat,method='prcomp',nComps) 21 | ## { 22 | ## compSizes <- NULL # eigenvalues etc. 
23 | ## if (method == 'prcomp') { 24 | ## tmp <- prcomp(dat,center=FALSE,scale.=FALSE) 25 | ## tmp$method <- 'prcomp' 26 | ## tmp$rotation <- tmp$rotation[,1:nComps] 27 | ## } else if (method == 'svd') { 28 | ## tmp <- svd(dat,nu=nComps,nv=nComps) 29 | ## tmp$method <- 'svd' 30 | ## tmp$rotation <- tmp$v # equiv to PCA $rotation 31 | ## } else if (method == 'irlba') { 32 | ## require(irlba) 33 | ## tmp <- irlba(dat,nComps) 34 | ## tmp$method <- 'irlba' 35 | ## tmp$rotation <- tmp$v 36 | ## } else stop('no such method') 37 | ## tmp$compSizes <- compSizes 38 | ## class(tmp) <- c('dimRed',class(tmp)) 39 | ## tmp 40 | ## } 41 | ## 42 | ## # apply the same transformation to new X data 43 | ## dimRedNewX <- function(object,newxs) 44 | ## { 45 | ## method <- object$method 46 | ## if (method == 'prcomp' || method == 'svd' || method == 'irlba') { 47 | ## if (!is.matrix(newxs)) { 48 | ## newxs <- as.matrix(newxs) 49 | ## if (ncol(newxs) == 1) newxs <- t(newxs) 50 | ## } 51 | ## newxs %*% object$rotation 52 | ## } 53 | ## } 54 | ## 55 | ## # ask for further reduction in the number of components 56 | ## reduceComps <- function(object,nNewComps) 57 | ## { 58 | ## method <- object$method 59 | ## if (method == 'prcomp' || method == 'svd' || method == 'irlba') { 60 | ## object$rotation <- object$rotation[,1:nNewComps] 61 | ## } 62 | ## object 63 | ## } 64 | ## 65 | -------------------------------------------------------------------------------- /man/mm.Rd: -------------------------------------------------------------------------------- 1 | \name{mm} 2 | \alias{mm} 3 | 4 | \title{Method of Moments, Including Possible Regression Terms} 5 | 6 | \description{ 7 | 8 | Method of Moments computation for almost any statistical problem that 9 | has derivatives with respect to theta. Capable of handling models that 10 | include parametric regression terms, but not need be a regression 11 | problem. (This is not \emph{Generalized} Method of Moments; see the 12 | package \pkg{gmm} for the latter.) 13 | 14 | } 15 | 16 | \usage{ 17 | mm(m,g,x,init=rep(0.5,length(m)),eps=0.0001,maxiters=1000) 18 | } 19 | 20 | \arguments{ 21 | \item{m}{Vector of sample moments, "left-hand sides" of moment 22 | equations.} 23 | \item{g}{Function of parameter estimates, forming the "right-hand 24 | sides." This is a multivariate-valued function, of dimensionality 25 | equal to that of \code{m}}. 26 | \item{init}{Vector of initial guesses for parameter estimates. If 27 | components are named, these will be used as labels in the output.} 28 | \item{eps}{Convergence criterion.} 29 | \item{maxiters}{Maximum number of iterations.} 30 | \item{x}{Input data.} 31 | } 32 | 33 | \details{ 34 | 35 | Standard Newton-Raphson methods are used to solve for the parameter 36 | estimates, with \code{numericDeriv} being used to find the 37 | approximate derivatives. 38 | } 39 | 40 | \value{ 41 | 42 | R list consisting of components \code{tht}, the vector of parameter 43 | estimates, and \code{numiters}, the number of iterations performed. 44 | 45 | } 46 | 47 | \examples{ 48 | x <- rgamma(1000,2) 49 | m <- c(mean(x),var(x)) 50 | g <- function(x,theta) { # from theoretical properties of gamma distr. 
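# for a gamma distribution with shape theta[1] and rate theta[2],
# the mean is shape/rate and the variance is shape/rate^2: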
51 | g1 <- theta[1] / theta[2] 52 | g2 <- theta[1] / theta[2]^2 53 | c(g1,g2) 54 | } 55 | # should output about 2 and 1 56 | mm(m,g,x) 57 | 58 | \dontrun{ 59 | library(mfp) 60 | data(bodyfat) 61 | # model as a beta distribution 62 | g <- function(x,theta) { 63 | t1 <- theta[1] 64 | t2 <- theta[2] 65 | t12 <- t1 + t2 66 | meanb <- t1 / t12 67 | m1 <- meanb 68 | m2 <- t1*t2 / (t12^2 * (t12+1)) 69 | c(m1,m2) 70 | } 71 | x <- bodyfat$brozek/100 72 | m <- c(mean(x),var(x)) 73 | # about 4.65 and 19.89 74 | mm(m,g,x) 75 | } 76 | 77 | } 78 | 79 | \author{ 80 | Norm Matloff 81 | } 82 | 83 | -------------------------------------------------------------------------------- /R/Text.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ############################ textToXY() ############################## 5 | 6 | # preparation for text classification; inputs text, label data; outputs 7 | # X matrix, Y vector 8 | 9 | # arguments: 10 | 11 | # docs: character vector, one element per document 12 | # labels: R factor, class labels corresponding to docs 13 | # stopWords: character vector of stop words; suggest 14 | # stopWords <- tm::stopwords('english') 15 | # kTop: number of most-frequent words to retain 16 | 17 | textToXY <- function(docs,labels,kTop=50,stopWords='a') 18 | { 19 | # prep 20 | if (is.null(labels)) labels <- rep(NA,length(docs)) 21 | id <- 1:length(docs) 22 | x <- data.frame(docs,labels,id=id) 23 | if (!is.character(x$docs)) x$docs <- as.character(x$docs) 24 | data.table::setDT(x) # make data frame a by-reference data.table 25 | data.table::setkey(x,id) # sort the table by id 26 | 27 | # compute vocab, doc term mat 28 | prep_fun <- tolower # change letters to lower-case 29 | tok_fun <- text2vec::word_tokenizer # break text into words 30 | itx <- text2vec::itoken(x$docs, 31 | preprocessor = prep_fun, 32 | tokenizer = tok_fun, 33 | ids = x$id, 34 | progressbar = FALSE) 35 | vocab <- text2vec::create_vocabulary(itx) 36 | vectorizer <- text2vec::vocab_vectorizer(vocab) 37 | dtm <- text2vec::create_dtm(itx, vectorizer) # document-term matrix, one row per doc 38 | 39 | # remove stop words 40 | vocab <- text2vec::create_vocabulary(itx, stopwords = stopWords) 41 | prunedVocab <- text2vec::prune_vocabulary(vocab) 42 | vectorizer <- text2vec::vocab_vectorizer(prunedVocab) 43 | dtm <- text2vec::create_dtm(itx, vectorizer) # new doc-term matrix 44 | 45 | nw <- ncol(dtm) 46 | if (kTop > 0) dtm <- dtm[,(nw-kTop+1):nw] 47 | dtm <- as.matrix(dtm) 48 | list(x=dtm,y=labels,stopWords=stopWords) 49 | } 50 | 51 | textToXYpred <- function(ttXYout,predDocs) 52 | { 53 | 54 | predX <- textToXY(predDocs,NULL,kTop=0,stopWords=ttXYout$stopWords)$x 55 | namesTrain <- colnames(ttXYout$x) 56 | namesTest <- colnames(predX) 57 | x <- matrix(0,nrow=length(predDocs),ncol=length(namesTrain)) 58 | colnames(x) <- namesTrain 59 | for (word in namesTrain) 60 | if (word %in% namesTest) x[,word] <- predX[,word] 61 | x 62 | } 63 | 64 | -------------------------------------------------------------------------------- /man/prgeng.Rd: -------------------------------------------------------------------------------- 1 | \name{prgeng} 2 | \alias{prgeng} 3 | \alias{peDumms} 4 | \alias{peFactors} 5 | \alias{pef} 6 | 7 | \title{ 8 | Silicon Valley programmers and engineers data 9 | } 10 | \usage{ 11 | data(prgeng) 12 | data(peDumms) 13 | data(peFactors) 14 | } 15 | 16 | \description{ 17 | This data set is adapted from the 2000 Census (5\% sample, person 18 | records). 
It is mainly restricted to programmers and engineers in the 19 | Silicon Valley area. (Apparently due to errors, there are some from 20 | other ZIP codes.) 21 | 22 | There are three versions: 23 | 24 | \itemize{ 25 | 26 | \item{\code{prgeng}, the original data, with categorical variables, 27 | e.g. Occupation, in their original codes} 28 | 29 | \item{\code{peDumms}, same but with categorical variables 30 | converted to dummies; due to the large number of levels the birth 31 | and PUMA data is not included} 32 | 33 | \item{\code{peFactors}, same but with categorical variables 34 | converted to factors} 35 | 36 | \item{\code{pef}, same as \code{peFactors}, but having only columns 37 | for age, education, occupation, gender, wage income and weeks 38 | worked. The education column has been collapsed to Master's degree, 39 | PhD and other.} 40 | 41 | } 42 | 43 | The variable codes, e.g. occupational codes, are available from 44 | \url{https://usa.ipums.org/usa/volii/occ2000.shtml}. 45 | (Short code lists are given in the record layout, but longer ones are in 46 | the appendix Code Lists.) 47 | 48 | The variables are: 49 | 50 | \itemize{ 51 | 52 | \item{\code{age}, with a U(0,1) variate added for jitter} 53 | 54 | \item{\code{cit}, citizenship; 1-4 code various categories of 55 | citizens; 5 means noncitizen (including permanent residents)} 56 | 57 | \item{\code{educ}: 01-09 code no college; 10-12 means some college; 58 | 13 is a bachelor's degree, 14 a master's, 15 a professional degree and 59 | 16 is a doctorate} 60 | 61 | \item{\code{occ}, occupation} 62 | 63 | \item{\code{birth}, place of birth} 64 | 65 | \item{\code{wageinc}, wage income} 66 | 67 | \item{\code{wkswrkd}, number of weeks worked} 68 | 69 | \item{\code{yrentry}, year of entry to the U.S. 
(0 for natives)} 70 | 71 | \item{\code{powpuma}, location of work} 72 | 73 | \item{\code{gender}, 1 for male, 2 for female} 74 | 75 | } 76 | 77 | } 78 | 79 | 80 | -------------------------------------------------------------------------------- /R/MM.R: -------------------------------------------------------------------------------- 1 | 2 | # Method of Moments, including regression terms 3 | 4 | # overview: 5 | 6 | # motivated by linear mixed effects models, but more general 7 | # 8 | # say the parameter vector theta has length k, so we need k 9 | # equations; their left-hand sides are specified by the argument g, 10 | # while the right-hand sides are given by mm; the user integrates 11 | # regression terms into these two function arguments 12 | 13 | # arguments: 14 | # 15 | # m: a vector of sample moments ("left-hand sides" of MM eqns); 16 | # x is the data, one observation per row; might be more 17 | # general than moments 18 | # g(x,theta): 19 | # a vector-valued function, specifying the "right-hand sides" 20 | # of the MM eqns; x as above, and theta is the vector of 21 | # parameters to be estimated; it is required that the second 22 | # argument of g() be named 'theta' 23 | # x: our x in m and g() 24 | # init: initial guess for theta; R list with names corresponding 25 | # to the parameters in g 26 | # eps: convergence criterion; iterations stop at 1000, or whe 27 | # sum(abs(g)) < eps 28 | # maxiters: max number of iterations 29 | 30 | mm <- function(m,g,x,init=rep(0.5,length(m)),eps=0.0001,maxiters=1000) { 31 | tht <- init 32 | # mvec <- m(data) 33 | mvec <- m 34 | for (i in 1:maxiters) { 35 | # g values for current iteration 36 | # gvec <- getgvec(g,tht) 37 | # browser() 38 | gvec <- g(x,tht) 39 | if (max(abs(mvec - gvec)) < eps) { 40 | if (!is.null(names(init))) 41 | names(tht) <- names(init) 42 | result <- list(tht=tht,numiters=i) 43 | return(tht) 44 | } 45 | # not done, so get new Jacobian and update tht 46 | jcb <- getjcb(g,x,tht) 47 | tht <- tht + solve(jcb,mvec-gvec) 48 | } 49 | print('max iterations exceeded') 50 | } 51 | 52 | # getgvec <- function(g,tht) { 53 | # theta <- tht 54 | # g(theta) 55 | # } 56 | 57 | getjcb <- function(g,x,tht) { 58 | theta <- tht 59 | attr(numericDeriv(quote(g(x,theta)),'theta'),'gradient') 60 | } 61 | 62 | # test case; should output about 2 and 1 63 | # x <- rgamma(1000,2) 64 | # m <- c(mean(x),var(x) 65 | # g <- function(theta) { 66 | # g1 <- theta[1] / theta[2] 67 | # g2 <- theta[1] / theta[2]^2 68 | # c(g1,g2) 69 | # } 70 | # mm(m,g,x) 71 | 72 | -------------------------------------------------------------------------------- /man/textToXY.Rd: -------------------------------------------------------------------------------- 1 | 2 | \name{textToXY,textToXYpred} 3 | \alias{textToXY} 4 | \alias{textToXYpred} 5 | 6 | \title{Tools for Text Classification} 7 | 8 | \description{ 9 | "R-style," classification-oriented wrappers for the \pkg{text2vec} package. 10 | } 11 | 12 | \usage{ 13 | textToXY(docs,labels,kTop=50,stopWords='a') 14 | textToXYpred(ttXYout,predDocs) 15 | } 16 | 17 | \arguments{ 18 | \item{docs}{Character vector, one element per document.} 19 | \item{predDocs}{Character vector, one element per document.} 20 | \item{labels}{Class labels, as numeric, character or factor. NULL is 21 | used at the prediction stage.} 22 | \item{kTop}{The number of most-frequent words to retain; 0 means 23 | retain all.} 24 | \item{stopWords}{Character vector of common words, e.g. prepositions 25 | to delete. 
Recommended is \code{tm::stopwords('english')}.} 26 | \item{ttXYout}{Output object from \code{textToXY}.} 27 | } 28 | 29 | \details{ 30 | 31 | A typical classification/machine learning package will have as arguments 32 | a feature matrix X and a labels vector/factor Y. For a "bag of 33 | words" analysis in the text case, each row of X would be a document 34 | and each column a word. 35 | 36 | The functions here are basically wrappers for generating X. Wrappers 37 | are convenient in that: 38 | 39 | \itemize{ 40 | \item The \pkg{text2vec} package is rather arcane, so a "R-style" 41 | wrapper would be useful. 42 | \item The \pkg{text2vec} are not directly set up to do 43 | classification, so the functions here provide the "glue" to do 44 | that. 45 | } 46 | 47 | The typical usage pattern is thus: 48 | 49 | \itemize{ 50 | \item Run the documents vector and labels vector/factor through 51 | \code{textToXY}, generating X and Y. 52 | \item Apply your favorite classification/machine learning package 53 | p to X and Y, returning o. 54 | \item When predicting a new document d, run o and d through 55 | \code{textToXY}, producing x. 56 | \item Run x on p's \code{predict} function. 57 | } 58 | } 59 | 60 | \value{ 61 | 62 | The function \code{textToXY} returns an R list with components 63 | \code{x} and \code{y} for X and Y, and a copy of the input 64 | \code{stopWords}. 65 | 66 | The function \code{textToXY} returns X. 67 | 68 | } 69 | 70 | \author{ 71 | Norm Matloff 72 | } 73 | 74 | -------------------------------------------------------------------------------- /inst/RecSysLinModels.md: -------------------------------------------------------------------------------- 1 | # Linear Models in Recommender Systems 2 | 3 | **N. Matloff, UC Davis** 4 | 5 | ## Overview 6 | 7 | In the collaborative filtering approach to recommender systems modeling, 8 | a very simple but common model for the rating user i gives to item j is 9 | 10 | Yij = μ + ui + vj + 11 | εij 12 | 13 | where 14 | 15 | - μ is the overall mean rating over all users and items 16 | 17 | - ui is the propensity of user i to rate items liberally or 18 | harshly 19 | 20 | - vj is the propensity of item j to be rated liberally or 21 | harshly 22 | 23 | - εij is an error term, incorporating all other 24 | factors 25 | 26 | - taken as random variables as i and j vary through all users and 27 | items, ui, vj, and εij 28 | are independent with mean 0 29 | 30 | The form of the above model suggests using linear model software, e.g. 31 | 32 | ``` r 33 | library(dslabs) 34 | data(movielens) 35 | ml <- movielens 36 | ml <- ml[,c(5,1,6)] 37 | ml$userId <- as.factor(ml$userId) 38 | ml$movieId <- as.factor(ml$movieId) 39 | lm(rating ~ .,data=ml) 40 | ``` 41 | 42 | At first glance, this seems like a questionable idea. In this version 43 | of the MovieLens data, there are 671 users and 9066 movies, thus nearly 44 | 10,000 dummy variables generated by **lm()**. With only 100,000 data 45 | points (and which are not independent), we run a real risk of 46 | overfitting. Worse, the code is quite long-running (over 2 hours in the 47 | run I tried on an ordinary PC). 48 | 49 | But it turns out there is a simple, fast, closed-form solution, both for 50 | this model and for some more advanced versions featuring interaction 51 | terms. 52 | 53 | ## Analysis: Noniteractive model 54 | 55 | Estimating μ is easy. From its definition, we take our estimate to 56 | be 57 | 58 | Y.. = 59 | Σi 60 | Σj 61 | Yij / n 62 | 63 | where is the total number of data points. 
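In code, a minimal sketch continuing the **dslabs** snippet above (the variable name is just illustrative):

``` r
muhat <- mean(ml$rating)  # estimate of mu, the overall mean rating
```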
64 | 65 | Write the above model in population form. 66 | 67 | Y = μ + U + I + e 68 | 69 | Now consider user i, taking expectation conditioned on U = i: 70 | 71 | E(Y | U = i) = μ + ui 72 | 73 | The natural estimate of the LHS is 74 | 75 | Y.. = Σi Ni 76 | 77 | where Ni is the number of items rated by user i. 78 | 79 | Our estimate for ui is then 80 | 81 | Yi. - Y.. 82 | 83 | A similar derivation yields our estimate for vj, 84 | 85 | Y.j - Y.. 86 | 87 | (under construction) 88 | -------------------------------------------------------------------------------- /man/TS.Rd: -------------------------------------------------------------------------------- 1 | \name{TStoX} 2 | \alias{TStoX} 3 | \alias{TStoXmv} 4 | 5 | \title{Transform Time Series to Rectangular Form} 6 | 7 | \description{ 8 | Input a time series and transform it to a form suitable for prediction 9 | using \code{lm} etc. 10 | } 11 | 12 | \usage{ 13 | TStoX(x,lg) 14 | TStoXmv(xmat,lg,y) 15 | } 16 | 17 | \arguments{ 18 | \item{x}{A vector.} 19 | \item{lg}{Lag, a positive integer.} 20 | \item{xmat}{A matrix, data frame etc., a multivariate time series. 21 | Each column is a time series, over a common time period.} 22 | \item{y}{A time series, again on that common time period. If NULL in 23 | \code{TStoXmv}, then \code{y} is set to \code{x} (i.e. for a 24 | univariate time series in which older values predict newer ones).} 25 | } 26 | 27 | \details{ 28 | 29 | Similar to \code{stats::embed}, but in lagged form, with applications 30 | such as \code{lm} in mind. 31 | 32 | \code{TStoX} is for transforming vectors, while \code{TStoXmv} 33 | handles the multivariate time series case. Intended for use with 34 | \code{lm} or other regression/machine learning model, predicting 35 | \code{y[i]} from observations \code{i-lg, i-lg+1,...,i-1}. 36 | } 37 | 38 | \value{ 39 | 40 | 41 | As noted, the idea is to set up something like \code{lm(Y ~ X)}. 42 | Let \code{m} denote length of \code{x}, and in the matrix input 43 | case, the number of rows in \code{xmat}. Let \code{p} be 1 in the 44 | vector case, \code{ncol(xmat)} in the matrix case. The return value 45 | is a matrix with \code{m-lg} rows. There will be \code{p*lg+1} 46 | columns, with "Y," the numbers to be predicted in the last column. 47 | 48 | In the output in the multivariate case, let k denote 49 | \code{ncol(xmat)}. 
Then the first k columns of the output will be 50 | the k series at lag \code{lg}, the second k columns will be the k 51 | series at lag \code{lg-1}, ..., and the \code{lg}-th set of k 52 | columns will be the k series at lag 1, 53 | 54 | } 55 | 56 | \examples{ 57 | 58 | x1 <- c(5,12,13,8,88,6) 59 | x2 <- c(5,4,3,18,168,0) 60 | y <- 1:6 61 | xmat <- cbind(x1,x2) 62 | 63 | TStoX(x1,2) 64 | # [,1] [,2] [,3] 65 | # [1,] 5 12 13 66 | # [2,] 12 13 8 67 | # [3,] 13 8 88 68 | # [4,] 8 88 6 69 | 70 | xy <- TStoXmv(xmat,2,y) 71 | xy 72 | # [,1] [,2] [,3] [,4] [,5] 73 | # [1,] 5 5 12 4 3 74 | # [2,] 12 4 13 3 4 75 | # [3,] 13 3 8 18 5 76 | # [4,] 8 18 88 168 6 77 | 78 | lm(xy[,5] ~ xy[,-5]) 79 | # Coefficients: 80 | # (Intercept) xy[, -5]1 xy[, -5]2 xy[, -5]3 xy[, -5]4 81 | # -65.6 3.2 18.2 -3.2 NA 82 | # need n > 7 here for useful lm() call, but this illustrates the idea 83 | } 84 | 85 | \author{ 86 | Norm Matloff 87 | } 88 | 89 | -------------------------------------------------------------------------------- /man/regtools-package.Rd: -------------------------------------------------------------------------------- 1 | 2 | \name{regtools-package} 3 | \alias{regtools-package} 4 | \alias{regtools} 5 | \docType{package} 6 | 7 | \title{Overview and Package Reference Guide} 8 | \description{ 9 | 10 | This package provides a broad collection of functions useful for 11 | regression and classification analysis, and machine learning. 12 | 13 | } 14 | 15 | \section{Function List}{ 16 | 17 | \bold{Parametric modeling:} 18 | 19 | \itemize{ 20 | 21 | \item nonlinear regression: nlshc 22 | 23 | \item ridge regression: ridgelm, plot 24 | 25 | \item missing values (also see our \pkg{toweranNA} package): 26 | lmac,makeNA,coef.lmac,vcov.lmac,pcac 27 | 28 | } 29 | 30 | \bold{Diagnostic plots:} 31 | 32 | \itemize{ 33 | 34 | \item regression diagnostics: parvsnonparplot, nonparvsxplot, 35 | nonparvarplot 36 | 37 | \item other: boundaryplot, nonparvsxplot 38 | 39 | } 40 | 41 | \bold{Classification:} 42 | 43 | \itemize{ 44 | 45 | \item unbalanced data: classadjust (see \bold{UnbalancedClasses.md}) 46 | 47 | \item All vs. 
All: avalogtrn, avalogpred 48 | 49 | \item k-NN reweighting: exploreExpVars, plotExpVars, knnFineTune 50 | 51 | } 52 | 53 | \bold{Machine learning (also see qeML package):} 54 | 55 | \itemize{ 56 | 57 | \item k-NN: kNN, kmin, knnest, knntrn, preprocessx, meany, vary, loclin, 58 | predict, kmin, pwplot, bestKperPoint, knnFineTune 59 | 60 | \item neural networks: krsFit,multCol 61 | 62 | \item advanced grid search: fineTuning, fineTuningPar, plot.tuner, 63 | knnFineTune 64 | 65 | \item loss: l1, l2, MAPE, ROC 66 | 67 | } 68 | 69 | 70 | \bold{Dummies and R factors Utilities:} 71 | 72 | \itemize{ 73 | 74 | \item conversion between factors and dummies: dummiesToFactor, 75 | dummiesToInt, factorsToDummies, factorToDummies, factorTo012etc, 76 | dummiesToInt, hasFactors, charsToFactors, makeAllNumeric 77 | 78 | \item dealing with superset and subsets of factors: toSuperFactor, 79 | toSubFactor 80 | 81 | } 82 | 83 | \bold{Statistics:} 84 | 85 | \itemize{ 86 | 87 | \item mm 88 | 89 | } 90 | 91 | \bold{Matrix:} 92 | 93 | \itemize{ 94 | 95 | \item multCols, constCols 96 | 97 | } 98 | 99 | \bold{Time series:} 100 | 101 | \itemize{ 102 | 103 | \item convert rectangular to TS: TStoX 104 | 105 | } 106 | 107 | \bold{Text processing:} 108 | 109 | \itemize{ 110 | 111 | \item textToXY 112 | 113 | } 114 | 115 | \bold{Misc.:} 116 | 117 | \itemize{ 118 | 119 | \item scaling: mmscale, unscale 120 | 121 | \item data frames: catDFRow, tabletofakedf 122 | 123 | \item R: getNamedArgs, ulist 124 | 125 | \item discretize 126 | 127 | } 128 | 129 | 130 | } 131 | 132 | -------------------------------------------------------------------------------- /R/TS.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # routines to convert time series to rectangular data, so that we can 4 | # then fit using lm() or whatever, predicting from the last lg 5 | # observations 6 | 7 | # the first function, TStoX(x,lg,y), inputs a univariate time series and 8 | # outputs an "X" matrix in the sense of lm(Y ~ X); here the "Y" vector 9 | # is either supplied as an argument, or by default is x 10 | 11 | # consider for instance x = (5,12,13,8,88,6) and lg = 2, with y = x; we 12 | # want to redict x from itself, i.e. 13 | 14 | # predict the 13 from 5, 12 15 | # predict the 8 from 12, 13 16 | # predict the 88 from 13, 8 17 | 18 | # and 19 | 20 | # predict the 6 from 8, 88 21 | 22 | # our training set computed by TStoX() would then be 23 | # 24 | # X = 25 | # 26 | # 5 12 27 | # 12 13 28 | # 13 8 29 | # 8 88 30 | # 31 | # Y = (13,8,88,6) 32 | 33 | ########################## TStoX() ##################################### 34 | 35 | # inputs a time series, and transforms to rectangular shape suitable for 36 | # lm() or some other regression model, in which any current observation 37 | # is predicted from the last lg ones 38 | 39 | # arguments: 40 | # 41 | # x: a univariate time series; m is set to length(x) below 42 | # lg: lag, for fitting of a model in which observations at 43 | # time t will be predicted from observations at times 44 | # t-lg, t-lg+1,...,t-1 45 | 46 | # value: 47 | # 48 | # matrix, suitable for fitting a prediction model; m-lg rows, 49 | # lg+1 columns; x[lg+1], x[lg+2], ..., x[m] will be in the last column 50 | 51 | # the "X portion" will be 52 | # 53 | # x[1], x[2], ..., x[lg] 54 | # x[2], x[3], ..., x[lg+1] 55 | # ... 
56 | # x[m-lg], x[m-lg+1], ..., x[m-1] 57 | 58 | TStoX <- function(x,lg) 59 | { 60 | # row k of the output 61 | onerow <- function(k) { 62 | s <- k 63 | e <- k + lg 64 | x[s:e] 65 | } 66 | lx <- length(x) 67 | outrows <- lapply(1:(lx-lg),onerow) 68 | do.call(rbind,outrows) 69 | 70 | } 71 | 72 | # k-variate time series version of TStoX (but y is not optional) 73 | 74 | # arguments: 75 | 76 | # each col of xmat is a time series, y is a vector (separate from x) 77 | 78 | # value: 79 | 80 | # the first k cols will be the k series at lag lg, 81 | # the second k cols will be the k series at lag lg-1, 82 | # ... 83 | # the lg-th k cols will be the k series at lag 1, 84 | 85 | TStoXmv <- function(xmat,lg,y) { 86 | k <- ncol(xmat) 87 | # take one time series, transform to "X" form, delete the "Y" col 88 | processOneTS <- function(xmatCol) TStoX(xmatCol,lg)[,1:lg] 89 | tmp <- lapply(as.data.frame(xmat),processOneTS) 90 | # now piece everything together 91 | rslt <- NULL 92 | for (lag in 1:lg) { 93 | for (tSer in 1:k) { 94 | rslt <- cbind(rslt,tmp[[tSer]][,lag]) 95 | } 96 | } 97 | cbind(rslt,y[-(1:lg)]) 98 | 99 | } 100 | 101 | -------------------------------------------------------------------------------- /man/misc.Rd: -------------------------------------------------------------------------------- 1 | \name{misc} 2 | \alias{replicMeans} 3 | \alias{stdErrPred} 4 | \alias{pythonBlankSplit} 5 | \alias{stopBrowser} 6 | \alias{doPCA} 7 | \alias{PCAwithFactors} 8 | \alias{ulist} 9 | \alias{prToFile} 10 | \alias{partTrnTst} 11 | \alias{findOverallLoss} 12 | \alias{getNamedArgs} 13 | \alias{multCols} 14 | \alias{probIncorrectClass} 15 | \alias{propMisclass} 16 | 17 | \title{Utilities} 18 | 19 | \description{ 20 | Various helper functions. 21 | } 22 | 23 | \usage{ 24 | replicMeans(nrep,toReplic,timing=FALSE) 25 | stdErrPred(regObj,xnew) 26 | pythonBlankSplit(s) 27 | stopBrowser(msg = stop("msg not supplied")) 28 | doPCA(x,pcaProp) 29 | PCAwithFactors(x, nComps = ncol(x)) 30 | ulist(lst) 31 | prToFile(filename) 32 | partTrnTst(fullData,nTest=min(1000,round(0.2*nrow(fullData)))) 33 | findOverallLoss(regests,y,lossFtn = MAPE) 34 | getNamedArgs(argVec) 35 | multCols(x,cols,vals) 36 | probIncorrectClass(yhat, y, startAt1 = TRUE) 37 | propMisclass(y,yhat) 38 | } 39 | 40 | \arguments{ 41 | \item{regests}{Fitted regression estimates, training set.} 42 | \item{y}{Y values, training set.} 43 | \item{yhat}{Predicted Y values} 44 | \item{startAt1}{TRUE if indexing starts at 1, FALSE if starting at 0.} 45 | \item{lossFtn}{Loss functin.} 46 | \item{fullData}{A data frame or matrix.} 47 | \item{nTest}{Number of rows for the test set.} 48 | \item{filename}{Name of output file.} 49 | \item{lst}{An R list.} 50 | \item{x}{Matrix or data frame.} 51 | \item{pcaProp}{Fraction in [0,1], specifying number of PCA components 52 | to compute, in terms of fraction of total variance.} 53 | \item{nComps}{Number of PCA components.} 54 | \item{regObj}{An object of class \code{'lm'} or similar, for which 55 | there is a \code{vcov} generic function.} 56 | \item{xnew}{New X value to be predicted.} 57 | \item{nrep}{Number of replications.} 58 | \item{s}{A character string.} 59 | \item{toReplic}{Function call(s), as a quoted string, separated by 60 | semicolons if more than one call.} 61 | \item{timing}{If TRUE, find average elapsed time over the replicates.} 62 | \item{msg}{Character string, error message for existing debug browser.} 63 | \item{argVec}{R list or vector with named elements.} 64 | \item{cols}{A set of column numbers.} 65 | 
\item{vals}{A set of positive expansion numbers.} 66 | } 67 | 68 | \details{ 69 | 70 | The function \code{PCAwithFactors} is a wrapper for 71 | \code{stats::prcomp}, to be used on data frames that contain at least on 72 | R factor. 73 | 74 | } 75 | 76 | \value{ 77 | 78 | The function \code{PCAwithFactors} returns an object of class 79 | 'PCAwithFactors'. with components \code{pcout}, the object returned by 80 | the wrapped call to \code{prcomp}; \code{factorsInfo}, factor conversion 81 | information to be used with \code{predict}; and \code{preds}, the PCA 82 | version of \code{x}. 83 | 84 | The function \code{getNamedArgs} will assign in the caller's space 85 | variables with the names and values in \code{argVec}. 86 | 87 | } 88 | 89 | \examples{ 90 | 91 | w <- list(a=3,b=8) 92 | getNamedArgs(w) 93 | a 94 | b 95 | u <- c(5,12,13) 96 | names(u) <- c('x','y','z') 97 | getNamedArgs(u) 98 | x 99 | y 100 | z 101 | 102 | } 103 | 104 | \author{ 105 | Norm Matloff 106 | } 107 | 108 | -------------------------------------------------------------------------------- /inst/sdss2020/LocLinStudy.R: -------------------------------------------------------------------------------- 1 | 2 | # arguments 3 | 4 | # x: matrix of numeric features 5 | # y: factor or numeric vector 6 | # newx: matrix of numeric features in new cases 7 | # k: number of nearest neighbors 8 | # mahaThresh: a proportion 9 | 10 | # each data point in newx will use as smoothing function loclin() 11 | # instead of mean() 12 | 13 | mixedPreds <- function(x,y,newx,k,mahaThresh,scaleX) 14 | { 15 | if (is.factor(y)) stop('classification case not yet implemented') 16 | 17 | if (scaleX) { 18 | x <- scale(x,center=TRUE,scale=TRUE) 19 | xcntr <- attr(x, "scaled:center") 20 | xscl <- attr(x, "scaled:scale") 21 | newx <- scale(newx, center = xcntr, scale = xscl) 22 | } 23 | 24 | # first get distribution of M-dist 25 | meanx <- mean(x) 26 | covx <- cov(x) 27 | mhdists <- mahalanobis(x, meanx,covx) 28 | outerThresh <- quantile(mhdists,1-mahaThresh) 29 | 30 | # which rows of newx are on the edge of the data? 
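# rows of newx whose Mahalanobis distance from the training-data center
# exceeds outerThresh (the upper 1-mahaThresh quantile of the training
# distances) are flagged as "outThere"; they will later be smoothed with
# loclin() rather than mean()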
31 | newxMhdists <- mahalanobis(newx,meanx,covx) 32 | outThere <- which(newxMhdists > outerThresh) 33 | # the rest 34 | mainstream <- setdiff(1:nrow(newx),outThere) 35 | 36 | # now predict 37 | predsMainstream <- kNN(x,y,newx[mainstream,],k,scaleX=FALSE) 38 | predsOutThere <- kNN(x,y,newx[outThere,],k,scaleX=FALSE, 39 | smoothingFtn=loclin) 40 | preds <- vector(length=nrow(newx)) 41 | preds[mainstream] <- predsMainstream$regests 42 | preds[outThere] <- predsOutThere$regests 43 | 44 | list(preds=preds,mainstream=mainstream,outThere=outThere, 45 | predsMainstream=predsMainstream,predsOutThere=predsOutThere, 46 | k=k,scaleX=scaleX,outerThresh=outerThresh) 47 | } 48 | 49 | # experiments on the value of expanding predictor weights 50 | 51 | # generate data, then fit both mean() and loclin(), at various levels of 52 | # the Mahalanobis distance threshold; return vector of ratios of MAPE, 53 | # loclin()/mean() 54 | 55 | simLocLin <- function(n,p,k,catOut=FALSE,seed=9999) 56 | { 57 | x <- matrix(rexp(n*p),nrow=n) 58 | y <- rowSums(x)^2 + p*rnorm(n) 59 | newx <- matrix(rexp(n*p),nrow=n) 60 | newy <- rowSums(newx)^2 + p*rnorm(n) 61 | predsZM <- kNN(x,y,newx,50,scaleX=T)$regests 62 | res <- vector(length = 25) 63 | mhs <- seq(0.001,0.25,0.001) 64 | for (i in 1:length(mhs)) { 65 | mhprop <- mhs[i] 66 | zLL <- mixedPreds(x,y,newx,50,mhprop,T) 67 | predszLL <- zLL$preds 68 | mapeLoclin <- mean(abs(predszLL[zLL$outThere] - newy[zLL$outThere])) 69 | mapeMean <- mean(abs(predsZM[zLL$outThere] - newy[zLL$outThere])) 70 | if (catOut) cat(mhprop,mapeLoclin,mapeMean,'\n') 71 | res[i] <- mapeLoclin / mapeMean 72 | } 73 | res 74 | } 75 | 76 | doSim <- function() 77 | { 78 | res.5000.2.25 <<- simLocLin(5000,2,25,catOut=T) 79 | res.5000.2.100 <<- simLocLin(5000,2,100,catOut=T) 80 | res.5000.10.25 <<- simLocLin(5000,10,25,catOut=T) 81 | res.5000.10.100 <<- simLocLin(5000,10,100,catOut=T) 82 | res.5000.20.25 <<- simLocLin(5000,20,25,catOut=T) 83 | res.5000.20.100 <<- simLocLin(5000,20,100,catOut=T) 84 | } 85 | 86 | plottingSim <- function() 87 | { 88 | 89 | plot(seq(0.001,0.25,0.001),res.5000.2.25, type = "l",ylim=c(0,1), 90 | xlab='MH distance threshold',ylab='MAPE.loclin/mean') 91 | lines(seq(0.001,0.25,0.001),res.5000.2.100, type = "l") 92 | lines(seq(0.001,0.25,0.001),res.5000.10.25, type = "l") 93 | lines(seq(0.001,0.25,0.001),res.5000.10.100, type = "l") 94 | lines(seq(0.001,0.25,0.001),res.5000.20.25, type = "l") 95 | lines(seq(0.001,0.25,0.001),res.5000.20.100, type = "l") 96 | abline(h=1) 97 | 98 | # see which line is which 99 | print(res.5000.2.25[250]) 100 | print(res.5000.2.100[250]) 101 | print(res.5000.10.25[250]) 102 | print(res.5000.10.100[250]) 103 | print(res.5000.20.25[250]) 104 | print(res.5000.20.100[250]) 105 | 106 | text(0.190,0.930,labels='p=2,k=25') 107 | text(0.241,0.893,labels='p=2,k=100') 108 | text(0.199,0.289,labels='p=10') 109 | text(0.118,0.066,labels='p=20') 110 | 111 | } 112 | 113 | 114 | -------------------------------------------------------------------------------- /inst/sdss2020/LocLinStudy.R~: -------------------------------------------------------------------------------- 1 | 2 | # arguments 3 | 4 | # x: matrix of numeric features 5 | # y: factor or numeric vector 6 | # newx: matrix of numeric features in new cases 7 | # k: number of nearest neighbors 8 | # mahaThresh: a proportion 9 | 10 | # each data point in newx will use as smoothing function loclin() 11 | # instead of mean() 12 | 13 | mixedPreds <- function(x,y,newx,k,mahaThresh,scaleX) 14 | { 15 | if (is.factor(y)) 
stop('classification case not yet implemented') 16 | 17 | if (scaleX) { 18 | x <- scale(x,center=TRUE,scale=TRUE) 19 | xcntr <- attr(x, "scaled:center") 20 | xscl <- attr(x, "scaled:scale") 21 | newx <- scale(newx, center = xcntr, scale = xscl) 22 | } 23 | 24 | # first get distribution of M-dist 25 | meanx <- mean(x) 26 | covx <- cov(x) 27 | mhdists <- mahalanobis(x, meanx,covx) 28 | outerThresh <- quantile(mhdists,1-mahaThresh) 29 | 30 | # which rows of newx are on the edge of the data? 31 | newxMhdists <- mahalanobis(newx,meanx,covx) 32 | outThere <- which(newxMhdists > outerThresh) 33 | # the rest 34 | mainstream <- setdiff(1:nrow(newx),outThere) 35 | 36 | # now predict 37 | predsMainstream <- kNN(x,y,newx[mainstream,],k,scaleX=FALSE) 38 | predsOutThere <- kNN(x,y,newx[outThere,],k,scaleX=FALSE, 39 | smoothingFtn=loclin) 40 | preds <- vector(length=nrow(newx)) 41 | preds[mainstream] <- predsMainstream$regests 42 | preds[outThere] <- predsOutThere$regests 43 | 44 | list(preds=preds,mainstream=mainstream,outThere=outThere, 45 | predsMainstream=predsMainstream,predsOutThere=predsOutThere, 46 | k=k,scaleX=scaleX,outerThresh=outerThresh) 47 | } 48 | 49 | # experiments on the value of expanding predictor weights 50 | 51 | # generate data, then fit both mean() and loclin(), at various levels of 52 | # the Mahalanobis distance threshold; return vector of ratios of MAPE, 53 | # loclin()/mean() 54 | 55 | simLocLin <- function(n,p,k,catOut=FALSE,seed=9999) 56 | { 57 | x <- matrix(rexp(n*p),nrow=n) 58 | y <- rowSums(x)^2 + p*rnorm(n) 59 | newx <- matrix(rexp(n*p),nrow=n) 60 | newy <- rowSums(newx)^2 + p*rnorm(n) 61 | predsZM <- kNN(x,y,newx,50,scaleX=T)$regests 62 | res <- vector(length = 25) 63 | mhs <- seq(0.001,0.25,0.001) 64 | for (i in 1:length(mhs)) { 65 | mhprop <- mhs[i] 66 | zLL <- mixedPreds(x,y,newx,50,mhprop,T) 67 | predszLL <- zLL$preds 68 | mapeLoclin <- mean(abs(predszLL[zLL$outThere] - newy[zLL$outThere])) 69 | mapeMean <- mean(abs(predsZM[zLL$outThere] - newy[zLL$outThere])) 70 | if (catOut) cat(mhprop,mapeLoclin,mapeMean,'\n') 71 | res[i] <- mapeLoclin / mapeMean 72 | } 73 | res 74 | } 75 | 76 | doSim <- function() 77 | { 78 | res.5000.2.25 <<- simLocLin(5000,2,25,catOut=T) 79 | res.5000.2.100 <<- simLocLin(5000,2,100,catOut=T) 80 | res.5000.10.25 <<- simLocLin(5000,10,25,catOut=T) 81 | res.5000.10.100 <<- simLocLin(5000,10,100,catOut=T) 82 | res.5000.20.25 <<- simLocLin(5000,20,25,catOut=T) 83 | res.5000.20.100 <<- simLocLin(5000,20,100,catOut=T) 84 | } 85 | 86 | plottingSim <- function() 87 | { 88 | 89 | plot(seq(0.001,0.25,0.001),res.5000.2.25, type = "l",ylim=c(0,1), 90 | xlab='MH distance threshold',ylab='MAPE.loclin/mean') 91 | lines(seq(0.001,0.25,0.001),res.5000.2.100, type = "l") 92 | lines(seq(0.001,0.25,0.001),res.5000.10.25, type = "l") 93 | lines(seq(0.001,0.25,0.001),res.5000.10.100, type = "l") 94 | lines(seq(0.001,0.25,0.001),res.5000.20.25, type = "l") 95 | lines(seq(0.001,0.25,0.001),res.5000.20.100, type = "l") 96 | abline(h=1) 97 | 98 | # see which line is which 99 | print(res.5000.2.25[250]) 100 | print(res.5000.2.100[250]) 101 | print(res.5000.10.25[250]) 102 | print(res.5000.10.100[250]) 103 | print(res.5000.20.25[250]) 104 | print(res.5000.20.100[250]) 105 | 106 | text(0.190,0.910,labels='p=2,k=25') 107 | text(0.241,0.893,labels='p=2,k=100') 108 | text(0.199,0.289,labels='p=10') 109 | text(0.118,0.066,labels='p=20') 110 | 111 | } 112 | 113 | 114 | -------------------------------------------------------------------------------- /man/krsFit.Rd: 
-------------------------------------------------------------------------------- 1 | \name{krsFit} 2 | \alias{krsFit} 3 | \alias{krsFitImg} 4 | \alias{diagNeural} 5 | \alias{predict.krsFit} 6 | 7 | \title{Tools for Neural Networks} 8 | 9 | \description{ 10 | Tools to complement existing neural networks software, notably 11 | a more "R-like" wrapper to fitting data with R's \pkg{keras} package. 12 | } 13 | 14 | \usage{ 15 | krsFit(x,y,hidden,acts=rep("relu",length(hidden)),learnRate=0.001, 16 | conv=NULL,xShape=NULL,classif=TRUE,nClass=NULL,nEpoch=30, 17 | scaleX=TRUE,scaleY=TRUE) 18 | krsFitImg(x,y,hidden=c(100,100),acts=rep("relu",length(hidden)), 19 | nClass,nEpoch=30) 20 | \method{predict}{krsFit}(object,...) 21 | diagNeural(krsFitOut) 22 | } 23 | 24 | \arguments{ 25 | \item{object}{An object of class 'krsFit'.} 26 | \item{...}{Data points to be predicted, 'newx'.} 27 | \item{x}{X data, predictors, one row per data point, in the training 28 | set. Must be a matrix.} 29 | \item{y}{Numeric vector of Y values. In classification case 30 | must be integers, not an R factor, and take on the values 0,1,2,..., 31 | \code{nClass}-1}. 32 | \item{hidden}{Vector of number of units per 33 | hidden layer, or the rate for a dropout layer.} 34 | \item{acts}{Vector of names of the activation functions, one per 35 | hidden layer. Choices inclde 'relu', 'sigmoid', 'tanh', 'softmax', 36 | 'elu', 'selu'.} 37 | \item{learnRate}{Learning rate.} 38 | \item{conv}{R list specifying the convolutional layers, if any.} 39 | \item{xShape}{Vector giving the number of rows and columns, and in the 40 | convolutional case with multiple channels, the number of channels.} 41 | \item{classif}{If TRUE, indicates a classification problem.} 42 | \item{nClass}{Number of classes.} 43 | \item{nEpoch}{Number of epochs.} 44 | \item{krsFitOut}{An object returned by \code{krstFit}.} 45 | \item{scaleX}{If TRUE, scale X columns.} 46 | \item{scaleY}{If TRUE, scale Y columns.} 47 | } 48 | 49 | \details{ 50 | 51 | The \code{krstFit} function is a wrapper for the entire pipeline 52 | in fitting the R \pkg{keras} package to a dataset: Defining the model, 53 | compiling, stating the inputs and so on. As a result, the wrapper 54 | allows the user to skip those details (or not need to even know them), 55 | and define the model in a manner more familiar to R users. 56 | 57 | The paired \code{predict.krsFit} takes as its first argument the output 58 | of \code{krstFit}, and \code{newx}, the points to be predicted. 
59 | } 60 | 61 | \examples{ 62 | 63 | \dontrun{ 64 | library(keras) 65 | data(peDumms) 66 | ped <- peDumms[,c(1,20,22:27,29,32,31)] 67 | # predict wage income 68 | x <- ped[,-11] 69 | y <- ped[,11] 70 | z <- krsFit(x,y,c(50,50,50),classif=FALSE,nEpoch=25) 71 | preds <- predict(z,x) 72 | mean(abs(preds-y)) # something like 25000 73 | 74 | x <- ped[,-(4:8)] 75 | y <- ped[,4:8] 76 | y <- dummiesToInt(y,FALSE) - 1 77 | z <- krsFit(x,y,c(50,50,0.20,50),classif=TRUE,nEpoch=175,nClass=6) 78 | preds <- predict(z,x) 79 | mean(preds == y) # something like 0.39 80 | 81 | # obtain MNIST training and test sets; the following then uses the 82 | # example network of 83 | 84 | # https://databricks-prod-cloudfront.cloud.databricks.com/ 85 | # public/4027ec902e239c93eaaa8714f173bcfc/2961012104553482/ 86 | # 4462572393058129/1806228006848429/latest.html 87 | 88 | # converted to use the krsFit wrapper 89 | 90 | x <- mntrn[,-785] / 255 91 | y <- mntrn[,785] 92 | xShape <- c(28,28) 93 | 94 | # define convolutional layers 95 | conv1 <- list(type='conv2d',filters=32,kern=3) 96 | conv2 <- list(type='pool',kern=2) 97 | conv3 <- list(type='conv2d',filters=64,kern=3) 98 | conv4 <- list(type='pool',kern=2) 99 | conv5 <- list(type='drop',drop=0.5) 100 | 101 | # call wrapper, 1 dense hidden layer of 128 units, then dropout layer 102 | # with proportion 0.5 103 | z <- krsFit(x,y,conv=list(conv1,conv2,conv3,conv4,conv5),c(128,0.5), 104 | classif=TRUE,nClass=10,nEpoch=10,xShape=c(28,28),scaleX=FALSE,scaleY=FALSE) 105 | 106 | # try on test set 107 | preds <- predict(z,mntst[,-785]/255) 108 | mean(preds == mntst[,785]) # 0.98 in my sample run 109 | 110 | } 111 | 112 | } 113 | 114 | \author{ 115 | Norm Matloff 116 | } 117 | 118 | -------------------------------------------------------------------------------- /man/ovalogtrn.Rd: -------------------------------------------------------------------------------- 1 | \name{multiclass routines} 2 | \alias{boundaryplot} 3 | \alias{ovalogtrn} 4 | \alias{ovaknntrn} 5 | \alias{ovalogpred} 6 | \alias{avalogtrn} 7 | \alias{avalogpred} 8 | \alias{predict.ovaknn} 9 | \alias{classadjust} 10 | \alias{confusion} 11 | \alias{factorTo012ec} 12 | \alias{classadjust} 13 | 14 | \title{Classification with More Than 2 Classes} 15 | 16 | \description{ 17 | Tools for multiclass classification, parametric and nonparametric. 18 | } 19 | 20 | \usage{ 21 | avalogtrn(trnxy,yname) 22 | ovaknntrn(trnxy,yname,k,xval=FALSE) 23 | avalogpred() 24 | classadjust(econdprobs,wrongprob1,trueprob1) 25 | boundaryplot(y01,x,regests,pairs=combn(ncol(x),2),pchvals=2+y01,cex=0.5,band=0.10) 26 | } 27 | 28 | \arguments{ 29 | \item{pchvals}{Point size in base-R graphics.} 30 | \item{trnxy}{Data matrix, Y last.} 31 | \item{xval}{If TRUE, use leaving-one-out method.} 32 | \item{y01}{Y vector (1s and 0s).} 33 | \item{regests}{Estimated regression function values.} 34 | \item{x}{X data frame or matrix.} 35 | \item{pairs}{Two-row matrix, column i of which is a pair of predictor 36 | variables to graph.} 37 | \item{cex}{Symbol size for plotting.} 38 | \item{band}{If \code{band} is non-NULL, only points within \code{band}, 39 | say 0.1, of est. 
P(Y = 1) are displayed, for a contour-like effect.} 40 | \item{yname}{Name of the Y column.} 41 | \item{k}{Number of nearest neighbors.} 42 | \item{econdprobs}{Estimated conditional class probabilities, given the 43 | predictors.} 44 | \item{wrongprob1}{Incorrect, data-provenanced, unconditional P(Y = 1).} 45 | \item{trueprob1}{Correct unconditional P(Y = 1).} 46 | } 47 | 48 | \details{ 49 | 50 | These functions aid classification in the multiclass setting. 51 | 52 | The function \code{boundaryplot} serves as a visualization technique, 53 | for the two-class setting. It draws the boundary between predicted Y = 54 | 1 and predicted Y = 0 data points in 2-dimensional feature space, as 55 | determined by the argument \code{regests}. Used to visually assess 56 | goodness of fit, typically running this function twice, say one for 57 | \code{glm} then for \code{kNN}. If there is much discrepancy and the 58 | analyst wishes to still use glm(), he/she may wish to add polynomial 59 | terms. 60 | 61 | The functions not listed above are largely deprecated, e.g. in favor of 62 | \code{qeLogit} and the other \code{qe}-series functions. 63 | 64 | } 65 | 66 | \examples{ 67 | 68 | \dontrun{ 69 | 70 | 71 | data(oliveoils) 72 | oo <- oliveoils[,-1] 73 | 74 | # toy example 75 | set.seed(9999) 76 | x <- runif(25) 77 | y <- sample(0:2,25,replace=TRUE) 78 | xd <- preprocessx(x,2,xval=FALSE) 79 | kout <- ovaknntrn(y,xd,m=3,k=2) 80 | kout$regest # row 2: 0.0,0.5,0.5 81 | predict(kout,predpts=matrix(c(0.81,0.55,0.15),ncol=1)) # 0,2,0or2 82 | yd <- factorToDummies(as.factor(y),'y',FALSE) 83 | kNN(x,yd,c(0.81,0.55,0.15),2) # predicts 0, 1or2, 2 84 | 85 | data(peDumms) # prog/engr data 86 | ped <- peDumms[,-33] 87 | ped <- as.matrix(ped) 88 | x <- ped[,-(23:28)] 89 | y <- ped[,23:28] 90 | knnout <- kNN(x,y,x,25,leave1out=TRUE) 91 | truey <- apply(y,1,which.max) - 1 92 | mean(knnout$ypreds == truey) # about 0.37 93 | xd <- preprocessx(x,25,xval=TRUE) 94 | kout <- knnest(y,xd,25) 95 | preds <- predict(kout,predpts=x) 96 | hats <- apply(preds,1,which.max) - 1 97 | mean(yhats == truey) # about 0.37 98 | 99 | data(peFactors) 100 | # discard the lower educ-level cases, which are rare 101 | edu <- peFactors$educ 102 | numedu <- as.numeric(edu) 103 | idxs <- numedu >= 12 104 | pef <- peFactors[idxs,] 105 | numedu <- numedu[idxs] 106 | pef$educ <- as.factor(numedu) 107 | pef1 <- pef[,c(1,3,5,7:9)] 108 | 109 | # ovalog 110 | ovaout <- ovalogtrn(pef1,"occ") 111 | preds <- predict(ovaout,predpts=pef1[,-3]) 112 | mean(preds == factorTo012etc(pef1$occ)) # about 0.39 113 | 114 | # avalog 115 | 116 | avaout <- avalogtrn(pef1,"occ") 117 | preds <- predict(avaout,predpts=pef1[,-3]) 118 | mean(preds == factorTo012etc(pef1$occ)) # about 0.39 119 | 120 | # knn 121 | 122 | knnout <- ovalogtrn(pef1,"occ",25) 123 | preds <- predict(knnout,predpts=pef1[,-3]) 124 | mean(preds == factorTo012etc(pef1$occ)) # about 0.43 125 | 126 | data(oliveoils) 127 | oo <- oliveoils 128 | oo <- oo[,-1] 129 | knnout <- ovaknntrn(oo,'Region',10) 130 | # predict a new case that is like oo1[1,] but with palmitic = 950 131 | newx <- oo[1,2:9,drop=FALSE] 132 | newx[,1] <- 950 133 | predict(knnout,predpts=newx) # predicts class 2, South 134 | 135 | } 136 | 137 | } 138 | 139 | \author{ 140 | Norm Matloff 141 | } 142 | 143 | -------------------------------------------------------------------------------- /man/lmac.Rd: -------------------------------------------------------------------------------- 1 | \name{lmac,makeNA,coef.lmac,vcov.lmac,pcac,loglinac,tbltofakedf} 2 
| \alias{lmac} 3 | \alias{pcac} 4 | \alias{coef.lmac} 5 | \alias{vcov.lmac} 6 | \alias{loglinac} 7 | \alias{tbltofakedf} 8 | \alias{makeNA} 9 | \alias{NAsTo0s} 10 | \alias{ZerosToNAs} 11 | 12 | \title{Available Cases Method for Missing Data} 13 | 14 | \description{ 15 | Various estimators that handle missing data via the Available Cases Method 16 | } 17 | 18 | \usage{ 19 | lmac(xy,nboot=0) 20 | makeNA(m,probna) 21 | NAsTo0s(x) 22 | ZerosToNAs(x,replaceVal=0) 23 | \method{coef}{lmac}(object,...) 24 | \method{vcov}{lmac}(object,...) 25 | pcac(indata,scale=FALSE) 26 | loglinac(x,margin) 27 | tbltofakedf(tbl) 28 | } 29 | 30 | \arguments{ 31 | \item{replaceVal}{Value to be replaced by NA.} 32 | \item{xy}{Matrix or data frame, X values in the first columns, Y 33 | in the last column.} 34 | \item{indata}{Matrix or data frame.} 35 | \item{x}{Matrix or data frame, one column per variable.} 36 | \item{nboot}{If positive, number of bootstrap samples to take.} 37 | \item{probna}{Probability that an element will be NA.} 38 | \item{scale}{If TRUE, call \code{cor} instead of \code{cov}.} 39 | \item{tbl}{An R table.} 40 | \item{m}{Number of synthetic NAs to insert.} 41 | \item{object}{Output from \code{lmac}.} 42 | \item{...}{Needed for consistency with generic function. Not used.} 43 | \item{margin}{A list of vectors specifying the model, as in 44 | \code{loglin}.} 45 | 46 | } 47 | 48 | \details{ 49 | 50 | The Available Cases (AC) approach applies to statistical methods that 51 | depend only on products of k of the variables, so that cases having 52 | non-NA values for those k variables can be used, as opposed to using 53 | only cases that are fully intact in all variables, the Complete Cases 54 | (CC) approach. In the case of linear regression, for instance, the 55 | estimated coefficients depend only on covariances between the 56 | variables (both predictors and response). This approach assumes thst 57 | the cases with missing values have the same distribution as the 58 | intact cases. 59 | 60 | The \code{lmac} function forms OLS estimates as with \code{lm}, but 61 | applying AC, in contrast to \code{lm}, which uses the CC method. 62 | 63 | The \code{pcac} function is an AC substitute for \code{prcomp}. The 64 | data is centered, corresponding to a fixed value of \code{center = 65 | TRUE} in \code{prcomp}. It is also scaled if \code{scale} is TRUE, 66 | corresponding \code{scale = TRUE} in \code{prcomp}. Due to AC, 67 | there is a small chance of negative eigenvalues, in which case 68 | \code{stop} will be called. 69 | 70 | The \code{loglinac} function is an AC substitute for \code{loglin}. 71 | The latter takes tables as input, but \code{loglinac} takes the raw 72 | data. If you have just the table, use \code{tbltofakedf} to 73 | regenerate a usable data frame. 74 | 75 | The \code{makeNA} function is used to insert random NA values into 76 | data, for testing purposes. 
77 | 78 | } 79 | 80 | \value{ 81 | 82 | For \code{lmac}, an object of class \code{lmac}, with components 83 | 84 | \itemize{ 85 | 86 | \item{coefficients}, as with \code{lm}; 87 | accessible directly or by calling \code{coef}, as with \code{lm} 88 | 89 | \item{fitted.values}, as with \code{lm} 90 | 91 | \item{residuals}, as with \code{lm} 92 | 93 | \item{r2}, (unadjusted) R-squared 94 | 95 | \item{cov}, for \code{nboot > 0} the estimated covariance matrix 96 | of the vector of estimated regression coefficients; accessible 97 | directly or by calling \code{vcov}, as with \code{lm} 98 | 99 | } 100 | 101 | For \code{pcac}, an R list, with components 102 | 103 | \itemize{ 104 | 105 | \item{sdev}, as with \code{prcomp} 106 | 107 | \item{rotation}, as with \code{prcomp} 108 | 109 | } 110 | 111 | For \code{loglinac}, an R list, with components 112 | 113 | \itemize{ 114 | 115 | \item{param}, estimated coefficients, as in \code{loglin} 116 | 117 | \item{fit}, estimated expected call counts, as in \code{loglin} 118 | 119 | } 120 | 121 | } 122 | 123 | \examples{ 124 | n <- 25000 125 | w <- matrix(rnorm(2*n),ncol=2) # x and epsilon 126 | x <- w[,1] 127 | y <- x + w[,2] 128 | # insert some missing values 129 | nmiss <- round(0.1*n) 130 | x[sample(1:n,nmiss)] <- NA 131 | nmiss <- round(0.2*n) 132 | y[sample(1:n,nmiss)] <- NA 133 | acout <- lmac(cbind(x,y)) 134 | coef(acout) # should be near pop. values 0 and 1 135 | } 136 | 137 | \author{ 138 | Norm Matloff 139 | } 140 | 141 | -------------------------------------------------------------------------------- /inst/ScalingInPCA.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Scaling in PCA 2 | 3 | Many resources on machine learning (ML) methodology recommend, or even 4 | state as crucial, that one *scale* (or *standardize*) one's data, i.e. 5 | divide each variable by its standard deviation (after subtracting the 6 | mean), before applying Principal Component Analysis (PCA). Here we will 7 | show why that can be problematic, and provide alternatives. 8 | 9 | ## Overview 10 | 11 | The recommendation to scale is common. Here are some examples: 12 | 13 | * R **prcomp()** man page 14 | 15 | They say "scaling is advisable": 16 | 17 | > scale.: a logical value indicating whether the variables should be 18 | > scaled to have unit variance before the analysis takes place. 19 | > The default is ‘FALSE’ for consistency with S, but in general 20 | > scaling is advisable. 21 | 22 | * [Scikit-Learn](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html): 23 | 24 | Actually, the mention of normal distributions is misleading and in 25 | any case not relevant, but again there is a rather imperative statement 26 | to scale: 27 | 28 | > Feature scaling through standardization (or Z-score normalization) can 29 | > be an important preprocessing step for many machine learning algorithms. 30 | > Standardization involves rescaling the features such that they have the 31 | > properties of a standard normal distribution with a mean of zero and a 32 | > standard deviation of one. 33 | 34 | * [DataCamp](https://www.datacamp.com/community/tutorials/pca-analysis-r) 35 | 36 | Again, their phrasing is rather imperative: 37 | 38 | > Note that the units used [in the **mtcars** dataset] vary and occupy 39 | > different scales...You will also set two arguments, center and scale, to 40 | > be TRUE. 
41 | 42 | * [caret](https://cran.r-project.org/package=caret), **preProcess** man 43 | page 44 | 45 | Scaling done unless you say no: 46 | 47 | > If PCA is requested but centering and scaling are not, the values will 48 | > still be centered and scaled. 49 | 50 | * [Visually Enforced](https://www.gastonsanchez.com/visually-enforced/how-to/2012/06/17/PCA-in-R/) 51 | 52 | The word "must" is used here: 53 | 54 | > Since most of the times the variables are measured in different scales, 55 | > the PCA must be performed with standardized data (mean = 0, variance = 56 | > 1). 57 | 58 | ## The perceived problem 59 | 60 | As the DataCamp statement notes, some data may be "large" while other 61 | data are "small." There is a concern that, without scaling, the large 62 | ones will artificially dominate. This is especially an issue in light 63 | of the variation in measurement systems -- should a variable measured in 64 | kilometers be given more weight than one measured in miles? 65 | 66 | ## Motivating counterexample 67 | 68 | Consider a setting with two independent variables, A and B, with means 69 | 100, and with Var(A) = 500 and Var(B) = 2. Let A' and B' denote these 70 | variables after centering and scaling. 71 | 72 | PCA is all about removing variables with small variance, as they are 73 | essentially constant. If we work with A and B, we would of course use 74 | only A. **But if we work with A' and B', we would use both of them, as 75 | they both have variance 1.0.** Scaling has seriously misled us here. 76 | 77 | ## Alternatives 78 | 79 | The real goal should be to make the variables *commensurate*. 80 | Standardizing to mean 0, variance 1 is not the only way one can do this. 81 | Consider the following alternatives. 82 | 83 | * Do nothing. In many data sets, the variables of interest are already 84 | commensurate. Consider survey data, say, with each survey question 85 | asking for a response on a scale of 1 to 5. No need to transform the 86 | data here, and worse, standardizing would have the distoritionary effect 87 | of exaggerating rare values in items with small variance. 88 | 89 | * Map each variable to the interval [0,1], i.e. t -> (t-m)/(M-m), where 90 | m and M are the minimum and maximum values of the given variable. 91 | This is typically better than standardizing, but it does have some 92 | problems. First, it is sensitive to outliers. This might be 93 | ameliorated with a modified form of the transformation, but a second 94 | problem is that new data -- new data in prediction applications, say -- 95 | may stray from this [0,1] world. 96 | 97 | * Instead of changing the *standard deviation* of a variable to 1.0, 98 | change its *mean* to 1.0. This addresses the miles-vs.-kilometers 99 | concern more directly, without inducing the distortions I described 100 | above. And if one is worried about outliers, then divide the variable 101 | by the median or other trimmed mean. 102 | 103 | 104 | -------------------------------------------------------------------------------- /R/Penrose.R: -------------------------------------------------------------------------------- 1 | 2 | # routines to explore effect of deliberate overfitting beyond 3 | # "interpolation," i.e. 
beyond "perfect fit" 4 | 5 | ################### penroseLM() ######################### 6 | 7 | # Penrose inverse version of lm(); a 1s col is added as in lm() 8 | 9 | # arguments: 10 | 11 | # d:data frame; must be numeric 12 | # yName: name of "Y" column 13 | 14 | # value: 15 | 16 | # object of class 'penroseLM',with beta-hat as 'bh' and colnames(x) 17 | 18 | penroseLM <- function(d,yName) 19 | { 20 | ycol <- which(names(d) == yName) 21 | x <- cbind(1,as.matrix(d[,-ycol])) 22 | xnms <- colnames(x) 23 | y <- d[,ycol] 24 | # MASS::ginv() does Penrose inverse 25 | res <- list(bh=MASS::ginv(x) %*% y, xnms=xnms) 26 | class(res) <- 'penroseLM' 27 | res 28 | } 29 | 30 | # arguments: 31 | 32 | # object: return value of penroseLM() 33 | # newx: data frame in the same format as x in penroseLM(); numeric 34 | 35 | predict.penroseLM <- function(object,...) 36 | { 37 | arglist <- list(...) 38 | newx <- arglist[[1]] 39 | 40 | if(names(newx) != object$xnms) stop('name mismatch') 41 | newx <- cbind(1,as.matrix(newx)) 42 | bh <- object$bh 43 | newx %*% bh 44 | } 45 | 46 | ################### penrosePoly() ######################### 47 | 48 | # polynomial regression with Penrose inverse; uses polyreg 49 | 50 | penrosePoly <- function(d,yName,deg,maxInteractDeg=deg) 51 | { 52 | requireNamespace('polyreg') 53 | ycol <- which(names(d) == yName) 54 | x <- as.matrix(d[,-ycol,drop=FALSE]) 55 | polyout <- polyreg::getPoly(x,deg=deg,maxInteractDeg=maxInteractDeg) 56 | xPoly <- polyout$xdata # polynomial version of x 57 | y <- d[,ycol] 58 | xy <- cbind(xPoly,y) 59 | res <- list(bh=penroseLM(xy,'y')$bh, 60 | deg=deg, 61 | maxInteractDeg=maxInteractDeg, 62 | modelFormula=polyout$modelFormula, 63 | XtestFormula=polyout$XtestFormula, 64 | retainedNames=polyout$retainedNames, 65 | standardize=FALSE 66 | ) 67 | class(res) <- 'penrosePoly' 68 | res 69 | } 70 | 71 | predict.penrosePoly <- function(object,...) 72 | { 73 | requireNamespace('polyreg') 74 | arglist <- list(...) 
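   # as in predict.penroseLM(), the single ... argument is taken to be
   # newx, the new data points to be predicted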
75 | newx <- arglist[[1]] 76 | 77 | if (nrow(newx) == 1) { 78 | # problem in getPoly() for case of a 1-row newx, reported to PM; 79 | # have this workaround for now 80 | oneRow <- TRUE 81 | newx <- rbind(newx,newx) 82 | } else oneRow <- FALSE 83 | polyout <- polyreg::getPoly(newx, 84 | deg=object$deg, 85 | maxInteractDeg = object$maxInteractDeg, 86 | modelFormula = object$modelFormula, 87 | retainedNames = object$retainedNames) 88 | xPoly <- polyout$xdata # polynomial version of newx 89 | xPoly <- as.matrix(xPoly) 90 | xPoly <- cbind(1,xPoly) 91 | bh <- object$bh 92 | res <- xPoly %*% bh 93 | if (oneRow) res <- res[1] 94 | res 95 | } 96 | 97 | predpnr <- predict.penrosePoly 98 | 99 | ################### ridgePoly() ######################### 100 | 101 | # according to Hastie et al, "properly tuned" ridge regression beats 102 | # mininum norm 103 | 104 | # polynomial regression with Penrose inverse; uses polyreg 105 | 106 | ridgePoly <- function(d,yName,deg,maxInteractDeg=deg) 107 | { 108 | requireNamespace('polyreg') 109 | if (!allNumeric(d)) stop('for now, X,Y must be numeric') 110 | ycol <- which(names(d) == yName) 111 | x <- as.matrix(d[,-ycol]) 112 | polyout <- polyreg::getPoly(x,deg=deg,maxInteractDeg=maxInteractDeg) 113 | xPoly <- polyout$xdata # polynomial version of x 114 | xPoly <- as.matrix(xPoly) 115 | y <- d[,ycol] 116 | cvgout <- glmnet::cv.glmnet(x=xPoly,y=y,alpha=0,family='gaussian') 117 | res <- list(cvgout=cvgout, 118 | deg=deg, 119 | maxInteractDeg=maxInteractDeg, 120 | modelFormula=polyout$modelFormula, 121 | XtestFormula=polyout$XtestFormula, 122 | retainedNames=polyout$retainedNames, 123 | standardize=FALSE 124 | ) 125 | class(res) <- 'ridgePoly' 126 | res 127 | } 128 | 129 | predict.ridgePoly <- function(object,...) 130 | { 131 | requireNamespace('polyreg') 132 | arglist <- list(...) 133 | newx <- arglist[[1]] 134 | 135 | # newx must be a matrix, with the original column names 136 | if (nrow(newx) == 1) { 137 | # problem in getPoly() for case of a 1-row newx, reported to PM; 138 | # have this workaround for now 139 | oneRow <- TRUE 140 | newx <- rbind(newx,newx) 141 | } else oneRow <- FALSE 142 | polyout <- polyreg::getPoly(newx, 143 | deg=object$deg, 144 | maxInteractDeg = object$maxInteractDeg, 145 | modelFormula = object$modelFormula, 146 | retainedNames = object$retainedNames) 147 | xPoly <- polyout$xdata # polynomial version of newx 148 | xPoly <- as.matrix(xPoly) 149 | glmObject <- object$cvgout 150 | res <- predict(glmObject,s=glmObject$lambda.min,newx=xPoly) 151 | # bh <- object$bh 152 | # res <- xPoly %*% bh 153 | if (oneRow) res <- res[1] 154 | res 155 | } 156 | 157 | -------------------------------------------------------------------------------- /inst/InterpretedR.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: R and Python as Interpreted Languages, and the Roles of Vectorization and Pointers 2 | 3 | One often hears statements like "R is an interpreted language, and thus 4 | is slow." The same is true for Python and Java. But what does that 5 | really mean? 6 | 7 | ## Example 8 | 9 | Consider this code snippet: 10 | 11 | ``` r 12 | # x, creating previously, is a vector of 100 elements 13 | tot <- 0 14 | for (i in 1:100) 15 | tot <- tot + x[i] 16 | ``` 17 | 18 | The *vectorized* alternative is 19 | 20 | ``` r 21 | tot <- sum(x) 22 | ``` 23 | 24 | The second version will indeed be faster than the first (though with a 25 | small vector like this, both will execute essentially instantaneously). 
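The gap becomes visible with a larger vector (a minimal sketch; exact timings vary by machine):

``` r
x <- rnorm(1e7)             # 10 million elements
system.time({               # explicit interpreted loop
   tot <- 0
   for (i in seq_along(x)) tot <- tot + x[i]
})
system.time(tot <- sum(x))  # vectorized; typically far faster
```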
26 | But why? 27 | 28 | ## Interpreted languages 29 | 30 | R does not run directly on your machine. What runs directly is another 31 | program, the *interpreter*. Like any program running on your machine, 32 | it is written in machine language, patterns of 0s and 1s that code 33 | primitive operations, e.g. ``Add these 2 numbers.'' In most cases, that 34 | machine language was generated from C/C++ source, translated to machine 35 | code by a *compiler*. The interpreter in essence simulates a fake 36 | machine, on which your R code "runs." The details are not important for 37 | us. 38 | 39 | The reason that first code snippet above will run slowly is that the 40 | compiler will have to do a lot of repetitive work. For instance, it 41 | will have to look up where in memory **tot** and **x** are--100 times! 42 | 43 | By contrast, in that second code snippet, the **sum()** function is 44 | actual machine code (again, generated from C by the compiler). The 45 | location of **x** is set just once, and the code marches through the 46 | vector without any lookup. That code will still repeat a summation 47 | operation 100 times, but without the time-consuming lookup. The 48 | situation is similar for **tot**. 49 | 50 | ## Vectorized code 51 | 52 | We say that the second snippet is *vectorized* code, because the entire 53 | vector is processed directly by machine code. As noted, that code will 54 | still loop around 100 times, so it is not the case that the code 55 | operates on the entire vector at once (there are special types 56 | of hardware, e.g. GPUs, that *can* do this), but it will be faster for 57 | the reasons cited above. Some R code is vectorizable, some not. 58 | 59 | ## Pointers 60 | 61 | A computer's memory is broken down into *bytes*, with a certain number 62 | of bytes forming a *word*. On today's machines, word size is typically 63 | 8 bytes. So, if your machine has, say 4 Gb of RAM, then it has 0.5 64 | billion words. Your code's numbers are typically stored one to a word, 65 | e.g. 100 consecutive words in the above code, while your text data is 66 | stored one to a byte. 67 | 68 | Each byte, and each word, has an ID, called an *address*. When your 69 | code refers to a vector, say **x** above, internally it refers to the 70 | address in memory at which the vector starts. Internally, that starting 71 | address is kept in a variable called a *pointer*. 72 | 73 | Now, here is a key point: Say the vector **u** is rather long, say 100 74 | million elements, and we execute 75 | 76 | ``` r 77 | v <- u 78 | ``` 79 | 80 | Will that operation be slow, due to the need to do all that copy of one 81 | vector to another? No! Since **u** and **v** are referenced by 82 | pointers, executing the above line of code merely means copying one 83 | pointer to another; they both point to the same place. 84 | 85 | But...what if we then execute 86 | 87 | ``` r 88 | v[88] <-3 89 | ``` 90 | 91 | We want **v** to change but **u** to NOT change. Now the interpreter 92 | must do some work. In preparing to make **v** separate from **u**, 93 | the interpreter must (a) find some unused part of memory at which to 94 | create the new vector, (b) copy all of **u** to that space, (c) point 95 | **v**'s pointer to that space, and (d) set the third word in that space 96 | to 3. 97 | 98 | So, here we see two lines of code, the first appearing to be slow but 99 | actually not slow, and the second looking innocuous and fast but 100 | actually slow. Writing fast R code does take some sophistication. 
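You can watch this happen on your own machine with **system.time()**.
Here is a minimal sketch; the vector size is arbitrary and the exact
timings will vary:

``` r
u <- numeric(10^8)        # 100 million zeros, roughly 0.8 GB
system.time(v <- u)       # essentially instantaneous: only a pointer is copied
system.time(v[88] <- 3)   # noticeably slower: R first copies all of u elsewhere
u[88]                     # still 0; u itself is unchanged
```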
101 | 102 | ## Every operation is a function 103 | 104 | This is not directly related to the above material, but worth mentioning 105 | in this context. 106 | 107 | You are probably well familiar with functions, e.g. **sum()** above, but 108 | may think of something like 109 | 110 | ``` r 111 | 3 + 8 112 | ``` 113 | 114 | as a different animal. Actually, the latter is also a function call! 115 | The name of the function is **`+`**! (Note the backticks.) For 116 | example: 117 | 118 | ``` r 119 | > 3 + 8 120 | [1] 11 121 | > `+`(3,8) 122 | [1] 11 123 | ``` 124 | 125 | So, addition is done via the **`+`()** function, with the addends as 126 | arguments. The R interpreter converts that first form to the second. 127 | 128 | Similarly, we have a function **`[`** for vector element access, 129 | a function **`$`** for list element access, and so on. 130 | 131 | -------------------------------------------------------------------------------- /inst/tests/QuickTests.R: -------------------------------------------------------------------------------- 1 | 2 | # built-in data on major league baseball players 3 | data(mlb) 4 | mlb <- mlb[,3:6] # position, height, weight, age 5 | 6 | # note: qeNeural has its own random number stream (in Python), 7 | # and thus will give different numbers for the same R seed, around 13.8 8 | qeCompare(mlb,'Weight', 9 | c('qeLin','qePolyLin','qeKNN','qeRF','qeLASSO','qeNeural'),25) 10 | # qeFtn meanAcc 11 | # 1 qeLin 13.30490 12 | # 2 qePolyLin 13.33584 13 | # 3 qeKNN 13.72708 14 | # 4 qeRF 13.46515 15 | # 5 qeLASSO 13.34612 16 | # 6 qeNeural 13.89695 17 | 18 | qeCompare(mlb,'Position', 19 | c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural','qeSVM','qeGBoost'),25) 20 | # qeFtn meanAcc 21 | # 1 qeLogit 0.6677228 22 | # 2 qePolyLog 0.6843564 23 | # 3 qeKNN 0.6819802 24 | # 4 qeRF 0.6780198 25 | # 5 qeNeural 0.6708911 26 | # 6 qeSVM 0.6542574 27 | # 7 qeGBoost 0.6657426 28 | 29 | 30 | data(peFactors) 31 | pef <- peFactors[,c(1,3,5,7:9)] 32 | qeCompare(pef,'occ',c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural'),25) 33 | # qeFtn meanAcc 34 | # 1 qeLogit 0.61444 35 | # 2 qePolyLog 0.61136 36 | # 3 qeKNN 0.62524 37 | # 4 qeRF 0.61520 38 | # 5 qeNeural 0.61204 39 | # UCI vertebrae dataset, column_3C 40 | # vert <- read.table('~/Datasets/Vertebrae/column_3C.dat',header=FALSE,stringsAsFactors=TRUE) 41 | # qeCompare(vert,'V7',c('qeLogit','qePolyLog','qeKNN','qeRF','qeNeural'),25) 42 | # qeFtn meanAcc 43 | # 1 qeLogit 0.1419355 44 | # 2 qePolyLog 0.1974194 45 | # 3 qeKNN 0.2193548 46 | # 4 qeRF 0.1625806 47 | # 5 qeNeural 0.3393548 48 | 49 | set.seed(9999) 50 | 51 | # fit models 52 | knnout <- qeKNN(mlb,'Weight',k=25) 53 | rfout <- qeRF(mlb,'Weight') 54 | 55 | # mean abs. pred. error on holdout set, in pounds 56 | knnout$testAcc 57 | # [1] 11.75644 58 | rfout$testAcc 59 | # [1] 12.6787 60 | 61 | # predict a new case 62 | newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 63 | predict(knnout,newx) 64 | [,1] 65 | # [1,] 204.04 66 | predict(rfout,newx) 67 | 11 68 | # 199.1714 69 | 70 | set.seed(9999) 71 | # how about some other ML methods? 72 | lassout <- qeLASSO(mlb,'Weight') 73 | lassout$testAcc 74 | # [1] 12.31019 75 | # poly reg, degree 3 76 | polyout <- qePolyLin(mlb,'Weight',3) 77 | polyout$testAcc 78 | # [1] 13.83444 79 | nnout <- qeNeural(mlb,'Weight') 80 | # ... 
81 | nnout$testAcc 82 | # [1] 10.23094 83 | # try some nondefault hyperparams 84 | nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 85 | nnout$testAcc 86 | # [1] 13.40559 87 | 88 | # predict player position, 6 categories 89 | knnout <- qeKNN(mlb,'Position',k=25) 90 | rfout <- qeRF(mlb,'Position') 91 | knnout$testAcc 92 | # [1] 0.7524752 93 | rfout$testAcc 94 | # [1] 0.6138614 95 | table(mlb$Pos) / sum(table(mlb$Pos)) 96 | # Catcher First_Baseman Outfielder Relief_Pitcher 97 | # 0.07487685 0.05418719 0.19113300 0.31034483 98 | # Second_Baseman Shortstop Starting_Pitcher Third_Baseman 99 | # 0.05714286 0.05123153 0.21674877 0.04433498 100 | 101 | # kNN worse than always guessing Relief_Pitcher, RF about the same 102 | z <- qePolyLog(mlb,'Position',holdout=NULL) 103 | predict(z,mlb[8,-1]) 104 | # $predClasses 105 | # [1] "Outfielder" 106 | # $probs 107 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman Shortstop 108 | # [1,] 0.125 0.125 0.125 0.125 0.125 0.125 109 | # Starting_Pitcher Third_Baseman 110 | # [1,] 0.125 0.125 111 | z <- qePolyLog(mlb[,c(1,3)],'Position',holdout=NULL) 112 | predict(z,mlb[8,3]) 113 | 114 | set.seed(9999) 115 | lgout <- qeLogit(mlb,'Position') 116 | lgout$testAcc 117 | # [1] 0.6732673 118 | newx <- data.frame(Height=73.5,Age=26,Weight=200) 119 | predict(lgout,newx) 120 | # $predClasses 121 | # [1] "Relief_Pitcher" 122 | # $probs 123 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman 124 | # [1,] 0.06527784 0.05201025 0.214516 0.3336662 0.03421254 125 | # Shortstop Starting_Pitcher Third_Baseman 126 | # [1,] 0.03345139 0.2252583 0.04160745 127 | 128 | z <- qePolyLog(mlb,'Position',holdout=NULL) 129 | predict(z,mlb[8,-1]) 130 | # $predClasses 131 | # [1] "Outfielder" 132 | # $probs 133 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman Shortstop 134 | # [1,] 0.173676 0.04955253 0.1418191 0.06851684 0.04072947 0.2907195 135 | # Starting_Pitcher Third_Baseman 136 | # [1,] 0.07216886 0.1628177 137 | 138 | # check via qeLogit() 139 | mlb1 <- mlb[,c(1,3)] # Position, Weight only 140 | z <- qePolyLog(mlb1,'Position',holdout=NULL) 141 | predict(z,mlb1[8,-1]) 142 | # $predClasses 143 | # [1] "Relief_Pitcher" 144 | # $probs 145 | # Catcher First_Baseman Outfielder Relief_Pitcher Second_Baseman 146 | # [1,] 0.09858535 0.05010766 0.03951525 0.03506279 0.05183345 147 | # Shortstop Starting_Pitcher Third_Baseman 148 | # [1,] 0.2148096 0.2074912 0.3025947 149 | mlb2 <- mlb1 150 | mlb2$wt2 <- mlb2$Weight^2 151 | z <- qeLogit(mlb2,'Position',holdout=NULL) 152 | predict(z,mlb2[8,-1]) 153 | # same as above 154 | 155 | # what about having only 2 classes? 156 | catcher <- toSubFactor(mlb$Position,'Catcher') 157 | mlb3 <- mlb 158 | mlb3$Position <- catcher 159 | predict(z,mlb3[8,-1]) 160 | # $predClasses 161 | # [1] "zzzOther" 162 | # $probs 163 | # Catcher zzzOther 164 | # [1,] 0.1533529 0.8466471 165 | 166 | # day2 167 | d2 <- day2[,-(13:14)] 168 | z <- pcaQE(0.6,d2,'tot','qeKNN',k=25,holdout=NULL) 169 | newx <- d2[8,-13] 170 | predict(z,newx) 171 | # [,1] 172 | # [1,] 1440.44 173 | 174 | -------------------------------------------------------------------------------- /inst/PoissonReg.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Poisson regression 2 | 3 | One of the most commonly used statistical methods is the *general linear 4 | model*, implemented in R as the **glm()** function. 
The most common 5 | usage of that function is for *logistic regression*, but it's also 6 | popular for *Poisson regression* (PR), the subject of this entry in our 7 | Clearing the Confusion series. PR is often used as a regression model 8 | in which the response variable Y consists of counts, typically in the 9 | one- or two-digit range. 10 | 11 | This is not a tutorial on Poisson regression (PR). It is assumed that 12 | the reader already has some familiarity with the model, and the 13 | treatment here is somewhat critical. There are two main themes: 14 | 15 | * Unlike the linear and logistic cases, there is no theoretical or 16 | modeling justification for PR. 17 | 18 | * PR changes the nature of the β coefficients in major ways that 19 | may not be desirable. 20 | 21 | ## Notation 22 | 23 | Y: the response variable 24 | 25 | X1, X2,...,Xp: the predictor 26 | variables 27 | 28 | X: (X1, X2,...,Xp) 29 | 30 | n: the sample size, i.e. number of data points 31 | 32 | ## The linear and logistic models: theoretical justifications 33 | 34 | It will be helpful to first take a brief look at the theory behind the 35 | assumptions of standard linear and logistic models. 36 | 37 | **linear model:** The classic linear model assumes that: mean Y 38 | given X is a linear combination of the Xi; the conditional 39 | distribution of Y given X is Gaussian; and the conditional variance of Y 40 | given X is constant in X (homoscedasticity). 41 | 42 | Key point: *All the assumptions of this model hold if (X,Y) has a 43 | multivariate normal distribution.* 44 | 45 | In other words, the multivariate normal model implies the classic linear 46 | model. 47 | 48 | **logistic model:** For binary Y, the logistic model is 49 | 50 | P(Y = 1 | X) = 1 / [1 + exp{-( 51 | β0 + 52 | β1 X1 + 53 | ... 54 | βp Xp 55 | )}] 56 | 57 | Key point: *This assumption holds if X | Y = i is multivariate normal 58 | with covariance matrix independent of i.* 59 | 60 | Those assumptions, by the way, are exactly those of Fisher linear 61 | discriminant analysis. In other words, LDA settings imply the logistic 62 | model. 63 | 64 | ----------------- 65 | 66 | Of course, models are always approximations, and the linear and logistic 67 | models are no exception. But multivariate data is indeed often roughly 68 | mound-shaped, i.e. multivariate Gaussian-like, making the above 69 | theoretical models reasonable choices for consideration. 70 | 71 | ## Reasons cited for using "exponential" PR 72 | 73 | When Y has the form of counts, a Poisson model naturally comes to mind. 74 | However, unlike the linear and logistic cases, *there is no theoretical 75 | justification for the standard PR model*, 76 | 77 | mean Y = 78 | exp[β0 + 79 | β1 X1 + 80 | ... 81 | βp Xp] 82 | 83 | Let's call this the Exponential Poisson model (EP). 84 | 85 | Since most parametric regression analyses use linear models, a more 86 | natural model would be the standard linear one, 87 | 88 | mean Y = 89 | β0 + 90 | β1 X1 + 91 | ... 92 | βp Xp 93 | 94 | Let's call this the Linear Poisson model (LP). 95 | 96 | Advocates of EP are uncomfortable with LP. Under the linear model, mean 97 | Y could be negative in some instances, contrary to its being a mean of 98 | counts. Thus they use **exp()** to force the mean to be nonnegative. 99 | 100 | ## A closer look 101 | 102 | A fundamental problem, often overlooked, is this: 103 | 104 | ----------------- 105 | 106 | With use of EP instead of LP, the predictor effects βi 107 | change from **additive** to **multiplicative**. 
108 | 109 | ----------------- 110 | 111 | Say X2 is age. Under LP, 1 extra year of age adds 112 | β2 to mean Y. Under EP, 1 extra year of age 113 | *multiplies* mean Y by exp(β2). 114 | 115 | In some applications, a multiplicative model is appropriate. But users 116 | should be aware of this major difference in models, and thus this major 117 | difference in interpretations of the coefficients. 118 | 119 | It must be noted that factor effects are not additive in logistic models 120 | either. However, the "data is often mound-shaped" argument at least 121 | gives a theoretical basis for considering a logistic model. EP has no 122 | such basis, and if the application at hand does not have a qualitative 123 | reason to assume multiplicativity, EP may not be justified. 124 | 125 | ## Issues with assumptions in LP 126 | 127 | Count data, at least for small mean, is not approximately normal, and in 128 | most cases it is not homoscedastic either. However, neither of these is 129 | a major problem. 130 | 131 | For large n, the Central Limit Theorem (applied in large-sample theory) 132 | shows that non-normality of the distribution of Y given X is not 133 | relevant. For small n, Gaussian linear model theory is not reliable, 134 | since no variable in the real world is normally distributed. One can 135 | and should still do inference, of course, but not take it so literally. 136 | 137 | As to heteroscedastic variance of Y given X, one can use the *sandwich 138 | estimator*. See for instance the **car** and **sandwich** packages in 139 | CRAN. 140 | 141 | ## Recommendations 142 | 143 | One can of course try both models, LP and EP, doing model fit assessment 144 | if the goal is Description or doing cross-validated assessment of 145 | predictive ability if the goal is Prediction. But again, in the 146 | Description case, one must take care in interpeting the coefficients of 147 | the two models. 148 | -------------------------------------------------------------------------------- /man/FineTuning.Rd: -------------------------------------------------------------------------------- 1 | \name{fineTuning,knnFineTune,fineTuningMult} 2 | \alias{fineTuning} 3 | \alias{fineTuningMult} 4 | \alias{knnFineTune} 5 | \alias{fineTuningPar} 6 | \alias{plot.tuner} 7 | 8 | \title{Grid Search Plus More} 9 | 10 | \description{ 11 | Adds various extra features to grid search for specified tuning 12 | parameter/hyperparameter combinations: There is a plot() function, using 13 | parallel coordinates graphs to show trends among the different 14 | combinations; and Bonferroni confidence intervals are computed to avoid 15 | p-hacking. An experimental smoothing facility is also included. 16 | } 17 | 18 | \usage{ 19 | fineTuning(dataset,pars,regCall,nCombs=NULL,specCombs=NULL,nTst=500, 20 | nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE, 21 | showProgress=TRUE,...) 22 | fineTuningMult(dataset,pars,regCall,nCombs=NULL, 23 | nTst=500,nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE, 24 | showProgress=TRUE,outDim=1,...) 25 | \method{plot}{tuner}(x,...) 26 | knnFineTune(data,yName,k,expandVars,ws,classif=FALSE,seed=9999) 27 | fineTuningPar(cls,dataset,pars,regCall,nCombs=NULL,specCombs=NULL, 28 | nTst=500,nXval=1,up=TRUE,k=NULL,dispOrderSmoothed=FALSE) 29 | } 30 | 31 | \arguments{ 32 | \item{...}{Arguments to be passed on by \code{fineTuning} or 33 | \code{plot.tuner}.} 34 | \item{x}{Output object from \code{fineTuning}.} 35 | \item{cls}{A \code{parallel} cluster.} 36 | \item{dataset}{Data frame etc. 
containing the data to be analyzed.} 37 | \item{data}{The data to be analyzed.} 38 | \item{yName}{Quoted name of "Y" in the column names of \code{data}.} 39 | \item{expandVars}{Indices of columns in \code{data} to be weighted in 40 | distance calculations.} 41 | \item{ws}{Weights to be used for \code{expandVars}.} 42 | \item{classif}{Set to TRUE for classification problems.} 43 | \item{seed}{Seed for random number generation.} 44 | \item{pars}{R list, showing the desired tuning parameter values.} 45 | \item{regCall}{Function to be called at each parameter combination, 46 | performing the model fit etc.} 47 | \item{nCombs}{Number of parameter combinations to run. If Null, all 48 | will be run}. 49 | \item{nTst}{Number of data points to be in each holdout set.} 50 | \item{nXval}{Number of holdout sets/folds to be run for a 51 | given data partition and parameter combination.} 52 | \item{k}{Nearest-neighbor smoothing parameter.} 53 | \item{up}{If TRUE, display results in ascending order of performance 54 | value.} 55 | \item{dispOrderSmoothed}{Display in order of smoothed results.} 56 | \item{showProgress}{If TRUE, print each output line as it becomes ready.} 57 | \item{specCombs}{A data frame in which the user specifies 58 | hyperparameter parameter combinations to evaluate.} 59 | \item{outDim}{Number of components in the value returned by \code{theCall}.} 60 | 61 | } 62 | 63 | \details{ 64 | 65 | The user specifies the values for each tuning parameter in 66 | \code{pars}. This leads to a number of possible combinations of the 67 | parameters. In many cases, there are more combinations than the user 68 | wishes to try, so \code{nCombs} of them will be chosen at random. 69 | 70 | For each combination, the function will run the analysis specified by 71 | the user in \code{regCall}. The latter must have the call form 72 | 73 | \code{ftnName(dtrn,dtst,cmbi} 74 | 75 | Again, note that it is \code{fineTuning} that calls this function. It 76 | will provide the training and test sets \code{dtrn} and \code{dtst}, as 77 | well as \code{cmbi} ("combination i"), the particular parameter 78 | combination to be run at this moment. 79 | 80 | Each chosen combination is run in \code{nXval} folds. All specified 81 | combinations are run fully, as opposed to a directional "hill descent" 82 | search that hopes it might eliminate poor combinations early in the process. 83 | 84 | The function \code{knnFineTune} is a wrapper for \code{fineTuning} for 85 | k-NN problems. 86 | 87 | The function \code{plot.tuner} draws a parallel coordinates plot to 88 | visualize the grid. The argument \code{x} is the output of 89 | \code{fineTuning}. Arguments to specify in the ellipsis are: 90 | \code{col} is the column to be plotted; 91 | \code{disp} is the number to display, with \code{0}, \code{-m} and 92 | \code{+m} meaning cases with the \code{m} smallest 'smoothed' values, all 93 | cases and the \code{m} largest values of 'smoothed', respectively; 94 | \code{jit} avoids plotting coincident lines by adding jitter in the 95 | amount \code{jit * range(x) * runif(n,-0.5,0.5)}. 96 | 97 | 98 | } 99 | 100 | \value{ 101 | Object of class **''tuner'**. Contains the grid results, including 102 | upper bounds of approximate one-sided 95% confidence intervals, both 103 | univariate and Bonferroni-Dunn (adjusted for the 104 | number of parameter combinations). 105 | } 106 | 107 | \examples{ 108 | 109 | # mlb data set, predict weight using k-NN, try various values of k 110 | 111 | tc <- function(dtrn,dtst,cmbi,...) 
112 | { 113 | knnout <- kNN(dtrn[,-10],dtrn[,10],dtst[,-10],as.integer(cmbi[1])) 114 | preds <- knnout$regests 115 | mean(abs(preds - dtst[,10])) 116 | } 117 | 118 | data(mlb) 119 | mlb <- mlb[,3:6] 120 | mlb.d <- factorsToDummies(mlb) 121 | fineTuning(mlb.d,list(k=c(5,25)),tc,nTst=100,nXval=2) 122 | 123 | tc <- function(dtrn,dtst,cmbi,...) 124 | { 125 | knnout <- kNN(dtrn[,-10],dtrn[,10],dtst[,-10],as.integer(cmbi[1])) 126 | preds <- knnout$regests 127 | mean(abs(preds - dtst[,10])) 128 | } 129 | 130 | fineTuningMult(mlb.d,list(k=c(5,25)),tc,nTst=100,nXval=2) 131 | 132 | \dontrun{ 133 | library(qeML) 134 | data(svcensus) 135 | tc1 <- function(dtrn,dtst,cmbi,...) 136 | { 137 | knnout <- qeKNN(dtrn,'wageinc',as.integer(cmbi[1]),holdout=NULL) 138 | preds <- predict(knnout,dtst[,-4]) 139 | mape <- mean(abs(preds - dtst[,4])) 140 | bigprobs75 <- mean(preds > 75000) 141 | c(mape,bigprobs75) 142 | } 143 | 144 | fineTuningMult(svcensus,list(k = c(10,25)),tc1,outDim=2) 145 | } 146 | 147 | } 148 | 149 | \author{ 150 | Norm Matloff 151 | } 152 | 153 | -------------------------------------------------------------------------------- /man/factorsDummies.Rd: -------------------------------------------------------------------------------- 1 | \name{factorsToDummies} 2 | \alias{factorToDummies} 3 | \alias{factorsToDummies} 4 | \alias{dummiesToFactor} 5 | \alias{charsToFactors} 6 | \alias{factorTo012etc} 7 | \alias{getDFclasses} 8 | \alias{hasCharacters} 9 | \alias{hasFactors} 10 | \alias{toAllNumeric} 11 | \alias{toSubFactor} 12 | \alias{toSuperFactor} 13 | \alias{toAllNumeric} 14 | \alias{discretize} 15 | \alias{dummiesToInt} 16 | \alias{xyDataframeToMatrix} 17 | 18 | \title{Factor Conversion Utilities} 19 | 20 | \description{ 21 | Utilities from converting back and forth between factors and dummy 22 | variables. 23 | } 24 | 25 | \usage{ 26 | xyDataframeToMatrix(xy) 27 | dummiesToInt(dms,inclLast=FALSE) 28 | factorToDummies(f,fname,omitLast=FALSE,factorInfo=NULL) 29 | factorsToDummies(dfr,omitLast=FALSE,factorsInfo=NULL,dfOut=FALSE) 30 | dummiesToFactor(dms,inclLast=FALSE) 31 | charsToFactors(dtaf) 32 | factorTo012etc(f,earlierLevels = NULL) 33 | discretize(x,endpts) 34 | getDFclasses(dframe) 35 | hasCharacters(dfr) 36 | hasFactors(x) 37 | toAllNumeric(w,factorsInfo=NULL) 38 | toSubFactor(f,saveLevels,lumpedLevel="zzzOther") 39 | toSuperFactor(inFactor,superLevels) 40 | } 41 | 42 | \arguments{ 43 | \item{dfOut}{If TRUE, return a data frame, otherwise a matrix.} 44 | \item{dms}{Matrix or data frame of dummy columns.} 45 | \item{inclLast}{When forming a factor from dummies, include the last 46 | dummy as a level if this is TRUE.} 47 | \item{xy}{A data frame mentioned for prediction, "Y" in last column.} 48 | \item{saveLevels}{In collapsing a factor, which levels to retain.} 49 | \item{lumpedLevel}{Name of new level to be created from levels not retained.} 50 | \item{x}{A numeric vector, except in \code{hasFactors}, where it is a 51 | data frame.} 52 | \item{endpts}{Vector to be used as \code{breaks} in call to 53 | \code{cut}. 
To avoid NAs, range of the vector must cover the 54 | range of the input vector.} 55 | \item{f}{A factor.} 56 | \item{inFactor}{Original factor, to be extended.} 57 | \item{superLevels}{New levels to be added to the original factor.} 58 | \item{earlierLevels}{Previous levels found for this factor.} 59 | \item{fname}{A factor name.} 60 | \item{dfr}{A data frame.} 61 | \item{w}{A data frame.} 62 | \item{dframe}{A data frame, for which we wish to find the column classes.} 63 | \item{omitLast}{If TRUE, then generate only k-1 dummies from k factor 64 | levels.} 65 | \item{factorsInfo}{Attribute from output of \code{factorsToDummies}.} 66 | \item{factorInfo}{Attribute from output of \code{factorToDummies}.} 67 | \item{dtaf}{A data frame.} 68 | } 69 | 70 | \details{ 71 | 72 | Many R users prefer to express categorical data as R factors, or often 73 | work with data that is of this type to begin with. On the other hand, 74 | many regression packages, e.g. \pkg{lars}, disallow factors. These 75 | utilities facilitate conversion from one form to another. 76 | 77 | Here is an overview of the roles of the various functions: 78 | 79 | \itemize{ 80 | 81 | \item \code{factorToDummies}: Convert one factor to dummies, yielding a 82 | matrix of dummies corresponding to that factor. 83 | 84 | \item \code{factorsToDummies}: Convert all factors to dummies, yielding 85 | a matrix of dummies, corresponding to all factors in the input data 86 | frame. 87 | 88 | \item \code{dummiesToFactor}: Convert a set of related dummies to a 89 | factor. 90 | 91 | \item \code{factorTo012etc}: Convert a factor to a numeric code, 92 | starting at 0. 93 | 94 | \item \code{dummiesToInt}: Convert a related set of dummies to a numeric code, 95 | starting at 0. 96 | 97 | \item \code{charsToFactors}: Convert all character columns in a data 98 | frame to factors. 99 | 100 | \item \code{toAllNumeric}: Convert all factors in a data frame to 101 | dummies, yielding a new version of the data frame, including its 102 | original nonfactor columns. 103 | 104 | \item \code{toSubFactor}: Coalesce some levels of a factor, yielding a 105 | new factor. 106 | 107 | \item \code{toSuperFactor}: Add levels to a factor. Typically used in 108 | prediction contexts, in which a factor in a data point to be predicted 109 | does not have all the levels of the same factor in the training set. 110 | 111 | \\item \code{xyDataframeToMatrix}: Given a data frame to be used in 112 | a training set, with "Y" a factor in the last column, change to all 113 | numeric, with dummies in place of all "X" factors and in place of the 114 | "Y" factor. 115 | 116 | } 117 | 118 | The optional argument \code{factorsInfo} is intended for use in prediction 119 | contexts. Typically a set of new cases will not have all levels of the 120 | factor in the training set. Without this argument, only an incomplete 121 | set of dummies would be generated for the set of new cases. 122 | 123 | 124 | A key point about changing factors to dummies is that, for later 125 | prediction after fitting a model in our training set, one needs to use 126 | the same transformations. Say a factor has levels 'abc', 'de' and 'f' 127 | (and \code{omitLast = FALSE}). If we later have a set of say two new 128 | cases to predict, and their values for this factor are 'de' and 'f', we 129 | would generate dummies for them but not for 'abc', incompatible with the 130 | three dummies used in the training set. 
131 | 132 | Thus the factor names and levels are saved in attributes, and can be 133 | used as input: The relations are as follows: 134 | 135 | \itemize{ 136 | 137 | \item \code{factorsToDummies} calls \code{factorToDummies} on each 138 | factor it finds in its input data frame 139 | 140 | \item \code{factorToDummies} outputs and later inputs \code{factorsInfo} 141 | 142 | \item \code{factorsToDummies} outputs and later inputs \code{factorsInfo} 143 | 144 | } 145 | 146 | Other functions: 147 | 148 | \itemize{ 149 | 150 | \item \code{getDFclasses}: Return a vector of the classes of the columns 151 | of a data frame. 152 | 153 | \item \code{discretize}: Partition range of a vector into (not 154 | necessarily equal-length) intervals, and construct a factor from the 155 | labels of the intervals that the input elements fall into. 156 | 157 | \item \code{hasCharacters, hasFactors}: Logical scalars, TRUE if the 158 | input data frame has any character or factor columns. 159 | 160 | } 161 | 162 | 163 | } 164 | 165 | \value{The function \code{factorToDummies} returns a matrix of dummy 166 | variables, while \code{factorsToDummies} returns a new version of the 167 | input data frame, in which each factor is replaced by columns of 168 | dummies. The function \code{factorToDummies} is similar, but changes 169 | character vectors to factors. 170 | } 171 | 172 | \examples{ 173 | x <- factor(c('abc','de','f','de')) 174 | xd <- factorToDummies(x,'x') 175 | xd 176 | # x.abc x.de 177 | # [1,] 1 0 178 | # [2,] 0 1 179 | # [3,] 0 0 180 | # [4,] 0 1 181 | # attr(,"factorInfo") 182 | # attr(,"factorInfo")$fname 183 | # [1] "x" 184 | # 185 | # attr(,"factorInfo")$omitLast 186 | # [1] TRUE 187 | # 188 | # attr(,"factorInfo")$fullLvls 189 | # [1] "abc" "de" "f" 190 | w <- factor(c('de','abc','abc')) 191 | wd <- factorToDummies(w,'x',factorInfo=attr(xd,'factorInfo')) 192 | wd 193 | # x.abc x.de 194 | # [1,] 0 1 195 | # [2,] 1 0 196 | # [3,] 1 0 197 | # attr(,"factorInfo") 198 | # attr(,"factorInfo")$fname 199 | # [1] "x" 200 | # 201 | # attr(,"factorInfo")$omitLast 202 | # [1] TRUE 203 | # 204 | # attr(,"factorInfo")$fullLvls 205 | # [1] "abc" "de" "f" 206 | 207 | } 208 | 209 | \author{ 210 | Norm Matloff 211 | } 212 | 213 | -------------------------------------------------------------------------------- /inst/DstrFit.md: -------------------------------------------------------------------------------- 1 | 2 | # Fitting Continuous Parametric Distriibutions 3 | 4 | Say one has some data and wishes to find a parametric distribution family. 5 | We address two questions here: 6 | 7 | * Why is this desirable? 8 | 9 | * How might it be done? 10 | 11 | ## Why fit a parametric distribution? 12 | 13 | Many statistical and probability models assume there are some parametric 14 | distributions driving the various variables. Even in more basic 15 | settings, having a parametric model for our data allows us to compactly 16 | describe our data, with a small number of parameters sufficing. 17 | 18 | ## How can the fitting be done? 19 | 20 | In statistics, we treat our data as a sample from a parent population, 21 | in which the variable of interest, say X, has unknown density 22 | fX(t). We estimate that density from our data. 
23 | 24 | ### Running example: Pima diabetes data 25 | 26 | This is a widely-used dataset, available for instance from the **mlbench** 27 | package: 28 | 29 | ``` r 30 | > library(mlbench) 31 | > data(PimaIndiansDiabetes2) 32 | > pima <- PimaIndiansDiabetes2 # shorter to type 33 | > head(pima) # always look at your data 34 | pregnant glucose pressure triceps insulin mass pedigree age diabetes 35 | 1 6 148 72 35 NA 33.6 0.627 50 pos 36 | 2 1 85 66 29 NA 26.6 0.351 31 neg 37 | 3 8 183 64 NA NA 23.3 0.672 32 pos 38 | 4 1 89 66 23 94 28.1 0.167 21 neg 39 | 5 0 137 40 35 168 43.1 2.288 33 pos 40 | 6 5 116 74 NA NA 25.6 0.201 30 neg 41 | > bmi <- pima$mass 42 | > bmi <- na.exclude(bmi) # exclude any row with NAs 43 | ``` 44 | ### Visual inspection 45 | 46 | So, let's plot the data. We'll use R's basic histogram function, 47 | **hist()**. A more advanced alternative is **density()**, which plots a 48 | smooth curve. In calling **hist(bmi,freq=FALSE)** (that second 49 | argument means we want area = 1.0), we produce this: 50 | 51 | ![alt text](BMIhist.png) 52 | 53 | Remember, this is a sample estimate of fX, so it can be used 54 | for deciding whether to use a given parametric model. 55 | 56 | We see a rather steep rise from 0, quickly reaching a peak, then a 57 | gradual tapering off toward 0. This suggests that the gamma 58 | distribution family may work well. 59 | 60 | There is of course the question as to how *well* this data is fit by the 61 | gamma family. We will return to this later. 62 | 63 | ### Estimating parameter values 64 | 65 | Once we decide to use a certain parametric family, we must decide which 66 | *member* of that family to use. In other words, what parameter values? 67 | 68 | The two classic ways to estimate the parameters are Maximum Likelihood 69 | Estimation (MLE) and the Method of Moments (MM). I'm partial to MM, 70 | and will use that here. 71 | 72 | The population kth moment of X is defined to be 73 | E(Xk). It can be estimated by its sample analog 74 | 75 | Mk = 76 | (1/n) Σi=1n Xik 77 | 78 | where our data are X1,...,Xn. (The two quantities 79 | are analogous because E(Xk) is the average of Xk 80 | in the population, while Mk is the average of Xk 81 | in the sample.) So M1 is simply our sample mean, "X-bar." 82 | 83 | One may also use *central* moments, e.g. Var(X) and s2. 84 | Note that s2 = M2 - (M1)2. 85 | 86 | The idea of MM is to set the population moments equal to their sample 87 | analogs. Since the former are functions of the parameters, we can solve 88 | for the parameters, which serve actually as our estimated parameters. 89 | 90 | If we have q parameters, we form equations corresponding to the first q 91 | moments. For the gamma family with paraemeters r and λ, we use 92 | the first two moments. For convenience, we'll use variance 93 | rather than E(X2). 94 | 95 | For the gamma family, EX = r/λ and Var(X) = r/λ2. 96 | So our equations are 97 | 98 | M1 = rest / λest 99 | 100 | s2 = rest / λest2 101 | 102 | Luckily, these are easy to solve. We divide the first by the second, yielding 103 | 104 | λest = M1 / s2 105 | 106 | and 107 | 108 | rest = M12 / s2 109 | 110 | Let's superimpose the fitted gamma density onto the histogram: 111 | 112 | ``` r 113 | > curve(dgamma(x,rest,lambest),0,70,add=TRUE) 114 | ``` 115 | 116 | (The function **curve()** plots a function **x**, which in this case has 117 | range (070); **add=TRUE** means superimpose this new graph onto the old 118 | one.) 
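(For completeness, here is a minimal sketch of how **rest** and
**lambest** in the call above might be computed from **bmi**, using the
Method of Moments formulas just derived; **var()** supplies the sample
variance.)

``` r
# Method of Moments estimates of the gamma parameters
m1 <- mean(bmi)       # sample first moment M1
s2 <- var(bmi)        # sample variance
lambest <- m1 / s2    # lambda estimate = M1 / s^2
rest <- m1^2 / s2     # r estimate = M1^2 / s^2
```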
119 | 120 | ![alt text](BMIfitted.png) 121 | 122 | Our fitted parametric density estimate is rather close to the one that 123 | is model-free, so the parametric model seems pretty good. 124 | 125 | Now let's try comparing CDFs. Just as a histogram is 126 | a model-free estimate of fX, a model-free estimate of 127 | FX is the *empirical CDF*: 128 | 129 | Fest(t) = proportion of Xi that are ≤ t. 130 | 131 | The R function **ecdf** computes this and sets it up for plotting. 132 | (Actually, **ecdf** returns a function of class **'ecdf'**, so calling 133 | **plot()** on the return value invokes **plot.ecdf()**.) Let's go ahead: 134 | 135 | ``` r 136 | > plot(ecdf(bmi),pch=19,cex=0.1) 137 | > curve(pgamma(x,ch,lh),0,70,add=TRUE) # range of data is (0,70) 138 | ``` 139 | 140 | ![alt text](BMIfitwell.png) 141 | 142 | The ECDF and the fitted CDF are almost identical, wonderful. We were 143 | lucky here; in most real applications, we do not achieve such a close 144 | fit, even though the fit is usable. 145 | 146 | **Related software:** 147 | 148 | The **regtools** package includes a function **mm()** for computing 149 | Method of Moments estimators, and base R has the **mle()** function for 150 | MLE. 151 | 152 | ### Assessing fit 153 | 154 | Well then, how well did our gamma model fit? 155 | 156 | First, note that there are several possible sources of discrepancies between 157 | the histogram and the fitted gamma: 158 | 159 | * The true fX is not gamma. No model is ever perfectly 160 | right. Keep in mind the famous George Box quote, "All models are 161 | false, but some are useful." 162 | 163 | * We have just a sample of values from the population, so even if our 164 | gamma model were exactly correct for the true fX, our 165 | sample estimate would not form a perfect match. The larger the sample, 166 | the more likely we will have a close fit, but never exact. 167 | 168 | * Again due to our finite sample, we have nonzero widths for the 169 | histogram intervals. 170 | 171 | So, assessing fit is a matter of attempting to decide how much of the 172 | discrepancy is due to having a finite sample, and how much is due to 173 | model error. 174 | 175 | There are formal methods of assessment, known as *goodness of fit 176 | tests*, but these days hypothesis testing and p-values are frowned upon 177 | for any type of statistical situation, for good reason. Testing is 178 | especially irrelevant in assessing model fit. They don't tell us 179 | whether our fit is "good enough" for our purposes. 180 | 181 | If you feel you must do formal assessment, I suggest forming a 182 | *Kolmogorov-Smirnov confidence band*. We will not pursue that here. 183 | 184 | -------------------------------------------------------------------------------- /inst/ChoosingKinKFoldCV.md: -------------------------------------------------------------------------------- 1 | # Clearing the Confusion: Choosing K in K-Fold Cross Validation 2 | 3 | **N. Matloff, UC Davis** 4 | 5 | In model selection, a key goal is the avoidance of overfitting. 6 | Typically that is done by running the various models on a *training* 7 | dataset, then validating them on *test* data. The two datasets are 8 | usually obtaining by one of various kinds of partitioning of one's 9 | original data. This process is known as *cross-validation* (CV). 10 | 11 | The most commonly used form of this is *K-fold cross-validation*. Here 12 | K is the number of partitioned subsets. Note that K must be chosen by 13 | the user. 
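For readers who prefer code to prose, here is a minimal base-R sketch of
the procedure, anticipating the fuller description in the section "What
is K-fold CV?" below. The names **fitFn** and **predFn** are
placeholders for whatever fitting and prediction routines are being
compared, and mean absolute prediction error is just one possible
accuracy measure:

``` r
# minimal sketch of K-fold cross-validation for one candidate model;
# fitFn(trainData) fits the model, predFn(fit, testData) predicts,
# and the response is assumed to be in a column named 'y'
kFoldCV <- function(data, K, fitFn, predFn) {
   folds <- sample(rep(1:K, length.out = nrow(data)))  # random fold labels
   errs <- numeric(K)
   for (k in 1:K) {
      trn <- data[folds != k, , drop = FALSE]
      tst <- data[folds == k, , drop = FALSE]
      fit <- fitFn(trn)
      preds <- predFn(fit, tst)
      errs[k] <- mean(abs(preds - tst$y))   # mean absolute prediction error
   }
   mean(errs)   # average error over the K folds
}
# e.g. kFoldCV(myData, 5, function(d) lm(y ~ ., d), predict)
```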
14 | 15 | We also note in passing that in machine learning circles, it's common to 16 | partition one's data into *three* sets: One chooses one's model by 17 | fitting several to the training set, often meaning choosing values of 18 | several tuning parameters, finally selecting the one with the best 19 | performance on the test set. The third set, the *validation* set, is 20 | then used to get a realistic evaluation of the performance of the final 21 | model, as even that test set stage will induce optimistic bias. 22 | 23 | ## Goals of this document 24 | 25 | 1. Explain the real motivations behind K-fold CV, and implications for 26 | choosing K. 27 | 28 | 2. Explain the severe limitations of the related theoretical work. 29 | 30 | 3. There is no really good solution, but we will make suggestions for 31 | practical alternatives to CV. 32 | 33 | ## Notation 34 | 35 | Say we are using a vector of predictor variables/features X to predict 36 | Y. The latter may be continuous, categorical or whatever. Let n denote 37 | our number of data points, and let p be the number of predictors. 38 | 39 | If say we are predicting human weight from height and age, with a 40 | dataset consisting of 1000 people, then p = 2 and n = 1000. 41 | 42 | ## What is K-fold CV? 43 | 44 | Let n denote the number of data points. In the simplest form, we 45 | randomly split our data into two groups of n/2 points each. We fit each 46 | of the candidate models to Group 1. We then temporarily pretend 47 | we don't know the Y values in Group 2, and do the following: 48 | 49 | For each model, we take the fitted model from Group 1, and apply it on 50 | the X values in Group 2 to predict the Y values in that group. Since 51 | we know the actual Y values, we can then compare how the various models 52 | fared. We choose the one that did the best. 53 | 54 | Or we could split the data into three groups of n/3 data 55 | points each, say Groups 1, 2 and 3. For each model, we would fit that 56 | model on the 2n/3 data in Groups 1 and 2, and predict Group 3. We would 57 | next fit to Groups 1 and 3, predicting Group 2, then finally predict 58 | Group 1 from Grups 2 and 3. We would then select the model that had the 59 | best overall accuracy in this process. 60 | 61 | K-fold CV refers to this approach, in which we partition the data into K 62 | groups. In the descriptions above, we first explained K = 2, then K = 3. An 63 | important special case is K = n, sometimes called the "leaving one out" method. 64 | 65 | ## How we might choose K: a first try 66 | 67 | Why burden the user with choosing K? She may already have many other 68 | tuning paramers to worry about, e.g. node size in random forests. Here 69 | is the tradeoff: 70 | 71 | 1. Consider K = 2, perhaps the most "natural" choice of K. 72 | A problem is that our results won't be very representative. After 73 | all, in the end, in order to use our final model in predicting future cases, 74 | we will want to use all n points in our dataset. But 2-fold CV will 75 | only find the best model for n/2 amount of data. In view of the fact 76 | that, the larger our dataset, the more detailed a model we can form, K=2 77 | is not really providing what we need. 78 | 79 | 2. Now consider the other extreme, K = n. During CV, we will be 80 | fitting models to data subsets of size n-1. But since the best model for 81 | n-1 amount of data will essentially be the same as for n amount 82 | (though possibly not the case if p is large), this choice of K seems best. But 83 | there are two problems. 
84 | 85 | - Now we are facing a seriously large amount of computation -- for each 86 | of our candidate models, we do the computing n times, once for each 87 | fold. 88 | 89 | - Some theoretical work has indicated that this will not work for large 90 | n anyway, i.e. the probability of choosing the best model will 91 | actually decrease as n grows. 92 | 93 | Thus a "good" value of K would seem to be somewhere between 2 and n. 94 | Well, then, where? 95 | 96 | ## Role of the theory 97 | 98 | There has been much theoretical math done in answering the question of 99 | how one should choose K, beginning with Shao in 1993. A nice, updated 100 | account is in the book by Clark, Fokoue and Zhang (2009). See also the 101 | recent paper by Lei (2019). 102 | 103 | There has has been theoretical work aimed at deciding how large 104 | p can be, relative to n, for statistical estimators to have desired 105 | properties. The major work on this issue continues to be that of 106 | Portnoy (1988); see for instance Anirban DasGupta (2008). 107 | 108 | Though such work is impressive math, it is of rather little value in 109 | practice. Here's why: 110 | 111 | - The mathematical conditions assumed by the theorems are impossible to 112 | verify in practice, and indeed rather meaningless. 113 | 114 | - Even more important, consider a linear regression setting, so we have 115 | p+1 parameters. The theory typically assumes that most of the true 116 | regression coefficients are 0, with model selection amounting to 117 | determining which are non-0. This is not how things work in the real 118 | world. First, some true coefficients may be non-0 but small. Second, 119 | it is not necesarily true that we should use all variables with non-0 120 | coefficients, even if we knew which ones they are; there still may be 121 | too many of them to avoid overfitting. variables may result in 122 | overfitting. We thus may need to eliminate some, even with non-0 123 | values, a very different setting than what is covered by the theory. 124 | 125 | ## So, what CAN be done? 126 | 127 | Unfortunately, **there is no magic solution here**. But a reasonable 128 | approach is to limit model complexity (measured by p) in the first place. 129 | 130 | Here is the central issue: We do CV because of the optimistic bias that 131 | occurs when we assess a model by fitting and predicting on the same 132 | dataset. This is the motivation for partitioning. But if p << n, the 133 | amount of bias is negligible. In such situations, there is no need for 134 | CV. 135 | 136 | A good rule of thumb is to keep p < sqrt(n). This too is based on 137 | theory, but at least with rather minimal assumptions. Portnoy called 138 | this a "safe" strategy, and indeed I've found it to be conservative in 139 | practice, but one may consider that a virtue here. 140 | 141 | So, a reasonable approach to the CV question would be to keep p < 142 | sqrt(n), obviating the need for CV in the first place. We then would 143 | choose the richest model, i.e. the one that consists of all p candidate 144 | predictors. 145 | 146 | But the old saying, "Easier said than done," does apply. If our number 147 | of candidate features is a sizable portion of n, or even larger than n, 148 | we still must do some kind of preliminary dimension reduction to attain 149 | p < sqrt(n) before we begin model fitting. Here are a few possible 150 | approaches: 151 | 152 | - Apply PCA to the original candidate features, then use the first 153 | sqrt(n) principal components. 
154 | 155 | - Do some kind of forward selection in linear regression, analysis, 156 | limiting the number of steps to sqrt(n). 157 | 158 | So for example in the PCA approach, say we have 100 candidate 159 | predictors. We would run PCA, then fit our model to the first 10 160 | components, and that would be our final model. 161 | 162 | Again, none of these is a fully satisfying solution. For instance, PCA 163 | has its own problems if p >> n, and anyway it is always possible that a 164 | small PC can have a large regression coefficient (Zumel, 2016). But 165 | they are reasonable solutions worth trying in what remains to be one of 166 | the top knotty problems in statistics. 167 | 168 | ## References 169 | 170 | Bertrand Clarke, Ernest Fokoue, Hao Helen Zhang. *Principles and Theory 171 | for Data Mining and Machine Learning*, Springer, 2009. 172 | 173 | Anirban DasGupta. *Asymptotic Theory of Statistics and Probability*, 174 | Springer, 2008 175 | 176 | Jing Lei. Cross-Validation with Confidence, arXiv:1703.07904, 2017 177 | 178 | Stephen Portnoy. Asymptotic Behavior of Likelihood Methods for 179 | Exponential Families when the Number of Parameters Tends to Infinity, 180 | *Ann. Statist.*, Volume 16, Number 1 (1988), 356-366. 181 | 182 | Jun Shao. Linear Model Selection by Cross-Validation. *Journal of the 183 | American Statistical Association*, 88(422):486–494, 1993 184 | 185 | Nina Zumel blog, https://ninazumel.com/tag/principal-components-analysis/, 186 | 2016 187 | 188 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | author: "Norm Matloff" 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | output: rmarkdown::html_vignette 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regression and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 
49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 
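Returning to the Eickert-White correction mentioned above, here is a
hedged sketch for the linear case, using the car and lmtest packages;
the data frame pe, the object lmfit and the variable names are
placeholders standing in for the wrangled data described earlier.

<pre>
> # Eickert-White (sandwich) standard errors for the linear model
> lmfit <- lm(wageinc ~ age + I(age^2) + wkswrkd + I(wkswrkd^2) + ms + phd, data=pe)
> vc <- car::hccm(lmfit)               # heteroscedasticity-consistent covariance matrix
> lmtest::coeftest(lmfit, vcov. = vc)  # inference using the corrected standard errors
</pre>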
123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE: OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 10 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 10 estimated 148 | probabilities. Let i be the digit whose model yields the largest 149 | probability; we would then guess the digit in the image to be i. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that the classes might be linearly separable (in the statistical 156 | sense) in pairs but not otherwise. My book counters by positing that such 157 | a situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the number of points in each class is the same, 167 | or at least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 |
178 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 
179 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 
180 | > mean(ypred == lrtest[,1]) 
181 | [1] 0.7193333 
182 | 
183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 |
188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)])
189 | > ypred <- avalogpred(26,alogout,lrtest[,-1])
190 | > mean(ypred == lrtest[,1])
191 | [1] 0.8355
192 | 
193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it's better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 |
205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)])
206 | > ypred <- ovalogpred(ologout,lrtest[,-1])
207 | > mean(ypred == lrtest[,1])
208 | [1] 0.8086667
209 | 
210 | 211 | Ah, much better, though still not quite as good as AVA. 212 | 213 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd.save: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | author: "Norm Matloff" 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | output: rmarkdown::html_vignette 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regression and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 
78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE: OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 26 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 26 estimated 148 | probabilities. 
Let imax be the class that yields the largest 149 | probability; we would then guess the digit for the image to be imax. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that the classes might be linearly separable (in the statistical sense) in 156 | pairs but not otherwise. My book counters by positing that such a 157 | situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the numbers of points in each class are the same, 167 | or at least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 |
178 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 
179 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 
180 | > mean(ypred == lrtest[,1]) 
181 | [1] 0.7193333 
182 | 
183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 |
188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)])
189 | > ypred <- avalogpred(26,alogout,lrtest[,-1])
190 | > mean(ypred == lrtest[,1])
191 | [1] 0.8355
192 | 
193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it's better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 |
205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)])
206 | > ypred <- ovalogpred(ologout,lrtest[,-1])
207 | > mean(ypred == lrtest[,1])
208 | [1] 0.8086667
209 | 
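The vignette notes that the construction of the squared-term columns is "not shown." A minimal hypothetical sketch of that step, assuming (from the calls above) that the class label sits in column 1 and the 16 numeric predictors in columns 2:17 of both lrtrn and lrtest; the author's actual wrangling code may differ:

```
# Hypothetical data prep (not the vignette's actual code): append squared
# terms so that the label stays in column 1 and columns 2:33 hold the 16
# original predictors followed by their 16 squares.
addSquares <- function(xy) {
  x  <- xy[, 2:17]                      # the 16 numeric predictors
  x2 <- x^2                             # squared terms
  colnames(x2) <- paste0(colnames(x), '.sq')
  cbind(xy[, 1, drop = FALSE], x, x2)   # label, then 32 predictor columns
}
lrtrn  <- addSquares(lrtrn)
lrtest <- addSquares(lrtest)
```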
210 | 211 | Ah, much better, though still not quite as good as AVA. 212 | 213 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rmd~: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "regtools" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{regtools} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | \usepackage[utf8]{inputenc} 9 | --- 10 | 11 | # regtools 12 | 13 | ## Novel tools tools for linear, nonlinear and nonparametric regression. 14 | 15 | These tools are associated with my forthcoming book, From Linear 16 | Models to Machine Learning: Modern Statistical Regresison and 17 | Classification, N. Matloff, CRC, 2017. 18 | 19 | The tools are 20 | useful in general, independently of the book. 21 | 22 | ## FEATURES: 23 | 24 | * Nonparametric regression for general dimensions in predictor and 25 | response variables, using k-Nearest Neighbors. Local-linear option. 26 | Allows for user-specified smoothing method. Allows for accelerated 27 | exploration of multiple values of k at once. Tool to aid in 28 | choosing k. 29 | 30 | * Innovative tools for assessing fit in linear and nonlinear parametric 31 | models, via nonparametric methods. Model evaluation, examination of 32 | quadratic effects, investigation of nonhomogeneity of variance. 33 | 34 | * Tools for multiclass classification, parametric and nonparametric. 35 | One vs. All and All vs. All. Novel adjustment for artificially 36 | balanced data. 37 | 38 | * Linear regression, PCA and log-linear model estimation in missing-data 39 | setting, via the Available Cases method. 40 | 41 | * Nicer implementation of ridge regression, with more meaningful scaling 42 | and better plotting. 43 | 44 | * Extension to nonlinear parametric regression with of Eickert-White 45 | technique to handle heteroscedasticity. 46 | 47 | * Misc. tools, e.g. Method of Moments estimation (including for 48 | nonregression settings). 49 | 50 | ## EXAMPLE: MODEL FIT ASSESSMENT 51 | 52 | Let's take a look at the data set prgeng, some Census data for 53 | California engineers and programmers in the year 2000. The response 54 | variable in this example is wage income, and the predictors are age, 55 | number of weeks worked, and dummy variables for MS and PhD degrees. 56 | (Some data wrangling was performed first; type ?knnest for the 57 | details.) 58 | 59 | The fit assessment techniques in regtools gauge the fit of 60 | parametric models by comparing to nonparametric ones. Since the latter 61 | are free of model bias, they are very useful in assessing the parametric 62 | models. 63 | 64 | The function nonparvsxplot() plots the nonparametric fits against 65 | each predictor variable, for instance to explore nonlinear effects. 66 | Here is the plot for wage versus (scaled) age: 67 | 68 | 69 | 70 | Of course, the effects of the other predictors don't show up here, but 71 | there does seem to be a quadratic effect. The same was true for the 72 | predictor measuring the number of weeks worked (slightly concave up, not 73 | shown here). In our linear parametric model, then, we will include 74 | squared terms for these two predictors. 75 | 76 | So, after fitting the linear model, run parvsnonparplot(), which 77 | plots the fit of the parametric model against the nonparametric one. 
78 | Here is the result: 79 | 80 | 81 | 82 | There is quite a bit suggested in this picture: 83 | 84 | * There seems to be some overfitting near the low end, and underfitting at 85 | the high end. 86 | 87 | * The outliers, meaning points far from the fitted linear model, are 88 | almost all below the linear fit. 89 | 90 | * There are intriguing "sreaks" or "tails" of points, suggesting the 91 | possible existence of important subpopulations. 92 | 93 | * There appear to be a number of people with 0 wage income. Depending on 94 | the goals of our analysis, we might consider removing them. 95 | 96 | Finally, let's check the classical assumption of homoscedasticity, 97 | meaning that the conditional variance of Y given X is constant. The 98 | function nonparvarplot() plots the estimated conditional variance 99 | against the estimated conditional mean, both computed nonparametrically:: 100 | 101 | 102 | 103 | Wow, a hockey stick! Though there is a mild rise in coefficient of 104 | determination, i.e. standard deviation relative to the mean, up to 105 | about $80K, the slope increases sharply after that. 106 | 107 | What to do? As long as our linear regression model assumption holds, 108 | violation of the homoscedasticity assumption won't invalidate our 109 | estimates; they still will be statistically consistent. But the 110 | standard errors we compute, and thus the statistical inference we 111 | perform, will be affected. This is correctible using the Eickert-White 112 | procedure, which for linear models is available in the car 113 | package, included in regtools. Our package also extends 114 | this to nonlinear parametric models, in our function nlshc() (the 115 | validity of this extension is shown in the book). 116 | 117 | Of course, the "hockey stick" form is another indication that we should 118 | further investigate the model itself. It may well be useful to fit two 119 | separate linear models, one for incomes below $80K and the other for the 120 | higher incomes. For a more formal approach to this, we might consider 121 | changepoint methods, such as in the CRAN package 122 | chngpt. 123 | 124 | What is different: 125 | 126 | Note carefully that the above graph is unaffected by the validity of 127 | the parametric model; it is based purely on nonparametric analysis. 128 | This is in contrast to classic regression fit methods, most of which are 129 | based on examination of residuals of the fitted model. 130 | 131 | ## EXAMPLE; OVA VS. AVA IN MULTICLASS PROBLEMS 132 | 133 | A very popular prediction method in 2-class problems is to use logistic 134 | (logit) regression. In analyzing click-through patterns of Web users, 135 | for instance, we have 2 classes, Click and Nonclick. We might fit a 136 | logistic model for Click, given user Web history, demographics and so 137 | on. Note that logit actually models probabilities, e.g. the probability 138 | of Click given the predictor variables. 139 | 140 | But the situation is much less simple in multiclass settings. Suppose 141 | our application is recognition of hand-written digits (a famous machine 142 | learning example). The predictor variables are pixel patterns in images. 143 | There are two schools of thought on this: 144 | 145 | * One vs. All (OVA): We would run 26 logistic regression models, 146 | one for predicting '0' vs. non-'0', one for '1' vs. non-'1', and so 147 | on. For a particular image, we would thus obtain 26 estimated 148 | probabilities. 
Let imax be the image that yields the largest 149 | probability; we would then guess the digit for the image to be 'i'. 150 | 151 | * All vs. All (AVA): Here we would run C(10,2) = 45 logit 152 | analyses, one for each pair of digits. There would be one for '0' vs. 153 | '1', one for '0' vs. '2', etc., all the way up through '8' vs. '9'. 154 | Many in the machine learning literature recommend AVA over OVA, on the 155 | grounds that might be linearly separable (in the statistical sense) in 156 | pairs but not otherwise. My book counters by positing that such a 157 | situation could be remedied under OVA by adding quadratic terms to the 158 | logit models. 159 | 160 | At any rate, the regtools package gives you a choice, 161 | OVA or AVA, for both parametric and nonparametric methods. For example, 162 | avalogtrn() and avalogpred() do 163 | training and prediction operations for logit with AVA. 164 | 165 | Another feature concerns adjustment of class probabilities. In many 166 | multiclass data sets, the numbers of points in each class is the same, 167 | or least not reflective of the population class probabilities. In 168 | regtools, the user can specify estimates of the latter, 169 | for logit and nonparametric methods. 170 | 171 | So, let's look at an example, using the UCI Letter Recognition data set, 172 | another image recognition example. Again, the code below was preceded 173 | by some data wrangling, which changed the letter data from character to 174 | numeric, and which divided the data set into training and test sets. 175 | Here is the OVA run: 176 | 177 | ```{r} 178 | &>; ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 179 | &>; ypred <- ovalogpred(ologout,lrtest[,-1]) 180 | &>; mean(ypred == lrtest[,1]) 181 | [1] 0.7193333 182 | ``` 183 | 184 | So, we get about a 72% rate of correct classification. Now let's try 185 | AVA: 186 | 187 | ```{r} 188 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 189 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 190 | > mean(ypred == lrtest[,1]) 191 | [1] 0.8355 192 | ``` 193 | 194 | AVA did considerably better, 84%. So, apparently AVA fixed a poor 195 | model. But of course, it’s better to make a good model in the first 196 | place. Based on our previous observation that the boundaries may be 197 | better approximated by curves than lines, let's try a quadratic model. 198 | 199 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 200 | = 120 possible interaction terms. Inclusion of all such variables would 201 | probably produce too rich a model for the 14000 points in our training 202 | set. We'll settle for adding just the squared terms (not shown): 203 | 204 | ```{r} 205 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 206 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 207 | > mean(ypred == lrtest[,1]) 208 | [1] 0.8086667 209 | ``` 210 | 211 | Ah, much better, though still not quite as good as AVA. 
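The OVA scheme described above, one logit model per class with prediction via the largest estimated probability, can be sketched in a few lines of base R. This is only an illustration of the idea, not the internals of ovalogtrn()/ovalogpred(), and it uses the built-in iris data purely as a stand-in:

```
# One-vs-All logistic regression, illustrative sketch only.
ovaFit <- function(x, y) {
  # one glm per class: class cl vs. all other classes
  # (perfect-separation warnings from glm are possible and harmless here)
  lapply(levels(y), function(cl)
    glm((y == cl) ~ ., data = x, family = binomial))
}
ovaPredict <- function(fits, newx, classes) {
  probs <- sapply(fits, predict, newdata = newx, type = 'response')
  classes[max.col(probs)]   # class with the largest estimated probability
}
x <- iris[, 1:4]; y <- iris$Species
fits <- ovaFit(x, y)
mean(ovaPredict(fits, x, levels(y)) == y)   # training-set accuracy
```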
212 | 213 | -------------------------------------------------------------------------------- /R/AC.R: -------------------------------------------------------------------------------- 1 | 2 | # Missing Values routines; also see polyanNA package 3 | 4 | ###################################################################### 5 | ###################################################################### 6 | 7 | # code to implement the Available Cases method (also called Pairwise 8 | # Complete) for handling missing data 9 | 10 | ######################## linear regression ########################## 11 | 12 | # arguments: 13 | 14 | # xy: data, with predictors in the first columns and the 15 | # response variable in the last column 16 | # nboot: if nonzero, this requests bootstrapped computation of the 17 | # estimated covariance matrix of the estimated vector of 18 | # regression coefficients 19 | 20 | # value: an object of class 'lmac', with components 21 | # 22 | # coefficients: estimated regression coefficients 23 | # fitted.values: est. regression ftn. values at 24 | # complete cases (but with full coefs.) 25 | # residuals: residuals at complete cases (but with full coefs.) 26 | # r2: R-squared 27 | # cov: optional est. covariance matrix of the coefs. 28 | 29 | lmac <- function(xy,nboot=0) { 30 | p1 <- ncol(xy) 31 | p <- p1 - 1 32 | tmp <- cov(xy,use='pairwise.complete.obs') 33 | upu <- tmp[1:p,1:p] 34 | upv <- tmp[1:p,p+1] 35 | bhat <- solve(upu,upv) 36 | lmacout <- list() 37 | class(lmacout) <- 'lmac' 38 | # bhat0 <- mean(y) - colMeans(x) %*% bhat 39 | bhat0 <- colMeans(xy,na.rm=TRUE) %*% c(-bhat,1) 40 | bhat <- c(bhat0,bhat) 41 | lmacout$coefficients <- bhat 42 | xycc <- na.omit(xy) 43 | yhat <- cbind(1,xycc[,-p1]) %*% bhat 44 | lmacout$fitted.values <- yhat 45 | lmacout$residuals <- xycc[,p1] - yhat 46 | lmacout$r2 <- (cor(yhat,xycc[,p1]))^2 47 | if (nboot > 0) { 48 | n <- nrow(xy) 49 | bootonce <- function() { 50 | idxs <- sample(1:n,n,replace=TRUE) 51 | lmac(xy[idxs,],nboot=0)$coefficients 52 | } 53 | bootout <- replicate(nboot,bootonce()) 54 | lmacout$cov<- cov(t(bootout)) 55 | } 56 | lmacout 57 | } 58 | 59 | coef.lmac <- function(object,...) { 60 | object$coefficients 61 | } 62 | 63 | vcov.lmac <- function(object,...) 
{ 64 | object$cov 65 | } 66 | 67 | ############################# PCA ############################### 68 | 69 | # arguments: 70 | # 71 | # indata: data frame or matrix 72 | # 73 | # value: list with components 'values' and 'vectors', as with eigen() 74 | 75 | pcac <- function(indata,scale=FALSE) { 76 | covcor <- if(scale) cor else cov 77 | cvr <- covcor(indata,use='pairwise.complete.obs') 78 | tmp <- eigen(cvr) 79 | res <- list() 80 | if (any(tmp$values < 0)) 81 | stop('at least one negative eigenvalue') 82 | res$sdev <- sqrt(tmp$values) 83 | res$rotation <- tmp$vectors 84 | res 85 | } 86 | 87 | ###################### log-linear model` ########################## 88 | 89 | # log-linear model; at present, handles only the 3-factor casea 90 | # 91 | # arguments: 92 | # 93 | # x: data frame/matrix, one row per observation; use tbltofakedf() 94 | # if data is in table form 95 | # margin: a list of vectors specifying the model, 96 | # as in loglin() 97 | # 98 | # value: $param and $fit components in the value emitted from R's loglin() 99 | 100 | loglinac <- function(x,margin) { 101 | # find lengths of the elements in the model, to determine what 102 | # situtation we are in 103 | termlengths <- Map(length,margin) 104 | n1 <- sum(termlengths == 1) # singletons 105 | n2 <- sum(termlengths == 2) # 2-way interactions 106 | # mdlf() ("model function") will find the right cell means 107 | # for the specified 'margin' 108 | # fully independent? 109 | if (n1 == 3) mdlf <- mindep else 110 | # one var. independent of the other 2? 111 | if (n2 == 1) mdlf <- mxindyz else 112 | # 2 vars. conditionally independent, given the 3rd? 113 | if (n2 == 2) mdlf <-myzcondindx else 114 | # case of all possible 2-way interactions not implemented, for 115 | # lack of a closed-form solution 116 | stop('case of all 2-way terms not implemented') 117 | # need an appropriate shell, with the right dimensions, labels etc.; 118 | # the contents here are irrelevant and will be overwritten 119 | x <- as.data.frame(x) 120 | tbl <- table(x) 121 | tbl <- mdlf(x,margin,tbl,termlengths) 122 | loglin(tbl,margin,param=TRUE,fit=TRUE) 123 | } 124 | 125 | # fully independent case 126 | mindep <- function(x,margin,tbl,termlengths) { 127 | nc <- ncol(x) # currently must be 3 128 | probs <- list() 129 | # find number of distinct values found in each variable, and the 130 | # estimated marginal probabilities of each value 131 | nvals <- vector(length=nc) 132 | for (i in 1:nc) { 133 | tmp <- table(x[,i]) 134 | probs[[i]] <- tmp / sum(tmp) 135 | nvals[i] <- length(tmp) 136 | } 137 | # now find estimated cell probabilities 138 | for (i in 1:nvals[1]) 139 | for (j in 1:nvals[2]) 140 | for (k in 1:nvals[3]) { 141 | tbl[i,j,k] <- 142 | probs[[1]][i] * 143 | probs[[2]][j] * 144 | probs[[3]][k] 145 | } 146 | # convert to estimated expected cell counts 147 | tbl <- nrow(x) * tbl 148 | } 149 | 150 | # case of 1 variable, X, being independent of the other 2, Y and Z 151 | mxindyz <- function(x,margin,tbl,termlengths) { 152 | # which ones are Y and Z? 153 | iyz <- margin[[1]] 154 | nc <- ncol(x) # 3 155 | # which variable is X? 
156 | ix <- setdiff((1:nc),iyz) 157 | # find number of distinct values found in each variable, and the 158 | # estimated marginal probabilities of each value 159 | probs <- list() 160 | nvals <- vector(length=nc) 161 | nvals[1] <- length(table(x[,ix])) 162 | nvals[2] <- length(table(x[,iyz[1]])) 163 | nvals[3] <- length(table(x[,iyz[2]])) 164 | tmp <- table(x[,ix]) 165 | probs[[1]] <- tmp / sum(tmp) 166 | tmp <- table(x[,iyz]) 167 | probs[[2]] <- tmp / sum(tmp) 168 | for (i in 1:nvals[1]) 169 | for (j in 1:nvals[2]) 170 | for (k in 1:nvals[3]) { 171 | if (ix == 1) { 172 | tbl[i,j,k] <- 173 | probs[[1]][i] * 174 | probs[[2]][j,k] 175 | } else if (ix == 2) { 176 | tbl[i,j,k] <- 177 | probs[[1]][j] * 178 | probs[[2]][i,k] 179 | } else { # ix = 3 180 | tbl[i,j,k] <- 181 | probs[[1]][k] * 182 | probs[[2]][i,j] 183 | } 184 | } 185 | tbl <- nrow(x) * tbl 186 | } 187 | 188 | # case of 2 variables being conditionally independent, given the 3rd 189 | myzcondindx <- function(x,margin,tbl,termlengths) { 190 | # which variable is X? 191 | ix <- intersect(margin[[1]],margin[[2]]) 192 | # which ones are Y and Z? 193 | iyz <- setdiff(union(margin[[1]],margin[[2]]),ix) 194 | iy <- iyz[1] 195 | iz <- iyz[2] 196 | # easier to keep track of all if iy < iz 197 | if (iy > iz) { 198 | tmp <- iz 199 | iz <- iy 200 | iy <- tmp 201 | } 202 | nc <- ncol(x) # currently 3 203 | # find number of distinct values found in each variable, and the 204 | # estimated marginal probabilities of each value 205 | probs <- list() 206 | nvals <- vector(length=nc) 207 | # nvals[1] <- length(table(x[,ix])) 208 | # nvals[2] <- length(table(x[,iy])) 209 | # nvals[3] <- length(table(x[,iz])) 210 | nvals[ix] <- length(table(x[,ix])) 211 | nvals[iy] <- length(table(x[,iy])) 212 | nvals[iz] <- length(table(x[,iz])) 213 | tmp <- table(x[,ix]) 214 | probs[[1]] <- tmp / sum(tmp) 215 | tmp <- table(x[,c(ix,iy)]) 216 | probs[[2]] <- tmp / sum(tmp) 217 | tmp <- table(x[,c(ix,iz)]) 218 | probs[[3]] <- tmp / sum(tmp) 219 | for (i in 1:nvals[1]) 220 | for (j in 1:nvals[2]) 221 | for (k in 1:nvals[3]) { 222 | if (ix == 1) { 223 | tbl[i,j,k] <- 224 | probs[[3]][i,k] * 225 | probs[[2]][i,j] / 226 | probs[[1]][i] 227 | } else if (ix == 2) { 228 | tbl[i,j,k] <- 229 | probs[[3]][j,k] * 230 | probs[[2]][j,i] / 231 | probs[[1]][j] 232 | 233 | } else { # ix == 3 234 | tbl[i,j,k] <- 235 | probs[[3]][k,j] * 236 | probs[[2]][k,i] / 237 | probs[[1]][k] 238 | } 239 | } 240 | tbl <- nrow(x) * tbl 241 | } 242 | 243 | # converts an R table to a fake data frame; the number of rows will be 244 | # the number of cases in the table, i.e. sum(tbl), and the number of 245 | # columns will be the dimension of the table, i.e. length(dim(tbl)); 246 | # if a cell has frequency k, it will appear k times in the output 247 | tbltofakedf <- function(tbl) { 248 | adf <- as.data.frame(tbl) 249 | nc <- ncol(adf) 250 | onecell <- function(adfrow) { 251 | freq <- as.numeric(adfrow[nc]) 252 | if (freq == 0) return(NULL) 253 | remainingrow <- adfrow[-nc] 254 | matrix(rep(remainingrow,freq),byrow=TRUE,nrow=freq) 255 | } 256 | m <- Reduce(rbind,apply(adf,1,onecell)) 257 | as.data.frame(m) 258 | } 259 | 260 | ###################################################################### 261 | ###################################################################### 262 | 263 | ############################# misc. 
############################### 264 | 265 | # for testing purposes; randomly replacing each element of matrix m by 266 | 267 | makeNA <- function(m,probna) { 268 | if (!is.matrix(m)) stop('m must be a matrix') 269 | n <- length(m) 270 | nmiss <- rbinom(1,n,probna) 271 | naidxs <- sample(1:n,nmiss,replace=FALSE) 272 | m[naidxs] <- NA 273 | m 274 | } 275 | 276 | # replace NAs by 0s 277 | 278 | NAsTo0s <- function(x) 279 | { 280 | x[is.na(x)] <- 0 281 | x 282 | } 283 | 284 | # replace 0s (or other) by NAs 285 | 286 | ZerosToNAs <- function(x,replaceVal=0) 287 | { 288 | x[x == replaceVal] <- NA 289 | x 290 | } 291 | 292 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rnw: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | 4 | \setlength{\oddsidemargin}{0in} 5 | \setlength{\evensidemargin}{0in} 6 | \setlength{\topmargin}{0.0in} 7 | \setlength{\headheight}{0in} 8 | \setlength{\headsep}{0in} 9 | \setlength{\textwidth}{6.5in} 10 | \setlength{\textheight}{9.0in} 11 | \setlength{\parindent}{0in} 12 | \setlength{\parskip}{0.1in} 13 | 14 | \usepackage{listings} 15 | 16 | \usepackage{graphicx} 17 | 18 | % library(knitr) 19 | %\VignetteIndexEntry{Partools} 20 | 21 | \begin{document} 22 | 23 | \title{regtools: Novel Tools for Linear, Nonlinear and 24 | Nonparametric Regression} 25 | 26 | \author{Norm Matloff} 27 | 28 | \date{November 6, 2016} 29 | 30 | \maketitle 31 | These tools are associated with my forthcoming book, {\it From Linear Models 32 | to Machine Learning: Modern Statistical Regression and Classification}, 33 | N. Matloff, CRC, 2017. 34 | 35 | {\it The tools are useful in general, independently of the book.} 36 | 37 | \section{FEATURES:}\label{features} 38 | 39 | \begin{itemize} 40 | \item 41 | Nonparametric regression for general dimensions in predictor and 42 | response variables, using k-Nearest Neighbors. Local-linear option. 43 | Allows for user-specified smoothing method. Allows for accelerated 44 | exploration of multiple values of k at once. Tool to aid in choosing 45 | k. 46 | \item 47 | Innovative tools for assessing fit in linear and nonlinear parametric 48 | models, via nonparametric methods. Model evaluation, examination of 49 | quadratic effects, investigation of nonhomogeneity of variance. 50 | \item 51 | Tools for multiclass classification, parametric and nonparametric. One 52 | vs.~All and All vs.~All. Novel adjustment for artificially balanced 53 | data. 54 | \item 55 | Linear regression, PCA and log-linear model estimation in missing-data 56 | setting, via the Available Cases method. 57 | \item 58 | Nicer implementation of ridge regression, with more meaningful scaling 59 | and better plotting. 60 | \item 61 | Extension to nonlinear parametric regression with of Eickert-White 62 | technique to handle heteroscedasticity. 63 | \item 64 | Misc. tools, e.g.~Method of Moments estimation (including for 65 | nonregression settings). 66 | \end{itemize} 67 | 68 | \section{EXAMPLE: MODEL FIT 69 | ASSESSMENT}\label{example-model-fit-assessment} 70 | 71 | Let's take a look at the data set prgeng, some Census data for 72 | California engineers and programmers in the year 2000. The response 73 | variable in this example is wage income, and the predictors are age, 74 | number of weeks worked, and dummy variables for MS and PhD degrees. 75 | (Some data wrangling was performed first; type ?knnest for the details.) 
76 | 77 | The fit assessment techniques in regtools gauge the fit of parametric 78 | models by comparing to nonparametric ones. Since the latter are free of 79 | model bias, they are very useful in assessing the parametric models. 80 | 81 | The function nonparvsxplot() plots the nonparametric fits against each 82 | predictor variable, for instance to explore nonlinear effects. Here is 83 | the plot for wage versus (scaled) age: 84 | 85 | \includegraphics[width=3.75in]{wagevsage.png} 86 | 87 | Of course, the effects of the other predictors don't show up here, but 88 | there does seem to be a quadratic effect. The same was true for the 89 | predictor measuring the number of weeks worked (slightly concave up, not 90 | shown here). In our linear parametric model, then, we will include 91 | squared terms for these two predictors. 92 | 93 | So, after fitting the linear model, run parvsnonparplot(), which plots 94 | the fit of the parametric model against the nonparametric one. Here is 95 | the result: 96 | 97 | \includegraphics[width=4.25in]{parvsnonpar.png} 98 | 99 | There is quite a bit suggested in this picture: 100 | 101 | \begin{itemize} 102 | \item 103 | There seems to be some overfitting near the low end, and underfitting 104 | at the high end. 105 | \item 106 | The outliers, meaning points far from the fitted linear model, are 107 | almost all below the linear fit. 108 | \item 109 | There are intriguing ``sreaks'' or ``tails'' of points, suggesting the 110 | possible existence of important subpopulations. 111 | \item 112 | There appear to be a number of people with 0 wage income. Depending on 113 | the goals of our analysis, we might consider removing them. 114 | \end{itemize} 115 | 116 | Finally, let's check the classical assumption of homoscedasticity, 117 | meaning that the conditional variance of Y given X is constant. The 118 | function nonparvarplot() plots the estimated conditional variance 119 | against the estimated conditional mean, both computed 120 | nonparametrically: 121 | 122 | \includegraphics[width=3.75in]{varvsmean.png} 123 | 124 | Wow, a hockey stick! Though there is a mild rise in coefficient of 125 | determination, i.e. standard deviation relative to the mean, up to about 126 | \$80K, the slope increases sharply after that. 127 | 128 | What to do? As long as our linear regression model assumption holds, 129 | violation of the homoscedasticity assumption won't invalidate our 130 | estimates; they still will be statistically consistent. But the standard 131 | errors we compute, and thus the statistical inference we perform, will 132 | be affected. This is correctible using the Eickert-White procedure, 133 | which for linear models is available in the car package, included in 134 | regtools. Our package also extends this to nonlinear parametric models, 135 | in our function nlshc() (the validity of this extension is shown in the 136 | book). 137 | 138 | Of course, the ``hockey stick'' form is another indication that we 139 | should further investigate the model itself. It may well be useful to 140 | fit two separate linear models, one for incomes below \$80K and the 141 | other for the higher incomes. For a more formal approach to this, we 142 | might consider changepoint methods, such as in the CRAN package chngpt. 143 | 144 | What is different: 145 | 146 | Note carefully that the above graph is unaffected by the validity of the 147 | parametric model; it is based purely on nonparametric analysis. 
This is 148 | in contrast to classic regression fit methods, most of which are based 149 | on examination of residuals of the fitted model. 150 | 151 | \section{EXAMPLE; OVA VS. AVA IN MULTICLASS 152 | PROBLEMS}\label{example-ova-vs.-ava-in-multiclass-problems} 153 | 154 | A very popular prediction method in 2-class problems is to use logistic 155 | (logit) regression. In analyzing click-through patterns of Web users, 156 | for instance, we have 2 classes, Click and Nonclick. We might fit a 157 | logistic model for Click, given user Web history, demographics and so 158 | on. Note that logit actually models probabilities, e.g.~the probability 159 | of Click given the predictor variables. 160 | 161 | But the situation is much less simple in multiclass settings. Suppose 162 | our application is recognition of hand-written digits (a famous machine 163 | learning example). The predictor variables are pixel patterns in images. 164 | There are two schools of thought on this: 165 | 166 | \begin{itemize} 167 | \item 168 | One vs.~All (OVA): We would run 26 logistic regression models, one for 169 | predicting `0' vs.~non-`0', one for `1' vs.~non-`1', and so on. For a 170 | particular image, we would thus obtain 26 estimated probabilities. Let 171 | imax be the image that yields the largest probability; we would then 172 | guess the digit for the image to be `i'. 173 | \item 174 | All vs.~All (AVA): Here we would run C(10,2) = 45 logit analyses, one 175 | for each pair of digits. There would be one for `0' vs. `1', one for 176 | `0' vs. `2', etc., all the way up through `8' vs. `9'. Many in the 177 | machine learning literature recommend AVA over OVA, on the grounds 178 | that might be linearly separable (in the statistical sense) in pairs 179 | but not otherwise. My book counters by positing that such a situation 180 | could be remedied under OVA by adding quadratic terms to the logit 181 | models. 182 | \end{itemize} 183 | 184 | At any rate, the regtools package gives you a choice, OVA or AVA, for 185 | both parametric and nonparametric methods. For example, avalogtrn() and 186 | avalogpred() do training and prediction operations for logit with AVA. 187 | 188 | Another feature concerns adjustment of class probabilities. In many 189 | multiclass data sets, the numbers of points in each class is the same, 190 | or least not reflective of the population class probabilities. In 191 | regtools, the user can specify estimates of the latter, for logit and 192 | nonparametric methods. 193 | 194 | So, let's look at an example, using the UCI Letter Recognition data set, 195 | another image recognition example. Again, the code below was preceded by 196 | some data wrangling, which changed the letter data from character to 197 | numeric, and which divided the data set into training and test sets. 198 | Here is the OVA run: 199 | 200 | \begin{lstlisting} 201 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 202 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 203 | > mean(ypred == lrtest[,1]) 204 | [1] 0.7193333 205 | \end{lstlisting} 206 | 207 | So, we get about a 72\% rate of correct classification. Now let's try 208 | AVA: 209 | 210 | \begin{lstlisting} 211 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 212 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 213 | > mean(ypred == lrtest[,1]) 214 | [1] 0.8355 215 | \end{lstlisting} 216 | 217 | AVA did considerably better, 84\%. So, apparently AVA fixed a poor 218 | model. But of course, it's better to make a good model in the first 219 | place. 
Based on our previous observation that the boundaries may be 220 | better approximated by curves than lines, let's try a quadratic model. 221 | 222 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 223 | = 120 possible interaction terms. Inclusion of all such variables would 224 | probably produce too rich a model for the 14000 points in our training 225 | set. We'll settle for adding just the squared terms (not shown): 226 | 227 | \begin{lstlisting} 228 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 229 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 230 | > mean(ypred == lrtest[,1]) 231 | [1] 0.8086667 232 | \end{lstlisting} 233 | 234 | Ah, much better, though still not quite as good as AVA. 235 | 236 | \end{document} 237 | ` 238 | -------------------------------------------------------------------------------- /inst/vn.save/vignettes/regtools.Rnw.save: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | 4 | \setlength{\oddsidemargin}{0in} 5 | \setlength{\evensidemargin}{0in} 6 | \setlength{\topmargin}{0.0in} 7 | \setlength{\headheight}{0in} 8 | \setlength{\headsep}{0in} 9 | \setlength{\textwidth}{6.5in} 10 | \setlength{\textheight}{9.0in} 11 | \setlength{\parindent}{0in} 12 | \setlength{\parskip}{0.1in} 13 | 14 | \usepackage{listings} 15 | 16 | \usepackage{graphicx} 17 | 18 | % library(knitr) 19 | %\VignetteIndexEntry{Partools} 20 | 21 | \begin{document} 22 | 23 | \title{regtools: Novel Tools for Linear, Nonlinear and 24 | Nonparametric Regression} 25 | 26 | \author{Norm Matloff} 27 | 28 | \date{November 6, 2016} 29 | 30 | \maketitle 31 | These tools are associated with my forthcoming book, {\it From Linear Models 32 | to Machine Learning: Modern Statistical Regression and Classification}, 33 | N. Matloff, CRC, 2017. 34 | 35 | {\it The tools are useful in general, independently of the book.} 36 | 37 | \section{FEATURES:}\label{features} 38 | 39 | \begin{itemize} 40 | \item 41 | Nonparametric regression for general dimensions in predictor and 42 | response variables, using k-Nearest Neighbors. Local-linear option. 43 | Allows for user-specified smoothing method. Allows for accelerated 44 | exploration of multiple values of k at once. Tool to aid in choosing 45 | k. 46 | \item 47 | Innovative tools for assessing fit in linear and nonlinear parametric 48 | models, via nonparametric methods. Model evaluation, examination of 49 | quadratic effects, investigation of nonhomogeneity of variance. 50 | \item 51 | Tools for multiclass classification, parametric and nonparametric. One 52 | vs.~All and All vs.~All. Novel adjustment for artificially balanced 53 | data. 54 | \item 55 | Linear regression, PCA and log-linear model estimation in missing-data 56 | setting, via the Available Cases method. 57 | \item 58 | Nicer implementation of ridge regression, with more meaningful scaling 59 | and better plotting. 60 | \item 61 | Extension to nonlinear parametric regression with of Eickert-White 62 | technique to handle heteroscedasticity. 63 | \item 64 | Misc. tools, e.g.~Method of Moments estimation (including for 65 | nonregression settings). 66 | \end{itemize} 67 | 68 | \section{EXAMPLE: MODEL FIT 69 | ASSESSMENT}\label{example-model-fit-assessment} 70 | 71 | Let's take a look at the data set prgeng, some Census data for 72 | California engineers and programmers in the year 2000. 
The response 73 | variable in this example is wage income, and the predictors are age, 74 | number of weeks worked, and dummy variables for MS and PhD degrees. 75 | (Some data wrangling was performed first; type ?knnest for the details.) 76 | 77 | The fit assessment techniques in regtools gauge the fit of parametric 78 | models by comparing to nonparametric ones. Since the latter are free of 79 | model bias, they are very useful in assessing the parametric models. 80 | 81 | The function nonparvsxplot() plots the nonparametric fits against each 82 | predictor variable, for instance to explore nonlinear effects. Here is 83 | the plot for wage versus (scaled) age: 84 | 85 | \includegraphics[width=3.75in]{wagevsage.png} 86 | 87 | Of course, the effects of the other predictors don't show up here, but 88 | there does seem to be a quadratic effect. The same was true for the 89 | predictor measuring the number of weeks worked (slightly concave up, not 90 | shown here). In our linear parametric model, then, we will include 91 | squared terms for these two predictors. 92 | 93 | So, after fitting the linear model, run parvsnonparplot(), which plots 94 | the fit of the parametric model against the nonparametric one. Here is 95 | the result: 96 | 97 | \includegraphics[width=4.25in]{parvsnonpar.png} 98 | 99 | There is quite a bit suggested in this picture: 100 | 101 | \begin{itemize} 102 | \item 103 | There seems to be some overfitting near the low end, and underfitting 104 | at the high end. 105 | \item 106 | The outliers, meaning points far from the fitted linear model, are 107 | almost all below the linear fit. 108 | \item 109 | There are intriguing ``sreaks'' or ``tails'' of points, suggesting the 110 | possible existence of important subpopulations. 111 | \item 112 | There appear to be a number of people with 0 wage income. Depending on 113 | the goals of our analysis, we might consider removing them. 114 | \end{itemize} 115 | 116 | Finally, let's check the classical assumption of homoscedasticity, 117 | meaning that the conditional variance of Y given X is constant. The 118 | function nonparvarplot() plots the estimated conditional variance 119 | against the estimated conditional mean, both computed 120 | nonparametrically: 121 | 122 | \includegraphics[width=3.75in]{varvsmean.png} 123 | 124 | Wow, a hockey stick! Though there is a mild rise in coefficient of 125 | determination, i.e. standard deviation relative to the mean, up to about 126 | \$80K, the slope increases sharply after that. 127 | 128 | What to do? As long as our linear regression model assumption holds, 129 | violation of the homoscedasticity assumption won't invalidate our 130 | estimates; they still will be statistically consistent. But the standard 131 | errors we compute, and thus the statistical inference we perform, will 132 | be affected. This is correctible using the Eickert-White procedure, 133 | which for linear models is available in the car package, included in 134 | regtools. Our package also extends this to nonlinear parametric models, 135 | in our function nlshc() (the validity of this extension is shown in the 136 | book). 137 | 138 | Of course, the ``hockey stick'' form is another indication that we 139 | should further investigate the model itself. It may well be useful to 140 | fit two separate linear models, one for incomes below \$80K and the 141 | other for the higher incomes. For a more formal approach to this, we 142 | might consider changepoint methods, such as in the CRAN package chngpt. 
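As a concrete, hedged illustration of the heteroscedasticity correction discussed above, shown here for a linear model via car::hccm() (the mlb data set is used only for convenience, and nlshc() itself is not demonstrated):

\begin{lstlisting}
# Illustrative sketch: classical vs. heteroscedasticity-robust standard
# errors for a linear model.
library(regtools)   # for data(mlb)
library(car)
data(mlb)
lmout <- lm(Weight ~ Height + Age, data = mlb)
sqrt(diag(vcov(lmout)))   # classical standard errors
sqrt(diag(hccm(lmout)))   # heteroscedasticity-robust standard errors
\end{lstlisting}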
143 | 144 | What is different: 145 | 146 | Note carefully that the above graph is unaffected by the validity of the 147 | parametric model; it is based purely on nonparametric analysis. This is 148 | in contrast to classic regression fit methods, most of which are based 149 | on examination of residuals of the fitted model. 150 | 151 | \section{EXAMPLE; OVA VS. AVA IN MULTICLASS 152 | PROBLEMS}\label{example-ova-vs.-ava-in-multiclass-problems} 153 | 154 | A very popular prediction method in 2-class problems is to use logistic 155 | (logit) regression. In analyzing click-through patterns of Web users, 156 | for instance, we have 2 classes, Click and Nonclick. We might fit a 157 | logistic model for Click, given user Web history, demographics and so 158 | on. Note that logit actually models probabilities, e.g.~the probability 159 | of Click given the predictor variables. 160 | 161 | But the situation is much less simple in multiclass settings. Suppose 162 | our application is recognition of hand-written digits (a famous machine 163 | learning example). The predictor variables are pixel patterns in images. 164 | There are two schools of thought on this: 165 | 166 | \begin{itemize} 167 | \item 168 | One vs.~All (OVA): We would run 26 logistic regression models, one for 169 | predicting `0' vs.~non-`0', one for `1' vs.~non-`1', and so on. For a 170 | particular image, we would thus obtain 26 estimated probabilities. Let 171 | imax be the image that yields the largest probability; we would then 172 | guess the digit for the image to be `i'. 173 | \item 174 | All vs.~All (AVA): Here we would run C(10,2) = 45 logit analyses, one 175 | for each pair of digits. There would be one for `0' vs. `1', one for 176 | `0' vs. `2', etc., all the way up through `8' vs. `9'. Many in the 177 | machine learning literature recommend AVA over OVA, on the grounds 178 | that might be linearly separable (in the statistical sense) in pairs 179 | but not otherwise. My book counters by positing that such a situation 180 | could be remedied under OVA by adding quadratic terms to the logit 181 | models. 182 | \end{itemize} 183 | 184 | At any rate, the regtools package gives you a choice, OVA or AVA, for 185 | both parametric and nonparametric methods. For example, avalogtrn() and 186 | avalogpred() do training and prediction operations for logit with AVA. 187 | 188 | Another feature concerns adjustment of class probabilities. In many 189 | multiclass data sets, the numbers of points in each class is the same, 190 | or least not reflective of the population class probabilities. In 191 | regtools, the user can specify estimates of the latter, for logit and 192 | nonparametric methods. 193 | 194 | So, let's look at an example, using the UCI Letter Recognition data set, 195 | another image recognition example. Again, the code below was preceded by 196 | some data wrangling, which changed the letter data from character to 197 | numeric, and which divided the data set into training and test sets. 198 | Here is the OVA run: 199 | 200 | \begin{lstlisting} 201 | > ologout <- ovalogtrn(26,lrtrn[,c(2:17,1)]) 202 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 203 | > mean(ypred == lrtest[,1]) 204 | [1] 0.7193333 205 | \end{lstlisting} 206 | 207 | So, we get about a 72\% rate of correct classification. 
Now let's try 208 | AVA: 209 | 210 | \begin{lstlisting} 211 | > alogout <- avalogtrn(26,lrtrn[,c(2:17,1)]) 212 | > ypred <- avalogpred(26,alogout,lrtest[,-1]) 213 | > mean(ypred == lrtest[,1]) 214 | [1] 0.8355 215 | \end{lstlisting} 216 | 217 | AVA did considerably better, 84\%. So, apparently AVA fixed a poor 218 | model. But of course, it's better to make a good model in the first 219 | place. Based on our previous observation that the boundaries may be 220 | better approximated by curves than lines, let's try a quadratic model. 221 | 222 | There were 16 predictors, thus 16 possible quadratic terms, and C(16,2) 223 | = 120 possible interaction terms. Inclusion of all such variables would 224 | probably produce too rich a model for the 14000 points in our training 225 | set. We'll settle for adding just the squared terms (not shown): 226 | 227 | \begin{lstlisting} 228 | > ologout <- ovalogtrn(26,lrtrn[,c(2:33,1)]) 229 | > ypred <- ovalogpred(ologout,lrtest[,-1]) 230 | > mean(ypred == lrtest[,1]) 231 | [1] 0.8086667 232 | \end{lstlisting} 233 | 234 | Ah, much better, though still not quite as good as AVA. 235 | 236 | \end{document} 237 | ` 238 | -------------------------------------------------------------------------------- /man/Quick.Rd~: -------------------------------------------------------------------------------- 1 | \name{qe-Series Wrappers} 2 | \alias{qeLogit} 3 | \alias{qeLin} 4 | \alias{qeKNN} 5 | \alias{qeRF} 6 | \alias{qeSVM} 7 | \alias{qeGBoost} 8 | \alias{qeNeural} 9 | \alias{qeLASSO} 10 | \alias{qePolyLin} 11 | \alias{qePolyLog} 12 | \alias{qeIso} 13 | \alias{qeCompare} 14 | \alias{predict.qeLogit} 15 | \alias{predict.qeLin} 16 | \alias{predict.qeKNN} 17 | \alias{predict.qeRF} 18 | \alias{predict.qeSVM} 19 | \alias{predict.qeGBoost} 20 | \alias{predict.qeNeural} 21 | \alias{predict.qeLASSO} 22 | \alias{predict.qePolyLin} 23 | \alias{predict.qePolyLog} 24 | \alias{predict.qeIso} 25 | \alias{plot.RF} 26 | \alias{plot.LASSO} 27 | 28 | \title{Quick-Explore Regression/Classification Wrappers} 29 | 30 | \description{ 31 | Quick access to machine learning methods, with a very simple 32 | interface. Intended for convenient initial exploration of a dataset, 33 | both to gauge the predictive effectiveness of a model and to do simple 34 | prediction of new cases. Just one call needed to fit, no preliminary 35 | setup of model etc. The simplicity also makes the series useful 36 | for teaching. For advanced work, analysts may prefer to use 37 | the methods directly, in order to utilize specialized options. 
38 | } 39 | 40 | \usage{ 41 | qeLogit(data,yName,holdout=floor(min(1000,0.1*nrow(data)))) 42 | qeLin(data,yName,holdout=floor(min(1000,0.1*nrow(data)))) 43 | qeKNN(data,yName,k,scaleX=TRUE,holdout=floor(min(1000,0.1*nrow(data)))) 44 | qeRF(data,yName,nTree,minNodeSize,holdout=floor(min(1000,0.1*nrow(data)))) 45 | qeSVM(data,yName,gamma=1.0,cost=1.0,kernel='radial',degree=2, 46 | holdout=floor(min(1000,0.1*nrow(data)))) 47 | qeGBoost(data,yName,nTree=100,minNodeSize=10,learnRate=0.1, 48 | holdout=floor(min(1000,0.1*nrow(data)))) 49 | qeNeural(data,yName,hidden=c(100,100),nEpoch=30, 50 | holdout=floor(min(1000,0.1*nrow(data)))) 51 | qeLASSO(data,yName,alpha=1,holdout=floor(min(1000,0.1*nrow(data)))) 52 | qePolyLin(data,yName,deg=2,maxInteractDeg = deg, 53 | holdout=floor(min(1000,0.1*nrow(data)))) 54 | qePolyLog(data,yName,deg=2,maxInteractDeg = deg, 55 | holdout=floor(min(1000,0.1*nrow(data)))) 56 | qeCompare(data,yName,qeFtnList,nReps,opts=NULL,seed=9999) 57 | \method{predict}{qeLogit}(object,newx) 58 | \method{predict}{qeLin}(object,newx) 59 | \method{predict}{qeKNN}(object,newx,newxK=1) 60 | \method{predict}{qeRF}(object,newx) 61 | \method{predict}{qeSVM}(object,newx,k=25) 62 | \method{predict}{qeGBoost}(object,newx) 63 | \method{predict}{qeNeural}(object,newx) 64 | \method{predict}{qeLASSO}(object,newx) 65 | \method{predict}{qePoly}(object,newx) 66 | \method{plot}{qeLASSO}(object,newx) 67 | \method{plot}{qeSVM}(object,newx,k=25) 68 | \method{plot}{qeRF}(object,newx) 69 | } 70 | 71 | \arguments{ 72 | \item{data}{Dataframe, training set. Classification case is signaled 73 | via labels column being an R factor.} 74 | \item{yName}{Name of the class labels column.} 75 | \item{holdout}{If not NULL, form a holdout set of the specified size. 76 | After fitting to the remaining data, evaluate accuracy on the test set.} 77 | \item{k}{Number of nearest neighbors. In functions other than 78 | \code{qeKNN} for which this is an argument, it is the number of 79 | neighbors to use in finding conditional probabilities via 80 | \code{knnCalib}.} 81 | \item{scaleX}{Scale the features.} 82 | \item{nTree}{Number of trees.} 83 | \item{minNodeSize}{Minimum number of data points in a tree node.} 84 | \item{learnRate}{Learning rate.} 85 | \item{hidden}{Vector of units per hidden layer. Fractional values 86 | indicated dropout proportions.} 87 | \item{nEpoch}{Number of iterations in neural net.} 88 | \item{alpha}{1 for LASSO, 2 for ridge.} 89 | \item{gamma}{Scale parameter in \code{e1071::svm}.} 90 | \item{cost}{Cost parameter in \code{e1071::svm}.} 91 | \item{kernel}{One of 'linear','radial','polynomial' and 'sigmoid'.} 92 | \item{degree}{Degree of SVM polynomial kernel, if any.} 93 | \item{qeFtnList}{Character vector of \code{qe*} names.} 94 | \item{nReps}{Number of holdout sets to generate.} 95 | \item{opts}{R list of optional arguments for none, some or all of th 96 | functions in \code{qeFtnList}.} 97 | \item{seed}{Seed for random number generation.} 98 | } 99 | 100 | \details{ 101 | 102 | As noted, these functions are intended for quick, first-level analysis 103 | of regression or multiclass classification problems. Emphasis here is 104 | on convenience and simplicity. Currently k-NN, SVM, random forests, 105 | gradient boosting, linear model, LASSO and polynomial regression are 106 | offered. 
107 | 108 | The idea is that, given a new dataset, the analyst can quickly and 109 | easily try fitting a number of models in succession, say first k-NN, 110 | then random forests: 111 | 112 | \preformatted{ 113 | # built-in data on major league baseball players 114 | > data(mlb) 115 | > mlb <- mlb[,3:6] # position, height, weight, age 116 | 117 | # fit models 118 | > knnout <- qeKNN(mlb,'Weight',k=25) 119 | > rfout <- qeRF(mlb,'Weight') 120 | 121 | # mean abs. pred. error on holdout set, in pounds 122 | > knnout$testAcc 123 | [1] 11.75644 124 | > rfout$testAcc 125 | [1] 12.6787 126 | 127 | # predict a new case 128 | > newx <- data.frame(Position='Catcher',Height=73.5,Age=26) 129 | > predict(knnout,newx) 130 | [,1] 131 | [1,] 204.04 132 | > predict(rfout,newx) 133 | 11 134 | 199.1714 135 | 136 | # how about some other ML methods? 137 | > lassout <- qeLASSO(mlb,'Weight') 138 | > lassout$testAcc 139 | [1] 14.23122 140 | # poly reg, degree 3 141 | > polyout <- qePolyLin(mlb,'Weight',3) 142 | > polyout$testAcc 143 | [1] 13.55613 144 | > nnout <- qeNeural(mlb,'Weight') 145 | # ... 146 | > nnout$testAcc 147 | [1] 12.2537 148 | # try some nondefault hyperparams 149 | > nnout <- qeNeural(mlb,'Weight',hidden=c(200,200),nEpoch=50) 150 | > nnout$testAcc 151 | [1] 15.17982 152 | 153 | } 154 | 155 | The optional \code{holdout} argument triggers formation of a holdout set 156 | and the corresponding cross-validation evaluation of predictive power. 157 | Note that if a holdout is formed, the return value will consist of the 158 | fit on the training set, not on the full original dataset. 159 | 160 | In most cases, the full basket of options in the wrapped function is not 161 | reflected, and second-level analysis should use the relevant packages 162 | directly. 163 | 164 | The \code{qe*} functions do model fit. Each of them has a 165 | \code{predict} method, and some also have a \code{plot} method. 166 | Arguments for \code{qe*} are at least: \code{data} and \code{yName}; 167 | arguments for \code{predict} are at least: \code{object}, the return 168 | value from \code{qe*}, and \code{newx}, a data frame of points to be 169 | predicted. In some cases, there are additional algorithm-specific 170 | parameters; default values are provided. 171 | 172 | An additional benefit is that the \code{predict} functions work 173 | correctly on new cases with R factors. The proper levels are assigned 174 | to the new cases. (Of course, if a new case has a level not in the 175 | original data, nothing can be done.) 176 | 177 | The function \code{qeLin} handles classification problems as 178 | multivariate-outcome linea models. If one's goal is prediction, it can 179 | be much faster than \code{qeLogit}, often with comparable accuracy. 180 | 181 | The \code{qePolyLin} function does polynomial regression of the indicated 182 | degree. In the above example degree 3 means all terms through degree 3, 183 | e.g. \code{Height * Age^2}. Dummy variables are handled properly, e.g. 184 | no powers of a dummy are generatd. The logistic polynomial regression version 185 | is \code{qePolyLog}. 186 | 187 | The \code{qeCompare} function does quick-and-easy cross-validated 188 | comparisons among the \code{qe*} functions. The same holdout sets are 189 | generated and used by all the functions. Default values of 190 | hyperparameters of those functions can be set via \code{opts}. 191 | 192 | The \code{qeIso} function is intended mainly for use as a smoothing 193 | method in calibration actions. 
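As a further illustrative sketch (not run here; the calls follow the usage above and output is omitted), the classification wrappers can be tried in the same way:

\preformatted{
# illustrative only; see the examples below for actual output
data(peFactors)
pef <- peFactors[,c(1,3,5,7:9)]
logitout <- qeLogit(pef,'occ')        # multiclass logistic regression
logitout$testAcc                      # holdout misclassification proportion
plogout <- qePolyLog(pef,'occ',deg=2) # quadratic logistic model
plogout$testAcc
}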
194 | 195 | } 196 | 197 | \value{ 198 | 199 | The value returned by \code{qe*} functions depends on the algorithm, but 200 | with some commonality, e.g. \code{classif}, a logical value indicating 201 | whether the problem was of classification type. 202 | 203 | If a holdout set was requested, an additional returned component will be 204 | \code{testAcc}, the accuracy on the holdout set. This will be Mean 205 | Absolute Prediction Error in the regression case, and proportion of 206 | misclassified cases in the classification case. 207 | 208 | The value returned by the \code{predict} functions is an 209 | R list with components as follows: 210 | 211 | Classification case: 212 | 213 | \itemize{ 214 | 215 | \item \code{predClasses}: R factor instance of predicted class labels 216 | 217 | \item \code{probs}: vector/matrix of class probabilities; in the 2-class 218 | case, a vector, the probabilities of Y = 1 219 | 220 | } 221 | 222 | Regression case: vector of predicted values 223 | 224 | } 225 | 226 | \examples{ 227 | 228 | # see also 'details' above 229 | 230 | \dontrun{ 231 | 232 | data(peFactors) 233 | pef <- peFactors[,c(1,3,5,7:9)] 234 | # most people in the dataset have at least a Bachelor's degree; so let's 235 | # just consider Master's (code 14) and PhD (code 16) as special 236 | pef$educ <- toSubFactor(pef$educ,c('14','16')) 237 | 238 | # predict occupation; 6 classes, 100, 101, 102, 106, 140, 141, using SVM 239 | svmout <- qeSVM(pef,'occ',holdout=NULL) 240 | # as example of prediction, take the 8th case, but change the gender and 241 | # age to female and 25; note that by setting k to non-null, we are 242 | # requesting that conditional probabilities be calculated, via 243 | # knnCalib(), here using 25 nearest neighbors 244 | newx <- pef[8,-3] 245 | newx$sex <- '2' 246 | newx$age <- 25 247 | predict(svmout,newx,k=25) 248 | # $predClasses 249 | # 8 250 | # 100 251 | # Levels: 100 101 102 106 140 141 252 | # $dvals 253 | # 102/101 102/100 102/141 102/140 102/106 101/100 101/141 254 | # 8 -0.7774038 -0.5132022 0.9997894 1.003251 0.999688 -0.4023077 1.000419 255 | # 101/140 101/106 100/141 100/140 100/106 141/140 141/106 140/106 256 | # 8 1.000474 0.9997371 1.000088 1.000026 1.000126 0.9460703 -0.4974625 -1.035721 257 | # 258 | # $probs 259 | # 100 101 102 106 140 141 260 | # [1,] 0.24 0.52 0.12 0.08 0 0.04 261 | # 262 | # so, occupation code 100 is predicted, with a 0.36 conditional 263 | # probability 264 | 265 | # if holdout evaluation is desired as well, say 1000 cases, seed 9999: 266 | > svmout <- qeSVM(pef,'occ',holdout=c(1000,9999)) 267 | > svmout$testAcc 268 | [1] 0.622 # 62% error rate (overall rate for 6 classes) 269 | 270 | # linear 271 | # lm() doesn't like numeric factor levels, so prepend an 'a' 272 | pef$occ <- prepend('a',pef$occ) 273 | lmout <- qeLin(pef,'occ') 274 | predict(lmout,pef[1,-3]) # occ 100, prob 0.3316 275 | lmout <- qeLin(pef,'wageinc') 276 | predict(lmout,pef[1,-5]) # 70857.79 277 | 278 | qeCompare(mlb,'Weight',c('qeLin','qeKNN','qeRF'),25) 279 | # qeFtn meanAcc 280 | # 1 qeLin 13.30490 281 | # 2 qeKNN 13.72708 282 | # 3 qeRF 13.46515 283 | qeCompare(mlb,'Weight',c('qeLin','qeKNN','qeRF'),25, 284 | list(qeKNN='k=5',qeRF='nTree = 100, minNodeSize = 15')) 285 | # qeFtn meanAcc 286 | # 1 qeLin 13.30490 287 | # 2 qeKNN 14.34051 288 | # 3 qeRF 13.02334 289 | 290 | 291 | } 292 | 293 | } 294 | 295 | \author{ 296 | Norm Matloff 297 | } 298 | 299 | --------------------------------------------------------------------------------