├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_LVar.R
    │   ├── test_binning.R
    │   ├── test_get-dummies.R
    │   ├── test_aglm-input-2.R
    │   ├── test_aglm.R
    │   └── test_aglm-input.R
├── CRAN-SUBMISSION
├── .gitignore
├── inst
    └── WORDLIST
├── .Rbuildignore
├── NEWS.md
├── man
    ├── AGLM_Input-class.Rd
    ├── deviance.AccurateGLM.Rd
    ├── createEqualWidthBins.Rd
    ├── createEqualFreqBins.Rd
    ├── print.AccurateGLM.Rd
    ├── getLVarMatForOneVec.Rd
    ├── executeBinning.Rd
    ├── CVA_AccurateGLM-class.Rd
    ├── getODummyMatForOneVec.Rd
    ├── getUDummyMatForOneVec.Rd
    ├── coef.AccurateGLM.Rd
    ├── AccurateGLM-class.Rd
    ├── residuals.AccurateGLM.Rd
    ├── predict.AccurateGLM.Rd
    ├── cva.aglm.Rd
    ├── cv.aglm.Rd
    ├── plot.AccurateGLM.Rd
    ├── aglm-package.Rd
    └── aglm.Rd
├── aglm.Rproj
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── feature_request.md
    │   └── bug_report.md
    └── workflows
    │   └── rhub.yaml
├── R
    ├── deviance-aglm.R
    ├── print-aglm.R
    ├── cva-aglm.R
    ├── accurate-glm.R
    ├── binning.R
    ├── predict-aglm.R
    ├── coef-aglm.R
    ├── aglm-package.R
    ├── residuals-aglm.R
    ├── get-dummies.R
    ├── cv-aglm.R
    ├── aglm.R
    ├── plot-aglm.R
    └── aglm-input.R
├── examples
    ├── aglm-2.R
    ├── cv-aglm-1.R
    ├── predict-and-plot-1.R
    ├── cva-aglm-1.R
    ├── aglm-1.R
    └── lvar-and-extrapolation.R
├── README.md
├── cran-comments.md
├── DESCRIPTION
├── NAMESPACE
├── cran-comments_old.md
└── LICENSE.md


/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(aglm)
3 | 
4 | test_check("aglm")
5 | 


--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 0.4.1
2 | Date: 2025-05-11 15:57:27 UTC
3 | SHA: dc5c3b58e6da86f54ede86a1260df933f579a85d
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | local/
 2 | inst/doc
 3 | .Rproj.user
 4 | .Rhistory
 5 | .RData
 6 | .Ruserdata
 7 | aglm.Rcheck/
 8 | aglm*.tar.gz
 9 | aglm*.tgz
10 | *.html
11 | *.rda
12 | 


--------------------------------------------------------------------------------
/inst/WORDLIST:
--------------------------------------------------------------------------------
 1 | AccurateGLM
 2 | Banno
 3 | coef
 4 | cv
 5 | cva
 6 | CVA
 7 | Fujita
 8 | glmnet
 9 | Hirokazu
10 | Iwasawa
11 | Kondo
12 | multicollinearity
13 | Suguru
14 | Takahashi
15 | Tanaka
16 | th
17 | Toyoto
18 | Hachemeister
19 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^aglm\.Rcheck$
 4 | ^aglm.*\.tar\.gz$
 5 | ^aglm.*\.tgz$
 6 | ^LICENSE\.md$
 7 | ^cran-comments\.md
 8 | ^cran-comments_old\.md
 9 | ^\.github$
10 | ^examples/*
11 | ^CRAN-RELEASE$
12 | ^CRAN-SUBMISSION$
13 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # aglm 0.4.1
 2 | - Fixed some notes on CRAN checks (without feature changes).
 3 | 
 4 | # aglm 0.4.0
 5 | - Updated all documents and examples.
 6 | - Published in [CRAN](https://cran.r-project.org/package=aglm).
 7 | 
 8 | # aglm 0.3.2
 9 | - Fixed to use `R` 4.0 and `glmnet` 4.0.
10 | 


--------------------------------------------------------------------------------
/man/AGLM_Input-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/aglm-input.R
 3 | \docType{class}
 4 | \name{AGLM_Input-class}
 5 | \alias{AGLM_Input-class}
 6 | \title{S4 class for input}
 7 | \description{
 8 | S4 class for input
 9 | }
10 | \section{Slots}{
11 | 
12 | \describe{
13 | \item{\code{vars_info}}{A list, each of whose element is information of one variable.}
14 | 
15 | \item{\code{data}}{The original data.}
16 | }}
17 | 
18 | 


--------------------------------------------------------------------------------
/aglm.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageInstallArgs: --no-multiarch --with-keep.source
20 | PackageCheckArgs: --as-cran
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 | 


--------------------------------------------------------------------------------
/man/deviance.AccurateGLM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/deviance-aglm.R
 3 | \name{deviance.AccurateGLM}
 4 | \alias{deviance.AccurateGLM}
 5 | \title{Get deviance}
 6 | \usage{
 7 | \method{deviance}{AccurateGLM}(object, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
11 | 
12 | \item{...}{Other arguments are passed directly to \code{deviance.glmnet()}.}
13 | }
14 | \value{
15 | The value of deviance extracted from the object \code{object}.
16 | }
17 | \description{
18 | Get deviance
19 | }
20 | \author{
21 | Kenji Kondo
22 | }
23 | 


--------------------------------------------------------------------------------
/man/createEqualWidthBins.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binning.R
 3 | \name{createEqualWidthBins}
 4 | \alias{createEqualWidthBins}
 5 | \title{Create bins (equal width binning)}
 6 | \usage{
 7 | createEqualWidthBins(left, right, nbin)
 8 | }
 9 | \arguments{
10 | \item{left}{The leftmost value of the interval to be binned.}
11 | 
12 | \item{right}{The rightmost value of the interval to be binned.}
13 | 
14 | \item{nbin}{The number of bins.}
15 | }
16 | \value{
17 | A numeric vector representing breaks obtained by binning.
18 | }
19 | \description{
20 | Create bins (equal width binning)
21 | }
22 | \author{
23 | Kenji Kondo
24 | }
25 | 


--------------------------------------------------------------------------------
/man/createEqualFreqBins.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binning.R
 3 | \name{createEqualFreqBins}
 4 | \alias{createEqualFreqBins}
 5 | \title{Create bins (equal frequency binning)}
 6 | \usage{
 7 | createEqualFreqBins(x_vec, nbin.max)
 8 | }
 9 | \arguments{
10 | \item{x_vec}{A numeric vector, whose quantiles are used as breaks.}
11 | 
12 | \item{nbin.max}{The maximum number of bins.}
13 | }
14 | \value{
15 | A numeric vector representing breaks obtained by binning.
16 | Note that the number of bins is equal to \code{min(nbin.max, length(x_vec))}.
17 | }
18 | \description{
19 | Create bins (equal frequency binning)
20 | }
21 | \author{
22 | Kenji Kondo
23 | }
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[Feature]"
 5 | labels: feature
 6 | assignees: kkondo1981
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/man/print.AccurateGLM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/print-aglm.R
 3 | \name{print.AccurateGLM}
 4 | \alias{print.AccurateGLM}
 5 | \title{Display textual information of the model}
 6 | \usage{
 7 | \method{print}{AccurateGLM}(x, digits = max(3, getOption("digits") - 3), ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
11 | 
12 | \item{digits}{Used to control significant digits in printout.}
13 | 
14 | \item{...}{Other arguments are passed directly to \code{print.glmnet()}.}
15 | }
16 | \value{
17 | No return value, called for side effects.
18 | }
19 | \description{
20 | Display textual information of the model
21 | }
22 | \author{
23 | Kenji Kondo
24 | }
25 | 


--------------------------------------------------------------------------------
/R/deviance-aglm.R:
--------------------------------------------------------------------------------
 1 | #' Get deviance
 2 | #'
 3 | #' @param object
 4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
 5 | #'
 6 | #' @param ...
 7 | #'   Other arguments are passed directly to `deviance.glmnet()`.
 8 | #'
 9 | #' @return
10 | #'   The value of deviance extracted from the object `object`.
11 | #'
12 | #'
13 | #' @author
14 | #' Kenji Kondo
15 | #'
16 | #'
17 | #' @export
18 | #' @importFrom stats deviance
19 | deviance.AccurateGLM <- function(object, ...) {
20 |   # It's necessary to use same names for some arguments as the original methods,
21 |   # because devtools::check() issues warnings when using inconsistent names.
22 |   # As a result, we sometimes should accept uncomfortable argument names,
23 |   # but still have rights to use preferable names internally.
24 |   model <- object
25 | 
26 |   return(deviance(model@backend_models[[1]], ...))
27 | }
28 | 


--------------------------------------------------------------------------------
/R/print-aglm.R:
--------------------------------------------------------------------------------
 1 | #' Display textual information of the model
 2 | #'
 3 | #' @param x
 4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
 5 | #'
 6 | #' @param digits
 7 | #'   Used to control significant digits in printout.
 8 | #'
 9 | #' @param ...
10 | #'   Other arguments are passed directly to `print.glmnet()`.
11 | #'
12 | #' @return
13 | #'   No return value, called for side effects.
14 | #'
15 | #'
16 | #' @author
17 | #' Kenji Kondo
18 | #'
19 | #'
20 | #' @export
21 | print.AccurateGLM <- function(x, digits=max(3, getOption("digits") - 3), ...) {
22 |   # It's necessary to use same names for some arguments as the original methods,
23 |   # because devtools::check() issues warnings when using inconsistent names.
24 |   # As a result, we sometimes should accept uncomfortable argument names,
25 |   # but still have rights to use preferable names internally.
26 |   model <- x
27 | 
28 |   print(model@backend_models[[1]], digits, ...)
29 | }
30 | 


--------------------------------------------------------------------------------
/examples/aglm-2.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### Binomial case ####################
 3 | 
 4 | library(aglm)
 5 | library(faraway)
 6 | 
 7 | ## Read data
 8 | xy <- nes96
 9 | 
10 | ## Split data into train and test
11 | n <- nrow(xy) # Sample size.
12 | set.seed(2018) # For reproducibility.
13 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
14 | test <- xy[test.id,] # test is the data.frame for testing.
15 | train <- xy[-test.id,] # train is the data.frame for training.
16 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
17 | y <- train$vote
18 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
19 | 
20 | ## Fit the model
21 | model <- aglm(x, y, family="binomial")
22 | 
23 | ## Make the confusion matrix
24 | lambda <- 0.1
25 | y_true <- test$vote
26 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
27 | 
28 | print(table(y_true, y_pred))
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # What is this?
 2 | Accurate Generalized Linear Model (AGLM) is defined as a regularized GLM which applying a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables.
 3 | More details can be found in [our paper](https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1).
 4 | 
 5 | 2021/6/6:
 6 | Now our paper won [Charles A. Hachemeister Prize](https://www.casact.org/about/awards-prizes-scholarships/charles-hachemeister-prize#:~:text=This\%20prize\%20was\%20established\%20in,between\%20the\%20CAS\%20and\%20ASTIN.).
 7 | 
 8 | # Installation
 9 | ```r
10 | # The simplest way:
11 | install.packages("aglm")
12 | 
13 | # Or the development version from GitHub:
14 | # install.packages("devtools")
15 | devtools::install_github("kkondo1981/aglm", ref="develop")
16 | ```
17 | 
18 | # Usage
19 | See the help as below after installing `aglm`.
20 | 
21 | ```r
22 | library(aglm)
23 | ?"aglm-package"
24 | ```
25 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: "[BUG]"
 5 | labels: bug
 6 | assignees: kkondo1981
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Desktop (please complete the following information):**
27 |  - OS: [e.g. iOS]
28 |  - Browser [e.g. chrome, safari]
29 |  - Version [e.g. 22]
30 | 
31 | **Smartphone (please complete the following information):**
32 |  - Device: [e.g. iPhone6]
33 |  - OS: [e.g. iOS8.1]
34 |  - Browser [e.g. stock browser, safari]
35 |  - Version [e.g. 22]
36 | 
37 | **Additional context**
38 | Add any other context about the problem here.
39 | 


--------------------------------------------------------------------------------
/man/getLVarMatForOneVec.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get-dummies.R
 3 | \name{getLVarMatForOneVec}
 4 | \alias{getLVarMatForOneVec}
 5 | \title{Create L-variable matrix for one variable}
 6 | \usage{
 7 | getLVarMatForOneVec(x_vec, breaks = NULL, nbin.max = 100, only_info = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x_vec}{A numeric vector representing original variable.}
11 | 
12 | \item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).}
13 | 
14 | \item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).}
15 | 
16 | \item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.}
17 | }
18 | \value{
19 | A list with the following fields:
20 | \itemize{
21 | \item \code{breaks}: Same as input
22 | \item \code{dummy_mat}: The created L-variable matrix (only if \code{only_info=FALSE}).
23 | }
24 | }
25 | \description{
26 | Create L-variable matrix for one variable
27 | }
28 | \author{
29 | Kenji Kondo
30 | }
31 | 


--------------------------------------------------------------------------------
/man/executeBinning.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/binning.R
 3 | \name{executeBinning}
 4 | \alias{executeBinning}
 5 | \title{Binning the data to given bins.}
 6 | \usage{
 7 | executeBinning(x_vec, breaks = NULL, nbin.max = 100, method = "freq")
 8 | }
 9 | \arguments{
10 | \item{x_vec}{The data to be binned.}
11 | 
12 | \item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).}
13 | 
14 | \item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).}
15 | 
16 | \item{method}{\code{"freq"} for equal frequency binning or \code{"width"} for equal width binning (used only if \code{breaks=NULL}).}
17 | }
18 | \value{
19 | A list with the following fields:
20 | \itemize{
21 | \item \code{labels}: An integer vector with same length as \code{x_vec}, where \code{labels[i]==k} means the i-th element of \code{x_vec} is in the k-th bin.
22 | \item \code{breaks}: Breaks of bins used for binning.
23 | }
24 | }
25 | \description{
26 | Binning the data to given bins.
27 | }
28 | \author{
29 | Kenji Kondo
30 | }
31 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | # Test environments
 2 | - local
 3 |   - x86_64-w64-mingw32
 4 | - r-hub
 5 |   - linux(R-devel)
 6 |   - macos(R-devel)
 7 |   - windows(R-devel)
 8 | - win-builder (for R-release)
 9 |   - x86_64-w64-mingw32
10 | 
11 | 
12 | # Results of `R CMD check --as-cran`
13 | 
14 | Got 2 notes as below, but I guess it's not so critical and acceptable ones.
15 | 
16 | ## local
17 | ```
18 | * checking top-level files ... NOTE
19 | Files 'README.md' or 'NEWS.md' cannot be checked without 'pandoc' being installed.
20 | ```
21 | 
22 | ## win-builer
23 | ```
24 | * checking CRAN incoming feasibility ... [12s] NOTE
25 | Maintainer: 'Kenji Kondo <kkondo.odnokk@gmail.com>'
26 | 
27 | Found the following (possibly) invalid URLs:
28 |   URL: https://www.casact.org/about/awards-prizes-scholarships/charles-hachemeister-prize#:~:text=This%20prize%20was%20established%20in,between%20the%20CAS%20and%20ASTIN.
29 |     From: README.md
30 |     Status: 403
31 |     Message: Forbidden
32 | ```
33 | 
34 | ## others
35 | ```
36 | Status: OK
37 | ```
38 | 
39 | # `revdepcheck` results
40 | ```
41 | OK: 0
42 | BROKEN: 0
43 | Total time: <1 min
44 | ```
45 | 


--------------------------------------------------------------------------------
/man/CVA_AccurateGLM-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/accurate-glm.R
 3 | \docType{class}
 4 | \name{CVA_AccurateGLM-class}
 5 | \alias{CVA_AccurateGLM-class}
 6 | \title{Class for results of \code{cva.aglm()}}
 7 | \description{
 8 | Class for results of \code{cva.aglm()}
 9 | }
10 | \section{Slots}{
11 | 
12 | \describe{
13 | \item{\code{models_list}}{A list consists of \code{cv.glmnet()}'s results for all \eqn{\alpha} values.}
14 | 
15 | \item{\code{alpha}}{Same as in \link{cv.aglm}.}
16 | 
17 | \item{\code{nfolds}}{Same as in \link{cv.aglm}.}
18 | 
19 | \item{\code{alpha.min.index}}{The index of \code{alpha.min} in the vector \code{alpha}.}
20 | 
21 | \item{\code{alpha.min}}{The \eqn{\alpha} value achieving the minimum loss among all the values of \code{alpha}.}
22 | 
23 | \item{\code{lambda.min}}{The \eqn{\lambda} value achieving the minimum loss when \eqn{\alpha} is equal to \code{alpha.min}.}
24 | 
25 | \item{\code{call}}{An object of class \code{call}, corresponding to the function call when this \code{CVA_AccurateGLM} object is created.}
26 | }}
27 | 
28 | \author{
29 | Kenji Kondo
30 | }
31 | 


--------------------------------------------------------------------------------
/examples/cv-aglm-1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### Cross-validation for lambda ####################
 3 | 
 4 | library(aglm)
 5 | library(faraway)
 6 | 
 7 | ## Read data
 8 | xy <- nes96
 9 | 
10 | ## Split data into train and test
11 | n <- nrow(xy) # Sample size.
12 | set.seed(2018) # For reproducibility.
13 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
14 | test <- xy[test.id,] # test is the data.frame for testing.
15 | train <- xy[-test.id,] # train is the data.frame for training.
16 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
17 | y <- train$vote
18 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
19 | 
20 | # NOTE: Codes bellow will take considerable time, so run it when you have time.
21 | \donttest{
22 | 
23 | ## Fit the model
24 | model <- cv.aglm(x, y, family="binomial")
25 | 
26 | ## Make the confusion matrix
27 | lambda <- model@lambda.min
28 | y_true <- test$vote
29 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
30 | 
31 | cat(sprintf("Confusion matrix for lambda=%.5f:\n", lambda))
32 | print(table(y_true, y_pred))
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/examples/predict-and-plot-1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### using plot() and predict() ####################
 3 | 
 4 | library(MASS) # For Boston
 5 | library(aglm)
 6 | 
 7 | ## Read data
 8 | xy <- Boston # xy is a data.frame to be processed.
 9 | colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y.
10 | 
11 | ## Split data into train and test
12 | n <- nrow(xy) # Sample size.
13 | set.seed(2018) # For reproducibility.
14 | test.id <- sample(n, round(n/4)) # ID numbders for test data.
15 | test <- xy[test.id,] # test is the data.frame for testing.
16 | train <- xy[-test.id,] # train is the data.frame for training.
17 | x <- train[-ncol(xy)]
18 | y <- train$y
19 | newx <- test[-ncol(xy)]
20 | y_true <- test$y
21 | 
22 | ## With the result of aglm()
23 | model <- aglm(x, y)
24 | lambda <- 0.1
25 | 
26 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
27 |      verbose=FALSE, layout=c(3, 3))
28 | 
29 | y_pred <- predict(model, newx=newx, s=lambda)
30 | plot(y_true, y_pred)
31 | 
32 | ## With the result of cv.aglm()
33 | model <- cv.aglm(x, y)
34 | lambda <- model@lambda.min
35 | 
36 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
37 |      verbose=FALSE, layout=c(3, 3))
38 | 
39 | y_pred <- predict(model, newx=newx, s=lambda)
40 | plot(y_true, y_pred)
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/man/getODummyMatForOneVec.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get-dummies.R
 3 | \name{getODummyMatForOneVec}
 4 | \alias{getODummyMatForOneVec}
 5 | \title{Create a O-dummy matrix for one variable}
 6 | \usage{
 7 | getODummyMatForOneVec(
 8 |   x_vec,
 9 |   breaks = NULL,
10 |   nbin.max = 100,
11 |   only_info = FALSE,
12 |   dummy_type = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{x_vec}{A numeric vector representing original variable.}
17 | 
18 | \item{breaks}{A numeric vector representing breaks of bins (If \code{NULL}, automatically generated).}
19 | 
20 | \item{nbin.max}{The maximum number of bins (used only if \code{breaks=NULL}).}
21 | 
22 | \item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.}
23 | 
24 | \item{dummy_type}{Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).}
25 | }
26 | \value{
27 | A list with the following fields:
28 | \itemize{
29 | \item \code{breaks}: Same as input
30 | \item \code{dummy_mat}: The created O-dummy matrix (only if \code{only_info=FALSE}).
31 | }
32 | }
33 | \description{
34 | Create a O-dummy matrix for one variable
35 | }
36 | \author{
37 | Kenji Kondo
38 | }
39 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: aglm
 2 | Type: Package
 3 | Title: Accurate Generalized Linear Model
 4 | Version: 0.4.1
 5 | Authors@R: c(
 6 |     person("Kenji", "Kondo", role=c("aut", "cre", "cph"), email="kkondo.odnokk@gmail.com"),
 7 |     person("Kazuhisa", "Takahashi", role=c("ctb")),
 8 |     person("Hikari", "Banno", role=c("ctb"))
 9 |     )
10 | Description: Provides functions to fit Accurate Generalized Linear Model (AGLM) models, visualize them, and predict for new data. AGLM is defined as a regularized GLM which applies a sort of feature transformations using a discretization of numerical features and specific coding methodologies of dummy variables. For more information on AGLM, see Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020) <https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1>.
11 | URL: https://github.com/kkondo1981/aglm
12 | BugReports: https://github.com/kkondo1981/aglm/issues
13 | License: GPL-2
14 | Encoding: UTF-8
15 | Language: en-US
16 | RoxygenNote: 7.3.2
17 | Roxygen: list(markdown = TRUE)
18 | Depends:
19 |     R (>= 4.0.0),
20 | Imports:
21 |     glmnet (>= 4.0.2),
22 |     assertthat,
23 |     methods,
24 |     mathjaxr
25 | Suggests: 
26 |     testthat,
27 |     knitr,
28 |     rmarkdown,
29 |     MASS,
30 |     faraway
31 | RdMacros:
32 |     mathjaxr
33 | 


--------------------------------------------------------------------------------
/man/getUDummyMatForOneVec.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get-dummies.R
 3 | \name{getUDummyMatForOneVec}
 4 | \alias{getUDummyMatForOneVec}
 5 | \title{Create a U-dummy matrix for one variable}
 6 | \usage{
 7 | getUDummyMatForOneVec(
 8 |   x_vec,
 9 |   levels = NULL,
10 |   drop_last = TRUE,
11 |   only_info = FALSE
12 | )
13 | }
14 | \arguments{
15 | \item{x_vec}{A vector representing original variable.
16 | The class of \code{x_vec} should be one of \code{integer}, \code{character}, or \code{factor}.}
17 | 
18 | \item{levels}{A character vector representing values of \code{x_vec} used to create U-dummies.
19 | If \code{NULL}, all the unique values of \code{x_vec} are used to create dummies.}
20 | 
21 | \item{drop_last}{If \code{TRUE}, the last column of the resulting matrix is dropped to avoid multicollinearity.}
22 | 
23 | \item{only_info}{If \code{TRUE}, only information fields of returned values are filled and no dummy matrix is returned.}
24 | }
25 | \value{
26 | A list with the following fields:
27 | \itemize{
28 | \item \code{levels}: Same as input.
29 | \item \code{drop_last}: Same as input.
30 | \item \code{dummy_mat}: The created U-dummy matrix (only if \code{only_info=FALSE}).
31 | }
32 | }
33 | \description{
34 | Create a U-dummy matrix for one variable
35 | }
36 | \author{
37 | Kenji Kondo
38 | }
39 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(coef,AccurateGLM)
 4 | S3method(deviance,AccurateGLM)
 5 | S3method(plot,AccurateGLM)
 6 | S3method(predict,AccurateGLM)
 7 | S3method(print,AccurateGLM)
 8 | S3method(residuals,AccurateGLM)
 9 | export(aglm)
10 | export(createEqualFreqBins)
11 | export(createEqualWidthBins)
12 | export(cv.aglm)
13 | export(cva.aglm)
14 | export(executeBinning)
15 | export(getLVarMatForOneVec)
16 | export(getODummyMatForOneVec)
17 | export(getUDummyMatForOneVec)
18 | exportClasses(AccurateGLM)
19 | exportClasses(CVA_AccurateGLM)
20 | import(mathjaxr)
21 | importFrom(assertthat,assert_that)
22 | importFrom(glmnet,cv.glmnet)
23 | importFrom(glmnet,glmnet)
24 | importFrom(glmnet,predict.glmnet)
25 | importFrom(grDevices,devAskNewPage)
26 | importFrom(graphics,barplot)
27 | importFrom(graphics,boxplot)
28 | importFrom(graphics,lines)
29 | importFrom(graphics,mtext)
30 | importFrom(graphics,par)
31 | importFrom(graphics,points)
32 | importFrom(graphics,rug)
33 | importFrom(methods,new)
34 | importFrom(stats,IQR)
35 | importFrom(stats,coef)
36 | importFrom(stats,deviance)
37 | importFrom(stats,getCall)
38 | importFrom(stats,ksmooth)
39 | importFrom(stats,predict)
40 | importFrom(stats,quantile)
41 | importFrom(stats,residuals)
42 | importFrom(stats,smooth.spline)
43 | importFrom(utils,flush.console)
44 | importFrom(utils,str)
45 | 


--------------------------------------------------------------------------------
/examples/cva-aglm-1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### Cross-validation for alpha and lambda ####################
 3 | 
 4 | library(aglm)
 5 | library(faraway)
 6 | 
 7 | ## Read data
 8 | xy <- nes96
 9 | 
10 | ## Split data into train and test
11 | n <- nrow(xy) # Sample size.
12 | set.seed(2018) # For reproducibility.
13 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
14 | test <- xy[test.id,] # test is the data.frame for testing.
15 | train <- xy[-test.id,] # train is the data.frame for training.
16 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
17 | y <- train$vote
18 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
19 | 
20 | # NOTE: Codes bellow will take considerable time, so run it when you have time.
21 | \donttest{
22 | 
23 | ## Fit the model
24 | cva_result <- cva.aglm(x, y, family="binomial")
25 | 
26 | alpha <- cva_result@alpha.min
27 | lambda <- cva_result@lambda.min
28 | 
29 | mod_idx <- cva_result@alpha.min.index
30 | model <- cva_result@models_list[[mod_idx]]
31 | 
32 | ## Make the confusion matrix
33 | y_true <- test$vote
34 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
35 | 
36 | cat(sprintf("Confusion matrix for alpha=%.5f and lambda=%.5f:\n", alpha, lambda))
37 | print(table(y_true, y_pred))
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/examples/aglm-1.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### Gaussian case ####################
 3 | 
 4 | library(MASS) # For Boston
 5 | library(aglm)
 6 | 
 7 | ## Read data
 8 | xy <- Boston # xy is a data.frame to be processed.
 9 | colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y.
10 | 
11 | ## Split data into train and test
12 | n <- nrow(xy) # Sample size.
13 | set.seed(2018) # For reproducibility.
14 | test.id <- sample(n, round(n/4)) # ID numbders for test data.
15 | test <- xy[test.id,] # test is the data.frame for testing.
16 | train <- xy[-test.id,] # train is the data.frame for training.
17 | x <- train[-ncol(xy)]
18 | y <- train$y
19 | newx <- test[-ncol(xy)]
20 | y_true <- test$y
21 | 
22 | ## Fit the model
23 | model <- aglm(x, y)  # alpha=1 (the default value)
24 | 
25 | ## Predict for various alpha and lambda
26 | lambda <- 0.1
27 | y_pred <- predict(model, newx=newx, s=lambda)
28 | rmse <- sqrt(mean((y_true - y_pred)^2))
29 | cat(sprintf("RMSE for lambda=%.2f: %.5f \n\n", lambda, rmse))
30 | 
31 | lambda <- 1.0
32 | y_pred <- predict(model, newx=newx, s=lambda)
33 | rmse <- sqrt(mean((y_true - y_pred)^2))
34 | cat(sprintf("RMSE for lambda=%.2f: %.5f \n\n", lambda, rmse))
35 | 
36 | alpha <- 0
37 | model <- aglm(x, y, alpha=alpha)
38 | 
39 | lambda <- 0.1
40 | y_pred <- predict(model, newx=newx, s=lambda)
41 | rmse <- sqrt(mean((y_true - y_pred)^2))
42 | cat(sprintf("RMSE for alpha=%.2f and lambda=%.2f: %.5f \n\n", alpha, lambda, rmse))
43 | 


--------------------------------------------------------------------------------
/examples/lvar-and-extrapolation.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################### use_LVar and extrapolation ####################
 3 | 
 4 | library(MASS) # For Boston
 5 | library(aglm)
 6 | 
 7 | ## Randomly created train and test data
 8 | set.seed(2021)
 9 | sd <- 0.2
10 | x <- 2 * runif(1000) + 1
11 | f <- function(x){x^3 - 6 * x^2 + 13 * x}
12 | y <- f(x) + rnorm(1000, sd = sd)
13 | xy <- data.frame(x=x, y=y)
14 | x_test <- seq(0.75, 3.25, length.out=101)
15 | y_test <- f(x_test) + rnorm(101, sd=sd)
16 | xy_test <- data.frame(x=x_test, y=y_test)
17 | 
18 | ## Plot
19 | nbin.max <- 10
20 | models <- c(cv.aglm(x, y, use_LVar=FALSE, extrapolation="default", nbin.max=nbin.max),
21 |             cv.aglm(x, y, use_LVar=FALSE, extrapolation="flat", nbin.max=nbin.max),
22 |             cv.aglm(x, y, use_LVar=TRUE, extrapolation="default", nbin.max=nbin.max),
23 |             cv.aglm(x, y, use_LVar=TRUE, extrapolation="flat", nbin.max=nbin.max))
24 | 
25 | titles <- c("O-Dummies with extrapolation=\"default\"",
26 |             "O-Dummies with extrapolation=\"flat\"",
27 |             "L-Variables with extrapolation=\"default\"",
28 |             "L-Variables with extrapolation=\"flat\"")
29 | 
30 | par.old <- par(mfrow=c(2, 2))
31 | for (i in 1:4) {
32 |   model <- models[[i]]
33 |   title <- titles[[i]]
34 | 
35 |   pred <- predict(model, newx=x_test, s=model@lambda.min, type="response")
36 | 
37 |   plot(x_test, y_test, pch=20, col="grey", main=title)
38 |   lines(x_test, f(x_test), lty="dashed", lwd=2)  # the theoretical line
39 |   lines(x_test, pred, col="blue", lwd=3)  # the smoothed line by the model
40 | }
41 | par(par.old)
42 | 


--------------------------------------------------------------------------------
/man/coef.AccurateGLM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/coef-aglm.R
 3 | \name{coef.AccurateGLM}
 4 | \alias{coef.AccurateGLM}
 5 | \title{Get coefficients}
 6 | \usage{
 7 | \method{coef}{AccurateGLM}(object, index = NULL, name = NULL, s = NULL, exact = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
11 | 
12 | \item{index}{An integer value representing the index of variable whose coefficients are required.}
13 | 
14 | \item{name}{A string representing the name of variable whose coefficients are required.
15 | Note that if both \code{index} and \code{name} are set, \code{index} is discarded.}
16 | 
17 | \item{s}{Same as in \link[glmnet]{coef.glmnet}.}
18 | 
19 | \item{exact}{Same as in \link[glmnet]{coef.glmnet}.}
20 | 
21 | \item{...}{Other arguments are passed directly to \code{coef.glmnet()}.}
22 | }
23 | \value{
24 | If \code{index} or \code{name} is given, the function returns a list with the one or combination
25 | of the following fields, consisting of coefficients related to the specified variable.
26 | \itemize{
27 | \item \code{coef.linear}: A coefficient of the linear term. (If any)
28 | \item \code{coef.OD}: Coefficients of O-dummies. (If any)
29 | \item \code{coef.UD}: Coefficients of U-dummies. (If any)
30 | \item \code{coef.LV}: Coefficients of L-variables. (If any)
31 | }
32 | 
33 | If both \code{index} and \code{name} are not given, the function returns entire coefficients
34 | corresponding to the internal designed matrix.
35 | }
36 | \description{
37 | Get coefficients
38 | }
39 | \author{
40 | Kenji Kondo
41 | }
42 | 


--------------------------------------------------------------------------------
/man/AccurateGLM-class.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/accurate-glm.R
 3 | \docType{class}
 4 | \name{AccurateGLM-class}
 5 | \alias{AccurateGLM-class}
 6 | \title{Class for results of \code{aglm()} and \code{cv.aglm()}}
 7 | \description{
 8 | Class for results of \code{aglm()} and \code{cv.aglm()}
 9 | }
10 | \section{Slots}{
11 | 
12 | \describe{
13 | \item{\code{backend_models}}{The fitted backend \code{glmnet} model is stored.}
14 | 
15 | \item{\code{vars_info}}{A list, each of whose element is information of one variable.}
16 | 
17 | \item{\code{lambda}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
18 | 
19 | \item{\code{cvm}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
20 | 
21 | \item{\code{cvsd}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
22 | 
23 | \item{\code{cvup}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
24 | 
25 | \item{\code{cvlo}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
26 | 
27 | \item{\code{nzero}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
28 | 
29 | \item{\code{name}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
30 | 
31 | \item{\code{lambda.min}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
32 | 
33 | \item{\code{lambda.1se}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
34 | 
35 | \item{\code{fit.preval}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
36 | 
37 | \item{\code{foldid}}{Same as in the result of \link[glmnet]{cv.glmnet}.}
38 | 
39 | \item{\code{call}}{An object of class \code{call}, corresponding to the function call when this \code{AccurateGLM} object is created.}
40 | }}
41 | 
42 | \author{
43 | Kenji Kondo
44 | }
45 | 


--------------------------------------------------------------------------------
/tests/testthat/test_LVar.R:
--------------------------------------------------------------------------------
 1 | context("LVar")
 2 | library(aglm)
 3 | 
 4 | test_that("getLVarMatForOneVec()'s outputs are correct.", {
 5 |   expect_equal(getLVarMatForOneVec(1:3)$dummy_mat, matrix(c(1, 0, 1), 3, 1))
 6 |   expect_equal(getLVarMatForOneVec(c(1, 1.5, 2, 2.3, 3), breaks=1:3)$dummy_mat,
 7 |                matrix(c(1, 0.5, 0, 0.3, 1), 5, 1))
 8 |   expect_equal(getLVarMatForOneVec(c(1, 1.5, 2, 2.3, 3), breaks=c(0, 1.8, 4))$dummy_mat,
 9 |                matrix(c(0.8, 0.3, 0.2, 0.5, 1.2), 5, 1))
10 | })
11 | 
12 | createX <- function(nobs, nvar_numeric, seed=12345) {
13 |   set.seed(seed)
14 |   nobs <- nobs
15 | 
16 |   data <- list()
17 |   if (nvar_numeric > 0) for (i in 1:nvar_numeric) data[[paste0("Num", i)]] <- rnorm(nobs)
18 | 
19 |   return(data.frame(data))
20 | }
21 | 
22 | test_that("Check newInput() for L-Variable", {
23 |   x <- newInput(createX(10, 1), use_LVar=TRUE)
24 | 
25 |   expect_equal(x@vars_info[[1]]$id, 1)
26 |   expect_equal(x@vars_info[[1]]$data_column_idx, 1)
27 |   expect_equal(x@vars_info[[1]]$type, "quan")
28 |   expect_equal(x@vars_info[[1]]$use_linear, TRUE)
29 |   expect_equal(x@vars_info[[1]]$use_UD, FALSE)
30 |   expect_equal(x@vars_info[[1]]$use_OD, FALSE)
31 |   expect_equal(x@vars_info[[1]]$use_LV, TRUE)
32 |   expect_true(is.null(x@vars_info[[1]]$OD_info))
33 |   expect_true(is.null(x@vars_info[[1]]$UD_info))
34 |   expect_true(!is.null(x@vars_info[[1]]$LV_info))
35 | 
36 |   mat_num <- getDesignMatrix(x)
37 |   expect_equal(mat_num[,1], x@data[,1])
38 |   # '+1' for the linear column
39 |   ncol <- dim(getLVarMatForOneVec(mat_num[,1])$dummy_mat)[2] + 1
40 |   expect_equal(dim(mat_num), c(10, ncol))
41 | 
42 |   bins_list <- list(c(0, 1, 2))
43 |   x <- newInput(createX(10, 1), use_LVar=TRUE, bins_list=bins_list)
44 |   expect_equal(x@vars_info[[1]]$LV_info$breaks, bins_list[[1]])
45 | })
46 | 


--------------------------------------------------------------------------------
/tests/testthat/test_binning.R:
--------------------------------------------------------------------------------
 1 | context("binning")
 2 | library(aglm)
 3 | 
 4 | EPS <- 1e-10
 5 | 
 6 | test_that("createEqualWidthBins()'s outputs are correct.", {
 7 |   expect_equal(createEqualWidthBins(0, 1, 3), c(0.0, 0.5, 1.0), tolerance=EPS)
 8 |   expect_equal(createEqualWidthBins(-33, 22, 14), (0:13) * ((22 + 33) / 13) - 33, tolerance=EPS)
 9 |   expect_equal(length(createEqualWidthBins(-323434, 134034193, 1234568)), 1234568)
10 | })
11 | 
12 | test_that("createEqualFreqBins()'s outputs are correct.", {
13 |   expect_equal(createEqualFreqBins(c(0.1, 0.3, 0.5, 0.8), 4), c(0.1, 0.3, 0.5, 0.8), tolerance=EPS)
14 |   expect_equal(createEqualFreqBins(c(0.1, 0.3, 0.5, 0.8), 3), c(0.1, 0.4, 0.8), tolerance=EPS)
15 |   expect_equal(length(createEqualFreqBins(rnorm(1000), 100)), 100, tolerance=EPS)
16 | 
17 |   x <- ordered(c(1, 1, 1, 1, 1, 2, 2, 3, 3, 4))
18 |   expect_equal(executeBinning(x, nbin.max=100)$labels, as.integer(x) + 1)
19 |   expect_equal(executeBinning(x, nbin.max=4)$labels, c(2, 2, 2, 2, 2, 3, 3, 3, 3, 4))
20 | })
21 | 
22 | test_that("executeBinning()'s outputs are correct.", {
23 |   expect_equal(executeBinning(c(-1, 0, 1.5, 3), breaks=createEqualWidthBins(0, 3, 3))$labels, 1:4)
24 |   expect_equal(executeBinning(c(0, 1, 3), nbin.max=3)$labels, 2:4)
25 |   expect_equal(executeBinning(c(0, 1, 3), nbin.max=3)$breaks, c(0, 1, 3))
26 |   expect_equal(executeBinning(c(0, 1, 3), nbin.max=3, method="freq")$labels, c(2, 3, 4))
27 |   expect_equal(executeBinning(c(0, 1, 3), nbin.max=3, method="freq")$breaks, c(0, 1, 3))
28 |   expect_equal(executeBinning(c(0, 1.5, 3), nbin.max=4, method="width")$labels, c(2, 3, 5))
29 |   expect_equal(executeBinning(c(0, 1.5, 3), nbin.max=4, method="width")$breaks, 0:3)
30 |   expect_equal(executeBinning(c(0, 1, 3))$breaks, c(0, 1, 3))
31 |   expect_equal(length(executeBinning(1:1000)$breaks), 100)
32 | })
33 | 


--------------------------------------------------------------------------------
/tests/testthat/test_get-dummies.R:
--------------------------------------------------------------------------------
 1 | context("get-dummies")
 2 | library(aglm)
 3 | 
 4 | test_that("getUDummyMatForOneVec()'s outputs are correct.", {
 5 |   expect_equal(getUDummyMatForOneVec(1:3)$dummy_mat, matrix(c(1, 0, 0, 0, 1, 0), 3, 2))
 6 |   expect_equal(getUDummyMatForOneVec(1:3, drop_last=FALSE)$dummy_mat, matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), 3, 3))
 7 |   expect_equal(getUDummyMatForOneVec(c("a", "b", "b", "e", "d", "f"))$levels, c("a", "b", "d", "e", "f"))
 8 |   expect_equal(getUDummyMatForOneVec(c("a", "b", "b", "e", "d", "f"), levels=c("a", "b", "c", "d", "e", "f"))$levels, c("a", "b", "c", "d", "e", "f"))
 9 |   expect_equal(getUDummyMatForOneVec(ordered(c(1, 1, 2, 3)))$dummy_mat, matrix(c(1, 1, 0, 0, 0, 0, 1, 0), 4, 2))
10 | })
11 | 
12 | test_that("getODummyMatForOneVec()'s outputs are correct.", {
13 |   ## Tests dummy_type='C'
14 |   expect_equal(getODummyMatForOneVec(1:3, dummy_type="C")$dummy_mat,
15 |                matrix(c(0, 1, 1, 0, 0, 1), 3, 2))
16 |   expect_equal(getODummyMatForOneVec(c(1, 1.5, 2, 2.3, 3), breaks=1:3, dummy_type="C")$dummy_mat,
17 |                matrix(c(0, 0.5, 1, 1, 1, 0, 0, 0, 0.3, 1), 5, 2))
18 |   expect_equal(getODummyMatForOneVec(c(1, 1.5, 2, 2.3, 3), breaks=c(1, 2, 10), dummy_type="C")$dummy_mat,
19 |                matrix(c(0, 0.5, 1, 1, 1, 0, 0, 0, 0.3/8, 1/8), 5, 2))
20 | 
21 |   ## Tests dummy_type='J'
22 |   expect_equal(getODummyMatForOneVec(1:3, dummy_type="J")$dummy_mat,
23 |                matrix(c(1, 1, 1, 0, 1, 1, 0, 0, 1), 3, 3))
24 |   expect_equal(getODummyMatForOneVec(1:3, breaks=c(0, 0.5, 1.2, 3, 4), dummy_type="J")$dummy_mat,
25 |                matrix(c(1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0), 3, 5))
26 |   expect_equal(getODummyMatForOneVec(ordered(c(1, 1, 2, 3)), dummy_type="J")$dummy_mat,
27 |                matrix(c(1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1), 4, 3))
28 | })
29 | 


--------------------------------------------------------------------------------
/man/residuals.AccurateGLM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/residuals-aglm.R
 3 | \name{residuals.AccurateGLM}
 4 | \alias{residuals.AccurateGLM}
 5 | \title{Get residuals of various types}
 6 | \usage{
 7 | \method{residuals}{AccurateGLM}(
 8 |   object,
 9 |   x = NULL,
10 |   y = NULL,
11 |   offset = NULL,
12 |   weights = NULL,
13 |   type = c("working", "pearson", "deviance"),
14 |   s = NULL,
15 |   ...
16 | )
17 | }
18 | \arguments{
19 | \item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
20 | 
21 | \item{x}{A design matrix.
22 | If not given, \code{x} for fitting is used.}
23 | 
24 | \item{y}{A response variable.
25 | If not given, \code{y} for fitting is used.}
26 | 
27 | \item{offset}{An offset values.
28 | If not given, \code{offset} for fitting is used.}
29 | 
30 | \item{weights}{Sample weights.
31 | If not given, \code{weights} for fitting is used.}
32 | 
33 | \item{type}{\loadmathjax
34 | A string representing type of deviance:
35 | \itemize{
36 | \item \code{"working"} get working residual
37 | \mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},}
38 | where \eqn{y_i} is a response value, \eqn{\mu} is GLM mean, and \eqn{\eta=g^{-1}(\mu)} with the link function \eqn{g}.
39 | \item \code{"pearson"} get Pearson residuals
40 | \mjsdeqn{r^P_i = \frac{y_i - \mu_i}{\sqrt{V(\mu_i)}},}
41 | where \eqn{V} is the variance function.
42 | \item \code{"deviance"} get deviance residuals
43 | \mjsdeqn{r^D_i = {\rm sign}(y_i - \mu_i) \sqrt{d_i},}
44 | where \eqn{d_i} is the contribution to deviance.
45 | }}
46 | 
47 | \item{s}{A numeric value specifying \eqn{\lambda} at which residuals are calculated.}
48 | 
49 | \item{...}{Other arguments are currently not used and just discarded.}
50 | }
51 | \value{
52 | A numeric vector representing calculated residuals.
53 | }
54 | \description{
55 | Get residuals of various types
56 | }
57 | \author{
58 | Kenji Kondo
59 | }
60 | 


--------------------------------------------------------------------------------
/man/predict.AccurateGLM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/predict-aglm.R
 3 | \name{predict.AccurateGLM}
 4 | \alias{predict.AccurateGLM}
 5 | \title{Make predictions for new data}
 6 | \usage{
 7 | \method{predict}{AccurateGLM}(
 8 |   object,
 9 |   newx = NULL,
10 |   s = NULL,
11 |   type = c("link", "response", "coefficients", "nonzero", "class"),
12 |   exact = FALSE,
13 |   newoffset,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{object}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
19 | 
20 | \item{newx}{A design matrix for new data.
21 | See the description of \code{x} in \link{aglm} for more details.}
22 | 
23 | \item{s}{Same as in \link[glmnet]{predict.glmnet}.}
24 | 
25 | \item{type}{Same as in \link[glmnet]{predict.glmnet}.}
26 | 
27 | \item{exact}{Same as in \link[glmnet]{predict.glmnet}.}
28 | 
29 | \item{newoffset}{Same as in \link[glmnet]{predict.glmnet}.}
30 | 
31 | \item{...}{Other arguments are passed directly when calling \code{predict.glmnet()}.}
32 | }
33 | \value{
34 | The returned object depends on \code{type}.
35 | See \link[glmnet]{predict.glmnet} for more details.
36 | }
37 | \description{
38 | Make predictions for new data
39 | }
40 | \examples{
41 | 
42 | #################### using plot() and predict() ####################
43 | 
44 | library(MASS) # For Boston
45 | library(aglm)
46 | 
47 | ## Read data
48 | xy <- Boston # xy is a data.frame to be processed.
49 | colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y.
50 | 
51 | ## Split data into train and test
52 | n <- nrow(xy) # Sample size.
53 | set.seed(2018) # For reproducibility.
54 | test.id <- sample(n, round(n/4)) # ID numbders for test data.
55 | test <- xy[test.id,] # test is the data.frame for testing.
56 | train <- xy[-test.id,] # train is the data.frame for training.
57 | x <- train[-ncol(xy)]
58 | y <- train$y
59 | newx <- test[-ncol(xy)]
60 | y_true <- test$y
61 | 
62 | ## With the result of aglm()
63 | model <- aglm(x, y)
64 | lambda <- 0.1
65 | 
66 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
67 |      verbose=FALSE, layout=c(3, 3))
68 | 
69 | y_pred <- predict(model, newx=newx, s=lambda)
70 | plot(y_true, y_pred)
71 | 
72 | ## With the result of cv.aglm()
73 | model <- cv.aglm(x, y)
74 | lambda <- model@lambda.min
75 | 
76 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
77 |      verbose=FALSE, layout=c(3, 3))
78 | 
79 | y_pred <- predict(model, newx=newx, s=lambda)
80 | plot(y_true, y_pred)
81 | 
82 | 
83 | }
84 | \references{
85 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
86 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
87 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
88 | \emph{Actuarial Colloquium Paris 2020}
89 | }
90 | \author{
91 | \itemize{
92 | \item Kenji Kondo,
93 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
94 | }
95 | }
96 | 


--------------------------------------------------------------------------------
/R/cva-aglm.R:
--------------------------------------------------------------------------------
 1 | #' Fit an AGLM model with cross-validation for both \eqn{\alpha} and \eqn{\lambda}
 2 | #'
 3 | #' A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}.
 4 | #' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
 5 | #'
 6 | #' @param x
 7 | #'   A design matrix.
 8 | #'   See \link{aglm} for more details.
 9 | #'
10 | #' @param y
11 | #'   A response variable.
12 | #'
13 | #' @param alpha
14 | #'   A numeric vector representing \eqn{\alpha} values to be examined in cross-validation.
15 | #'
16 | #' @param nfolds
17 | #'   An integer value representing the number of folds.
18 | #'
19 | #' @param foldid
20 | #'   An integer vector with the same length as observations.
21 | #'   Each element should take a value from 1 to `nfolds`, identifying which fold it belongs.
22 | #'
23 | #' @param parallel.alpha
24 | #'   (not used yet)
25 | #'
26 | #' @param ...
27 | #'   Other arguments are passed directly to `cv.aglm()`.
28 | #'
29 | #' @return
30 | #'   An object storing fitted models and information of cross-validation.
31 | #'   See \link{CVA_AccurateGLM-class} for more details.
32 | #'
33 | #'
34 | #' @example examples/cva-aglm-1.R
35 | #'
36 | #'
37 | #' @author
38 | #'   * Kenji Kondo,
39 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
40 | #'
41 | #'
42 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
43 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
44 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
45 | #' \emph{Actuarial Colloquium Paris 2020}
46 | #'
47 | #'
48 | #' @export
49 | #' @importFrom assertthat assert_that
50 | #' @importFrom methods new
51 | cva.aglm <- function(x, y,
52 |                      alpha=seq(0, 1, len=11)^3,
53 |                      nfolds=10,
54 |                      foldid=NULL,
55 |                      parallel.alpha=FALSE,
56 |                      ...) {
57 |   nfolds <- as.integer(nfolds)
58 | 
59 |   ## The function called to search lambda
60 |   .cvfunc <- function(a, x, y, nfolds, foldid, ...) {
61 |     cv.aglm(x, y, alpha=a, nfolds=nfolds, foldid=foldid, ...)
62 |   }
63 | 
64 |   ## Calculates for all alphas and lambdas
65 |   if (parallel.alpha) {
66 |     assert_that(FALSE, msg="parallel computation is not implemented yet.")
67 |   } else {
68 |     modlist <- lapply(alpha, .cvfunc, x=x, y=y, nfolds=nfolds, foldid=foldid, ...)
69 |   }
70 | 
71 |   ## Finds the pair (alpha, lambda), which achieves minimum loss
72 |   alpha.min.index <- which.min(lapply(modlist, function(mod){min(mod@cvm)}))
73 |   if (length(alpha.min.index) > 1) alpha.min.index <-alpha.min.index[1]
74 |   alpha.min <-alpha[alpha.min.index]
75 |   lambda.min <- modlist[[alpha.min.index]]@lambda.min
76 | 
77 |   return(new("CVA_AccurateGLM", models_list=modlist,
78 |              alpha=alpha,
79 |              nfolds=nfolds,
80 |              alpha.min.index=alpha.min.index,
81 |              alpha.min=alpha.min,
82 |              lambda.min=lambda.min,
83 |              call=match.call))
84 | }
85 | 


--------------------------------------------------------------------------------
/man/cva.aglm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cva-aglm.R
 3 | \name{cva.aglm}
 4 | \alias{cva.aglm}
 5 | \title{Fit an AGLM model with cross-validation for both \eqn{\alpha} and \eqn{\lambda}}
 6 | \usage{
 7 | cva.aglm(
 8 |   x,
 9 |   y,
10 |   alpha = seq(0, 1, len = 11)^3,
11 |   nfolds = 10,
12 |   foldid = NULL,
13 |   parallel.alpha = FALSE,
14 |   ...
15 | )
16 | }
17 | \arguments{
18 | \item{x}{A design matrix.
19 | See \link{aglm} for more details.}
20 | 
21 | \item{y}{A response variable.}
22 | 
23 | \item{alpha}{A numeric vector representing \eqn{\alpha} values to be examined in cross-validation.}
24 | 
25 | \item{nfolds}{An integer value representing the number of folds.}
26 | 
27 | \item{foldid}{An integer vector with the same length as observations.
28 | Each element should take a value from 1 to \code{nfolds}, identifying which fold it belongs.}
29 | 
30 | \item{parallel.alpha}{(not used yet)}
31 | 
32 | \item{...}{Other arguments are passed directly to \code{cv.aglm()}.}
33 | }
34 | \value{
35 | An object storing fitted models and information of cross-validation.
36 | See \link{CVA_AccurateGLM-class} for more details.
37 | }
38 | \description{
39 | A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}.
40 | See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
41 | }
42 | \examples{
43 | 
44 | #################### Cross-validation for alpha and lambda ####################
45 | 
46 | library(aglm)
47 | library(faraway)
48 | 
49 | ## Read data
50 | xy <- nes96
51 | 
52 | ## Split data into train and test
53 | n <- nrow(xy) # Sample size.
54 | set.seed(2018) # For reproducibility.
55 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
56 | test <- xy[test.id,] # test is the data.frame for testing.
57 | train <- xy[-test.id,] # train is the data.frame for training.
58 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
59 | y <- train$vote
60 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
61 | 
62 | # NOTE: Codes bellow will take considerable time, so run it when you have time.
63 | \donttest{
64 | 
65 | ## Fit the model
66 | cva_result <- cva.aglm(x, y, family="binomial")
67 | 
68 | alpha <- cva_result@alpha.min
69 | lambda <- cva_result@lambda.min
70 | 
71 | mod_idx <- cva_result@alpha.min.index
72 | model <- cva_result@models_list[[mod_idx]]
73 | 
74 | ## Make the confusion matrix
75 | y_true <- test$vote
76 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
77 | 
78 | cat(sprintf("Confusion matrix for alpha=\%.5f and lambda=\%.5f:\n", alpha, lambda))
79 | print(table(y_true, y_pred))
80 | 
81 | }
82 | }
83 | \references{
84 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
85 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
86 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
87 | \emph{Actuarial Colloquium Paris 2020}
88 | }
89 | \author{
90 | \itemize{
91 | \item Kenji Kondo,
92 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
93 | }
94 | }
95 | 


--------------------------------------------------------------------------------
/R/accurate-glm.R:
--------------------------------------------------------------------------------
 1 | #' Class for results of `aglm()` and `cv.aglm()`
 2 | #'
 3 | #' @slot backend_models The fitted backend `glmnet` model is stored.
 4 | #' @slot vars_info A list, each of whose element is information of one variable.
 5 | #' @slot lambda Same as in the result of \link[glmnet]{cv.glmnet}.
 6 | #' @slot cvm Same as in the result of \link[glmnet]{cv.glmnet}.
 7 | #' @slot cvsd Same as in the result of \link[glmnet]{cv.glmnet}.
 8 | #' @slot cvup Same as in the result of \link[glmnet]{cv.glmnet}.
 9 | #' @slot cvlo Same as in the result of \link[glmnet]{cv.glmnet}.
10 | #' @slot nzero Same as in the result of \link[glmnet]{cv.glmnet}.
11 | #' @slot name Same as in the result of \link[glmnet]{cv.glmnet}.
12 | #' @slot lambda.min Same as in the result of \link[glmnet]{cv.glmnet}.
13 | #' @slot lambda.1se Same as in the result of \link[glmnet]{cv.glmnet}.
14 | #' @slot fit.preval Same as in the result of \link[glmnet]{cv.glmnet}.
15 | #' @slot foldid Same as in the result of \link[glmnet]{cv.glmnet}.
16 | #' @slot call An object of class `call`, corresponding to the function call when this `AccurateGLM` object is created.
17 | #'
18 | #' @author Kenji Kondo
19 | #'
20 | #' @export
21 | setClass("AccurateGLM",
22 |          representation=representation(backend_models="list",
23 |                                        vars_info="list",
24 |                                        lambda="numeric",
25 |                                        cvm="numeric",
26 |                                        cvsd="numeric",
27 |                                        cvup="numeric",
28 |                                        cvlo="numeric",
29 |                                        nzero="integer",
30 |                                        name="character",
31 |                                        lambda.min="numeric",
32 |                                        lambda.1se="numeric",
33 |                                        fit.preval="matrix",
34 |                                        foldid="integer",
35 |                                        call="ANY"))
36 | 
37 | 
38 | #' Class for results of `cva.aglm()`
39 | #'
40 | #' @slot models_list A list consists of `cv.glmnet()`'s results for all \eqn{\alpha} values.
41 | #' @slot alpha Same as in \link{cv.aglm}.
42 | #' @slot nfolds Same as in \link{cv.aglm}.
43 | #' @slot alpha.min.index The index of `alpha.min` in the vector `alpha`.
44 | #' @slot alpha.min The \eqn{\alpha} value achieving the minimum loss among all the values of `alpha`.
45 | #' @slot lambda.min The \eqn{\lambda} value achieving the minimum loss when \eqn{\alpha} is equal to `alpha.min`.
46 | #' @slot call An object of class `call`, corresponding to the function call when this `CVA_AccurateGLM` object is created.
47 | #'
48 | #' @author Kenji Kondo
49 | #'
50 | #' @export
51 | setClass("CVA_AccurateGLM",
52 |          representation=representation(models_list="list",
53 |                                        alpha="numeric",
54 |                                        nfolds="integer",
55 |                                        alpha.min.index="integer",
56 |                                        alpha.min="numeric",
57 |                                        lambda.min="numeric",
58 |                                        call="ANY"))
59 | 


--------------------------------------------------------------------------------
/.github/workflows/rhub.yaml:
--------------------------------------------------------------------------------
 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at
 2 | # https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml
 3 | # You can update this file to a newer version using the rhub2 package:
 4 | #
 5 | # rhub::rhub_setup()
 6 | #
 7 | # It is unlikely that you need to modify this file manually.
 8 | 
 9 | name: R-hub
10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}"
11 | 
12 | on:
13 |   workflow_dispatch:
14 |     inputs:
15 |       config:
16 |         description: 'A comma separated list of R-hub platforms to use.'
17 |         type: string
18 |         default: 'linux,windows,macos'
19 |       name:
20 |         description: 'Run name. You can leave this empty now.'
21 |         type: string
22 |       id:
23 |         description: 'Unique ID. You can leave this empty now.'
24 |         type: string
25 | 
26 | jobs:
27 | 
28 |   setup:
29 |     runs-on: ubuntu-latest
30 |     outputs:
31 |       containers: ${{ steps.rhub-setup.outputs.containers }}
32 |       platforms: ${{ steps.rhub-setup.outputs.platforms }}
33 | 
34 |     steps:
35 |     # NO NEED TO CHECKOUT HERE
36 |     - uses: r-hub/actions/setup@v1
37 |       with:
38 |         config: ${{ github.event.inputs.config }}
39 |       id: rhub-setup
40 | 
41 |   linux-containers:
42 |     needs: setup
43 |     if: ${{ needs.setup.outputs.containers != '[]' }}
44 |     runs-on: ubuntu-latest
45 |     name: ${{ matrix.config.label }}
46 |     strategy:
47 |       fail-fast: false
48 |       matrix:
49 |         config: ${{ fromJson(needs.setup.outputs.containers) }}
50 |     container:
51 |       image: ${{ matrix.config.container }}
52 | 
53 |     steps:
54 |       - uses: r-hub/actions/checkout@v1
55 |       - uses: r-hub/actions/platform-info@v1
56 |         with:
57 |           token: ${{ secrets.RHUB_TOKEN }}
58 |           job-config: ${{ matrix.config.job-config }}
59 |       - uses: r-hub/actions/setup-deps@v1
60 |         with:
61 |           token: ${{ secrets.RHUB_TOKEN }}
62 |           job-config: ${{ matrix.config.job-config }}
63 |       - uses: r-hub/actions/run-check@v1
64 |         with:
65 |           token: ${{ secrets.RHUB_TOKEN }}
66 |           job-config: ${{ matrix.config.job-config }}
67 | 
68 |   other-platforms:
69 |     needs: setup
70 |     if: ${{ needs.setup.outputs.platforms != '[]' }}
71 |     runs-on: ${{ matrix.config.os }}
72 |     name: ${{ matrix.config.label }}
73 |     strategy:
74 |       fail-fast: false
75 |       matrix:
76 |         config: ${{ fromJson(needs.setup.outputs.platforms) }}
77 | 
78 |     steps:
79 |       - uses: r-hub/actions/checkout@v1
80 |       - uses: r-hub/actions/setup-r@v1
81 |         with:
82 |           job-config: ${{ matrix.config.job-config }}
83 |           token: ${{ secrets.RHUB_TOKEN }}
84 |       - uses: r-hub/actions/platform-info@v1
85 |         with:
86 |           token: ${{ secrets.RHUB_TOKEN }}
87 |           job-config: ${{ matrix.config.job-config }}
88 |       - uses: r-hub/actions/setup-deps@v1
89 |         with:
90 |           job-config: ${{ matrix.config.job-config }}
91 |           token: ${{ secrets.RHUB_TOKEN }}
92 |       - uses: r-hub/actions/run-check@v1
93 |         with:
94 |           job-config: ${{ matrix.config.job-config }}
95 |           token: ${{ secrets.RHUB_TOKEN }}
96 | 


--------------------------------------------------------------------------------
/R/binning.R:
--------------------------------------------------------------------------------
  1 | # Inner function used in executeBinning
  2 | isBinningFeasible <- function(x_vec) {
  3 |   return(is.integer(x_vec) | is.numeric(x_vec) | is.ordered(x_vec))
  4 | }
  5 | 
  6 | 
  7 | #' Create bins (equal width binning)
  8 | #'
  9 | #' @param left The leftmost value of the interval to be binned.
 10 | #' @param right The rightmost value of the interval to be binned.
 11 | #' @param nbin The number of bins.
 12 | #'
 13 | #' @return A numeric vector representing breaks obtained by binning.
 14 | #'
 15 | #' @author Kenji Kondo
 16 | #'
 17 | #' @export
 18 | #' @importFrom assertthat assert_that
 19 | createEqualWidthBins <- function(left, right, nbin){
 20 |   nbin <- as.integer(nbin)
 21 |   assert_that(length(nbin) == 1 & nbin > 1)
 22 | 
 23 |   breaks <- (0:(nbin - 1)) * ((right - left) / (nbin - 1)) + left
 24 |   return(breaks)
 25 | }
 26 | 
 27 | 
 28 | #' Create bins (equal frequency binning)
 29 | #'
 30 | #' @param x_vec  A numeric vector, whose quantiles are used as breaks.
 31 | #' @param nbin.max The maximum number of bins.
 32 | #'
 33 | #' @return A numeric vector representing breaks obtained by binning.
 34 | #'   Note that the number of bins is equal to `min(nbin.max, length(x_vec))`.
 35 | #'
 36 | #' @author Kenji Kondo
 37 | #'
 38 | #' @export
 39 | #' @importFrom assertthat assert_that
 40 | #' @importFrom stats quantile
 41 | createEqualFreqBins <- function(x_vec, nbin.max) {
 42 |   nbin.max <- as.integer(nbin.max)
 43 |   assert_that(nbin.max > 1 & length(x_vec) > 0)
 44 | 
 45 |   nbin <- min(nbin.max, length(x_vec))
 46 |   percents <- seq(0, 1, 1 / (nbin - 1))
 47 |   if (is.ordered(x_vec)) {
 48 |     breaks <- unique(quantile(x_vec, percents, type=1))
 49 |     names(breaks) <- NULL
 50 |   } else { # integer or numeric cases
 51 |     breaks <- unique(as.numeric(quantile(x_vec, percents)))
 52 |   }
 53 | 
 54 |   return(breaks)
 55 | }
 56 | 
 57 | 
 58 | #' Binning the data to given bins.
 59 | #'
 60 | #' @param x_vec The data to be binned.
 61 | #' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated).
 62 | #' @param nbin.max The maximum number of bins (used only if `breaks=NULL`).
 63 | #' @param method `"freq"` for equal frequency binning or `"width"` for equal width binning (used only if `breaks=NULL`).
 64 | #'
 65 | #' @return A list with the following fields:
 66 | #' * `labels`: An integer vector with same length as `x_vec`, where `labels[i]==k` means the i-th element of `x_vec` is in the k-th bin.
 67 | #' * `breaks`: Breaks of bins used for binning.
 68 | #'
 69 | #' @author Kenji Kondo
 70 | #'
 71 | #' @export
 72 | #' @importFrom assertthat assert_that
 73 | executeBinning <- function(x_vec, breaks=NULL, nbin.max=100, method="freq") {
 74 |   # Check arguments
 75 |   assert_that(isBinningFeasible(x_vec))
 76 | 
 77 |   # If breaks is NULL, generate bins by self.
 78 |   if (is.null(breaks)) {
 79 |     left <- min(x_vec)
 80 |     right <- max(x_vec)
 81 |     if (method == "freq") {
 82 |       breaks <- createEqualFreqBins(x_vec, nbin.max)
 83 |     } else if (method == "width") {
 84 |       breaks <- createEqualWidthBins(left, right, nbin.max)
 85 |     } else {
 86 |       assert_that(FALSE, msg="wrong 'method' argument.")
 87 |     }
 88 |   }
 89 | 
 90 |   if (is.ordered(x_vec)) {
 91 |     labels <- as.integer(rep(1, length(x_vec)))
 92 |     for (i in seq(length(breaks))) {
 93 |       labels[x_vec >= breaks[i]] <- i + 1
 94 |     }
 95 |   } else { # integer or numeric cases
 96 |     # Calc labels for each element
 97 |     labels <- cut(x_vec, breaks=c(-Inf, breaks, Inf), labels=FALSE, right=FALSE)
 98 |   }
 99 | 
100 |   return(list(labels=labels, breaks=breaks))
101 | }
102 | 


--------------------------------------------------------------------------------
/R/predict-aglm.R:
--------------------------------------------------------------------------------
  1 | #' Make predictions for new data
  2 | #'
  3 | #' @param object
  4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
  5 | #'
  6 | #' @param newx
  7 | #'   A design matrix for new data.
  8 | #'   See the description of `x` in \link{aglm} for more details.
  9 | #'
 10 | #' @param s
 11 | #'   Same as in \link[glmnet]{predict.glmnet}.
 12 | #'
 13 | #' @param type
 14 | #'   Same as in \link[glmnet]{predict.glmnet}.
 15 | #'
 16 | #' @param exact
 17 | #'   Same as in \link[glmnet]{predict.glmnet}.
 18 | #'
 19 | #' @param newoffset
 20 | #'   Same as in \link[glmnet]{predict.glmnet}.
 21 | 
 22 | #' @param ...
 23 | #'   Other arguments are passed directly when calling `predict.glmnet()`.
 24 | #'
 25 | #' @return
 26 | #'   The returned object depends on `type`.
 27 | #'   See \link[glmnet]{predict.glmnet} for more details.
 28 | #'
 29 | #'
 30 | #' @example examples/predict-and-plot-1.R
 31 | #'
 32 | #'
 33 | #' @author
 34 | #'   * Kenji Kondo,
 35 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
 36 | #'
 37 | #'
 38 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
 39 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
 40 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
 41 | #' \emph{Actuarial Colloquium Paris 2020}
 42 | #'
 43 | #'
 44 | #' @export
 45 | #' @importFrom assertthat assert_that
 46 | #' @importFrom glmnet predict.glmnet
 47 | #' @importFrom methods new
 48 | #' @importFrom stats predict
 49 | predict.AccurateGLM <- function(object,
 50 |                                 newx=NULL,
 51 |                                 s=NULL,
 52 |                                 type=c("link","response","coefficients","nonzero","class"),
 53 |                                 exact=FALSE,
 54 |                                 newoffset,
 55 |                                 ...) {
 56 |   # It's necessary to use same names for some arguments as the original methods,
 57 |   # because devtools::check() issues warnings when using inconsistent names.
 58 |   # As a result, we sometimes should accept uncomfortable argument names,
 59 |   # but still have rights to use preferable names internally.
 60 |   model <- object
 61 | 
 62 |   # Check and set `type`
 63 |   type <- match.arg(type)
 64 | 
 65 |   # Create an input object
 66 |   if (class(newx)[1] != "data.frame") newx <- data.frame(newx)
 67 |   for (i in seq(dim(newx)[2])) {
 68 |     var_info <- model@vars_info[[i]]
 69 |     if (var_info$type == "quan") newx[, i] <- as.numeric(newx[, i])
 70 |     else if (var_info$type == "qual") {
 71 |       if (var_info$use_OD & !is.ordered(newx[, i])) newx[, i] <- ordered(newx[, i])
 72 |       else if (!is.factor(newx[, i])) newx[, i] <- factor(newx[, i])
 73 |     }
 74 |   }
 75 |   newx <- new("AGLM_Input", vars_info=model@vars_info, data=newx)
 76 | 
 77 |   # Create a design matrix passed to backend API
 78 |   x_for_backend <- getDesignMatrix(newx)
 79 | 
 80 |   # Select what is to be given predict() as a model
 81 |   backend_model <- model@backend_models[[1]]
 82 | 
 83 |   model_name <- names(model@backend_models)[[1]]
 84 |   if (model_name == "cv.glmnet") {
 85 |     if (is.character(s)) {
 86 |       if (s == "lambda.min")
 87 |         s <- model@lambda.min
 88 |       if (s == "lambda.1se")
 89 |         s <- model@lambda.1se
 90 |     }
 91 |   }
 92 | 
 93 |   glmnet_result <- predict(backend_model,
 94 |                            x_for_backend,
 95 |                            s=s,
 96 |                            type=type,
 97 |                            exact=exact,
 98 |                            newoffset,
 99 |                            ...)
100 | 
101 |   return(glmnet_result)
102 | }
103 | 


--------------------------------------------------------------------------------
/tests/testthat/test_aglm-input-2.R:
--------------------------------------------------------------------------------
  1 | context("aglm-input-2")
  2 | library(MASS) # For Boston.
  3 | library(aglm)
  4 | 
  5 | # Check design matrices for actual data are same as those produced by Iwasawa-san's original codes
  6 | # Utility functions written by Iwasawa-san for oridinal AGLM scripts
  7 | 
  8 | # Function to produce a data.frame of O-dummies
  9 | make.bins <- function(data, max.nbin = 100){
 10 |   temp <- apply(data, 2, function(x){as.vector(quantile(x, seq(0, 1, 1 / (min(max.nbin, length(x)) - 1))))})
 11 |   apply(temp, 2, unique)
 12 | }
 13 | 
 14 | OD <- function(data, bins){
 15 |   x.OD <- NULL
 16 |   for (i in 1:length(bins)){
 17 |     n <- nrow(data)
 18 |     m <- length(bins[[i]])
 19 |     for(j in 1:m){
 20 |       temp <- data[,i]
 21 |       for(k in 1:n){
 22 |         temp[k] <- ifelse(temp[k] < bins[[i]][j], 0, 1)
 23 |       }
 24 |       x.OD <- cbind(x.OD, temp)
 25 |       colnames(x.OD)[ncol(x.OD)] <- paste0(colnames(data)[i], j)
 26 |     }
 27 |   }
 28 |   x.OD
 29 | }
 30 | 
 31 | UD <- function(x, name){
 32 |   x.UD <- NULL
 33 |   for(i in 1:(nlevels(x))){
 34 |     lev <- levels(x)[i]
 35 |     temp <- sapply(x, FUN = function(x){ifelse(x == lev, 1, 0)})
 36 |     x.UD <- cbind(x.UD, temp)
 37 |     colnames(x.UD)[i] <- paste0(name, levels(x)[i])
 38 |   }
 39 |   x.UD
 40 | }
 41 | 
 42 | ## Function to produce a data.frame of two-way interactions
 43 | ints.mat <- function(data){
 44 |   ints <- NULL
 45 |   temp <- colnames(data)
 46 |   for(i in 1:(ncol(data)-1)){
 47 |     for(j in (i+1):ncol(data)){
 48 |       ints <- cbind(ints, data[, i] * data[, j])
 49 |       colnames(ints)[ncol(ints)] <- paste0(temp[i], ".", temp[j])
 50 |     }
 51 |   }
 52 |   ints
 53 | }
 54 | 
 55 | # To compare two matrices allowing column-reordering, we use checksums.
 56 | compare_mat_without_column_order <- function(x, y) {
 57 |   if (!all(dim(x) == dim(y)))
 58 |     return(FALSE)
 59 | 
 60 |   # Row-wise equality test. Orders are considered.
 61 |   if (!all(apply(x, FUN=sum, MARGIN=1) == apply(x, FUN=sum, MARGIN=1)))
 62 |     return(FALSE)
 63 | 
 64 |   # Column-wise equality test. Orders are not considered.
 65 |   if (!all(sort(apply(x, FUN=sum, MARGIN=2)) == sort(apply(x, FUN=sum, MARGIN=2))))
 66 |     return(FALSE)
 67 | 
 68 |   return(TRUE)
 69 | }
 70 | 
 71 | # Tests
 72 | test_that("Check design matrix for actual data 1", {
 73 |   ## Read data
 74 |   x <- Boston[-ncol(Boston)]
 75 | 
 76 |   ## Create design matrix of aglm
 77 |   DM.aglm <- getDesignMatrix(newInput(x, OD_type_of_quantitatives='J'))
 78 | 
 79 |   ## Create design matrix to be compared
 80 |   DM.Iwasawa <- cbind(OD(x, make.bins(x)), as.matrix(x), ints.mat(x))
 81 | 
 82 |   ## Test if two design matrice are same
 83 |   expect_true(compare_mat_without_column_order(DM.aglm, DM.Iwasawa))
 84 | })
 85 | 
 86 | test_that("Check design matrix for actual data 2", {
 87 |   ## Read data
 88 |   x <- Boston[, -ncol(Boston)]
 89 | 
 90 |   ## Create bins
 91 |   bins_list <- make.bins(x[, colnames(x) != "chas"])
 92 |   bins_names <- colnames(x)[colnames(x) != "chas"]
 93 | 
 94 |   ## Set chas and rad variables as factors
 95 |   x$chas <- as.factor(x$chas)
 96 |   x$rad <- as.ordered(x$rad)
 97 | 
 98 |   ## Create design matrix of aglm
 99 |   input.aglm <- newInput(x, bins_list=bins_list, bins_names=bins_names, OD_type_of_quantitatives='J')
100 |   DM.aglm <- getDesignMatrix(input.aglm)
101 | 
102 |   ## Create design matrix to be compared
103 |   x.OD <- OD(as.matrix(x[colnames(x) != "chas"]), bins_list)
104 |   x.UD <- cbind(UD(x$chas, "chas"), UD(x$rad, "rad"))
105 |   x.linear <- as.matrix(x[!colnames(x) %in% c("chas", "rad")])
106 |   x.ints <- ints.mat(cbind(x.UD, x.linear))
107 |   DM.Iwasawa <- cbind(x.linear, x.OD, x.UD, x.ints)
108 | 
109 |   ## Test if two design matrice are same
110 |   expect_true(compare_mat_without_column_order(DM.aglm, DM.Iwasawa))
111 | })
112 | 


--------------------------------------------------------------------------------
/R/coef-aglm.R:
--------------------------------------------------------------------------------
  1 | #' Get coefficients
  2 | #'
  3 | #' @param object
  4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
  5 | #'
  6 | #' @param index
  7 | #'   An integer value representing the index of variable whose coefficients are required.
  8 | #'
  9 | #' @param name
 10 | #'   A string representing the name of variable whose coefficients are required.
 11 | #'   Note that if both `index` and `name` are set, `index` is discarded.
 12 | #'
 13 | #' @param s
 14 | #'   Same as in \link[glmnet]{coef.glmnet}.
 15 | #'
 16 | #' @param exact
 17 | #'   Same as in \link[glmnet]{coef.glmnet}.
 18 | #'
 19 | #' @param ...
 20 | #'   Other arguments are passed directly to `coef.glmnet()`.
 21 | #'
 22 | #' @return
 23 | #'   If `index` or `name` is given, the function returns a list with the one or combination
 24 | #'   of the following fields, consisting of coefficients related to the specified variable.
 25 | #'   * `coef.linear`: A coefficient of the linear term. (If any)
 26 | #'   * `coef.OD`: Coefficients of O-dummies. (If any)
 27 | #'   * `coef.UD`: Coefficients of U-dummies. (If any)
 28 | #'   * `coef.LV`: Coefficients of L-variables. (If any)
 29 | #'
 30 | #'   If both `index` and `name` are not given, the function returns entire coefficients
 31 | #'   corresponding to the internal designed matrix.
 32 | #'
 33 | #'
 34 | #' @author
 35 | #' Kenji Kondo
 36 | #'
 37 | #'
 38 | #' @importFrom assertthat assert_that
 39 | #' @importFrom stats coef
 40 | #' @export
 41 | coef.AccurateGLM <- function(object, index=NULL, name=NULL, s=NULL, exact=FALSE, ...) {
 42 |   # It's necessary to use same names for some arguments as the original methods,
 43 |   # because devtools::check() issues warnings when using inconsistent names.
 44 |   # As a result, we sometimes should accept uncomfortable argument names,
 45 |   # but still have rights to use preferable names internally.
 46 |   model <- object
 47 | 
 48 |   coefs <- coef(model@backend_models[[1]], s, exact, ...)
 49 | 
 50 |   # If `name` is set, `index` is overwritten.
 51 |   if (!is.null(name)) {
 52 |     tmp <- model@vars_info[sapply(model@vars_info, function(v){v$name}) == name]
 53 |     assert_that(length(tmp) == 1)
 54 |     index <- tmp[[1]]$idx
 55 |   }
 56 | 
 57 |   # If `index` or `name` is set, returns coefficients of that variable.
 58 |   if (!is.null(index)) {
 59 |     nvars <- length(model@vars_info)
 60 |     off0 <- 1  # not 0 because the first column is used as intercept.
 61 |     for (i in seq(nvars)) {
 62 |       var_info <- model@vars_info[[i]]
 63 |       ncol_linear <- 0
 64 |       if (var_info$use_linear) ncol_linear <- 1
 65 | 
 66 |       ncol_OD <- 0
 67 |       if(var_info$use_OD) {
 68 |         ncol_OD <- length(var_info$OD_info$breaks)
 69 |         if (var_info$OD_type == "C") ncol_OD <- ncol_OD - 1
 70 |       }
 71 | 
 72 |       ncol_UD <- 0
 73 |       if(var_info$use_UD) ncol_UD <- length(var_info$UD_info$levels) - var_info$UD_info$drop_last
 74 | 
 75 |       ncol_LV <- 0
 76 |       # In case length(LV_info$breaks) <= 2, there are no internal breaks, and so only a linear column is created.
 77 |       if(var_info$use_LV & length(var_info$LV_info$breaks) > 2) ncol_LV <- length(var_info$LV_info$breaks) - 2
 78 | 
 79 |       if (i == index) {
 80 |         c <- list()
 81 | 
 82 |         if(ncol_linear) c$coef.linear <- coefs[off0 + 1,] else c$coef.linear <- NULL
 83 |         off0 <- off0 + ncol_linear
 84 | 
 85 |         if(ncol_OD) c$coef.OD <- coefs[off0 + 1:ncol_OD, ] else c$coef.OD <-  NULL
 86 |         off0 <- off0 + ncol_OD
 87 | 
 88 |         if(ncol_UD) c$coef.UD <- coefs[off0 + 1:ncol_UD,] else c$coef.UD <- NULL
 89 |         off0 <- off0 + ncol_UD
 90 | 
 91 |         if(ncol_LV) c$coef.LV <- coefs[off0 + 1:ncol_LV, ] else c$coef.LV <-  NULL
 92 |         off0 <- off0 + ncol_LV
 93 | 
 94 |         return(c)
 95 |       } else {
 96 |         ncol_all <- ncol_linear + ncol_OD + ncol_UD + ncol_LV
 97 |         off0 <- off0 + ncol_all
 98 |       }
 99 |     }
100 |     assert_that(FALSE)
101 |   }
102 | 
103 |   # If neither `index` nor `name` is set, return coefficients of all variables
104 |   return(coefs)
105 | }
106 | 


--------------------------------------------------------------------------------
/man/cv.aglm.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/cv-aglm.R
  3 | \name{cv.aglm}
  4 | \alias{cv.aglm}
  5 | \title{Fit an AGLM model with cross-validation for \eqn{\lambda}}
  6 | \usage{
  7 | cv.aglm(
  8 |   x,
  9 |   y,
 10 |   qualitative_vars_UD_only = NULL,
 11 |   qualitative_vars_both = NULL,
 12 |   qualitative_vars_OD_only = NULL,
 13 |   quantitative_vars = NULL,
 14 |   use_LVar = FALSE,
 15 |   extrapolation = "default",
 16 |   add_linear_columns = TRUE,
 17 |   add_OD_columns_of_qualitatives = TRUE,
 18 |   add_interaction_columns = FALSE,
 19 |   OD_type_of_quantitatives = "C",
 20 |   nbin.max = NULL,
 21 |   bins_list = NULL,
 22 |   bins_names = NULL,
 23 |   family = c("gaussian", "binomial", "poisson"),
 24 |   keep = FALSE,
 25 |   ...
 26 | )
 27 | }
 28 | \arguments{
 29 | \item{x}{A design matrix.
 30 | See \link{aglm} for more details.}
 31 | 
 32 | \item{y}{A response variable.}
 33 | 
 34 | \item{qualitative_vars_UD_only}{Same as in \link{aglm}.}
 35 | 
 36 | \item{qualitative_vars_both}{Same as in \link{aglm}.}
 37 | 
 38 | \item{qualitative_vars_OD_only}{Same as in \link{aglm}.}
 39 | 
 40 | \item{quantitative_vars}{Same as in \link{aglm}.}
 41 | 
 42 | \item{use_LVar}{Same as in \link{aglm}.}
 43 | 
 44 | \item{extrapolation}{Same as in \link{aglm}.}
 45 | 
 46 | \item{add_linear_columns}{Same as in \link{aglm}.}
 47 | 
 48 | \item{add_OD_columns_of_qualitatives}{Same as in \link{aglm}.}
 49 | 
 50 | \item{add_interaction_columns}{Same as in \link{aglm}.}
 51 | 
 52 | \item{OD_type_of_quantitatives}{Same as in \link{aglm}.}
 53 | 
 54 | \item{nbin.max}{Same as in \link{aglm}.}
 55 | 
 56 | \item{bins_list}{Same as in \link{aglm}.}
 57 | 
 58 | \item{bins_names}{Same as in \link{aglm}.}
 59 | 
 60 | \item{family}{Same as in \link{aglm}.}
 61 | 
 62 | \item{keep}{Set to \code{TRUE} if you need the \code{fit.preval} field in the returned value, as in \code{cv.glmnet()}.}
 63 | 
 64 | \item{...}{Other arguments are passed directly when calling \code{cv.glmnet()}.}
 65 | }
 66 | \value{
 67 | A model object fitted to the data with cross-validation results.
 68 | Functions such as \code{predict} and \code{plot} can be applied to the returned object, same as the result of \code{aglm()}.
 69 | See \link{AccurateGLM-class} for more details.
 70 | }
 71 | \description{
 72 | A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}.
 73 | See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
 74 | }
 75 | \examples{
 76 | 
 77 | #################### Cross-validation for lambda ####################
 78 | 
 79 | library(aglm)
 80 | library(faraway)
 81 | 
 82 | ## Read data
 83 | xy <- nes96
 84 | 
 85 | ## Split data into train and test
 86 | n <- nrow(xy) # Sample size.
 87 | set.seed(2018) # For reproducibility.
 88 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
 89 | test <- xy[test.id,] # test is the data.frame for testing.
 90 | train <- xy[-test.id,] # train is the data.frame for training.
 91 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
 92 | y <- train$vote
 93 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
 94 | 
 95 | # NOTE: Codes bellow will take considerable time, so run it when you have time.
 96 | \donttest{
 97 | 
 98 | ## Fit the model
 99 | model <- cv.aglm(x, y, family="binomial")
100 | 
101 | ## Make the confusion matrix
102 | lambda <- model@lambda.min
103 | y_true <- test$vote
104 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
105 | 
106 | cat(sprintf("Confusion matrix for lambda=\%.5f:\n", lambda))
107 | print(table(y_true, y_pred))
108 | 
109 | }
110 | }
111 | \references{
112 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
113 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
114 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
115 | \emph{Actuarial Colloquium Paris 2020}
116 | }
117 | \author{
118 | \itemize{
119 | \item Kenji Kondo,
120 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
121 | }
122 | }
123 | 


--------------------------------------------------------------------------------
/R/aglm-package.R:
--------------------------------------------------------------------------------
 1 | #' aglm: Accurate Generalized Linear Model
 2 | #'
 3 | #' Provides functions to fit Accurate Generalized Linear Model (AGLM) models,
 4 | #' visualize them, and predict for new data. AGLM is defined as a regularized GLM
 5 | #' which applies a sort of feature transformations using a discretization of numerical
 6 | #' features and specific coding methodologies of dummy variables.
 7 | #' For more information on AGLM, see
 8 | #' \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020)}.
 9 | #'
10 | #' The collection of functions provided by the `aglm` package has almost the same structure as the famous `glmnet` package,
11 | #' so users familiar with the `glmnet` package will be able to handle it easily.
12 | #' In fact, this structure is reasonable in implementation, because what the `aglm` package does is
13 | #' applying appropriate transformations to the given data and passing it to the `glmnet` package as a backend.
14 | #'
15 | #' @section Fitting functions:
16 | #' The `aglm` package provides three different fitting functions, depending on how users want to handle hyper-parameters of AGLM models.
17 | #'
18 | #' Because AGLM is based on regularized GLM, the regularization term of the loss function can be expressed as follows:
19 | #' \loadmathjax
20 | #' \mjsdeqn{
21 | #'   R(\lbrace \beta_{jk} \rbrace; \lambda, \alpha)
22 | #'    = \lambda \left\lbrace
23 | #'      (1 - \alpha)\sum_{j=1}^{p} \sum_{k=1}^{m_j}|\beta_{jk}|^2 + \alpha \sum_{j=1}^{p} \sum_{k=1}^{m_j} |\beta_{jk}|
24 | #'    \right\rbrace,
25 | #' }
26 | #' where \eqn{\beta_jk} is the k-th coefficient of auxiliary variables for the j-th column in data,
27 | #' \eqn{\alpha} is a weight which controls how L1 and L2 regularization terms are mixed,
28 | #' and \eqn{\lambda} determines the strength of the regularization.
29 | #'
30 | #' Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming.
31 | #' That's why the `aglm` package provides three fitting functions with different strategies for specifying hyper-parameters as follows:
32 | #'   * \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s).
33 | #'   * \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}.
34 | #'   * \link{cva.aglm}: A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}.
35 | #'
36 | #' Generally speaking, setting an appropriate \eqn{\lambda} is often important to get meaningful results,
37 | #' and using `cv.aglm()` with default \eqn{\alpha=1} (LASSO) is usually enough.
38 | #' Since `cva.aglm()` is much time-consuming than `cv.aglm()`, it is better to use it only if particularly better results are needed.
39 | #'
40 | #' The following S4 classes are defined to store results of the fitting functions.
41 | #'   * \link{AccurateGLM-class}: A class for results of `aglm()` and `cv.aglm()`
42 | #'   * \link{CVA_AccurateGLM-class}: A class for results of `cva.aglm()`
43 | #'
44 | #' @section Using the fitted model:
45 | #' Users can use models obtained from fitting functions in various ways, by passing them to following functions:
46 | #'   * \link[=predict.AccurateGLM]{predict}: Make predictions for new data
47 | #'   * \link[=plot.AccurateGLM]{plot}: Plot contribution of each variable and residuals
48 | #'   * \link[=print.AccurateGLM]{print}: Display textual information of the model
49 | #'   * \link[=coef.AccurateGLM]{coef}: Get coefficients
50 | #'   * \link[=deviance.AccurateGLM]{deviance}: Get deviance
51 | #'   * \link[=residuals.AccurateGLM]{residuals}: Get residuals of various types
52 | #'
53 | #' We emphasize that `plot()` is particularly useful to understand the fitted model,
54 | #' because it presents a visual representation of how variables in the original data are used by the model.
55 | #'
56 | #' @section Other functions:
57 | #' The following functions are basically for internal use, but exported as utility functions for convenience.
58 | #'
59 | #' * Functions for creating feature vectors
60 | #'   * \link{getUDummyMatForOneVec}
61 | #'   * \link{getODummyMatForOneVec}
62 | #'   * \link{getLVarMatForOneVec}
63 | #' * Functions for binning
64 | #'   * \link{createEqualWidthBins}
65 | #'   * \link{createEqualFreqBins}
66 | #'   * \link{executeBinning}
67 | #'
68 | #' @import mathjaxr
69 | #'
70 | #' @author
71 | #'   * Kenji Kondo,
72 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
73 | #'
74 | #'
75 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
76 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
77 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
78 | #' \emph{Actuarial Colloquium Paris 2020}
79 | #'
80 | #' @name aglm-package
81 | "_PACKAGE"
82 | 


--------------------------------------------------------------------------------
/man/plot.AccurateGLM.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/plot-aglm.R
  3 | \name{plot.AccurateGLM}
  4 | \alias{plot.AccurateGLM}
  5 | \title{Plot contribution of each variable and residuals}
  6 | \usage{
  7 | \method{plot}{AccurateGLM}(
  8 |   x,
  9 |   vars = NULL,
 10 |   verbose = TRUE,
 11 |   s = NULL,
 12 |   resid = FALSE,
 13 |   smooth_resid = TRUE,
 14 |   smooth_resid_fun = NULL,
 15 |   ask = TRUE,
 16 |   layout = c(2, 2),
 17 |   only_plot = FALSE,
 18 |   main = "",
 19 |   add_rug = FALSE,
 20 |   ...
 21 | )
 22 | }
 23 | \arguments{
 24 | \item{x}{A model object obtained from \code{aglm()} or \code{cv.aglm()}.}
 25 | 
 26 | \item{vars}{Used to specify variables to be plotted (\code{NULL} means all the variables).
 27 | This parameter may have one of the following classes:
 28 | \itemize{
 29 | \item \code{integer}: specifying variables by index.
 30 | \item \code{character}: specifying variables by name.
 31 | }}
 32 | 
 33 | \item{verbose}{Set to \code{FALSE} if textual outputs are not needed.}
 34 | 
 35 | \item{s}{A numeric value specifying \eqn{\lambda} at which plotting is required.
 36 | Note that plotting for multiple \eqn{\lambda}'s are not allowed and \code{s} always should be a single value.
 37 | When the model is trained with only a single \eqn{\lambda} value, just set it to \code{NULL} to plot for that value.}
 38 | 
 39 | \item{resid}{Used to display residuals in plots.
 40 | This parameter may have one of the following classes:
 41 | \itemize{
 42 | \item \code{logical}(single value): If \code{TRUE}, working residuals are plotted.
 43 | \item \code{character}(single value): type of residual to be plotted. See \link{residuals.AccurateGLM} for more details on types of residuals.
 44 | \item \code{numerical}(vector): residual values to be plotted.
 45 | }}
 46 | 
 47 | \item{smooth_resid}{Used to display smoothing lines of residuals for quantitative variables.
 48 | This parameter may have one of the following classes:
 49 | \itemize{
 50 | \item \code{logical}: If \code{TRUE}, smoothing lines are drawn.
 51 | \item \code{character}:
 52 | \itemize{
 53 | \item \code{smooth_resid="both"}: Balls and smoothing lines are drawn.
 54 | \item \code{smooth_resid="smooth_only"}: Only smoothing lines are drawn.
 55 | }
 56 | }}
 57 | 
 58 | \item{smooth_resid_fun}{Set if users need custom smoothing functions.}
 59 | 
 60 | \item{ask}{By default, \code{plot()} stops and waits inputs each time plotting for each variable is completed.
 61 | Users can set \code{ask=FALSE} to avoid this.
 62 | It is useful, for example, when using devices as \code{bmp} to create image files.}
 63 | 
 64 | \item{layout}{Plotting multiple variables for each page is allowed.
 65 | To achieve this, set it to a pair of integer, which indicating number of rows and columns, respectively.}
 66 | 
 67 | \item{only_plot}{Set to \code{TRUE} if no automatic graphical configurations are needed.}
 68 | 
 69 | \item{main}{Used to specify the title of plotting.}
 70 | 
 71 | \item{add_rug}{Set to \code{TRUE} for rug plots.}
 72 | 
 73 | \item{...}{Other arguments are currently not used and just discarded.}
 74 | }
 75 | \value{
 76 | No return value, called for side effects.
 77 | }
 78 | \description{
 79 | Plot contribution of each variable and residuals
 80 | }
 81 | \examples{
 82 | 
 83 | #################### using plot() and predict() ####################
 84 | 
 85 | library(MASS) # For Boston
 86 | library(aglm)
 87 | 
 88 | ## Read data
 89 | xy <- Boston # xy is a data.frame to be processed.
 90 | colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y.
 91 | 
 92 | ## Split data into train and test
 93 | n <- nrow(xy) # Sample size.
 94 | set.seed(2018) # For reproducibility.
 95 | test.id <- sample(n, round(n/4)) # ID numbders for test data.
 96 | test <- xy[test.id,] # test is the data.frame for testing.
 97 | train <- xy[-test.id,] # train is the data.frame for training.
 98 | x <- train[-ncol(xy)]
 99 | y <- train$y
100 | newx <- test[-ncol(xy)]
101 | y_true <- test$y
102 | 
103 | ## With the result of aglm()
104 | model <- aglm(x, y)
105 | lambda <- 0.1
106 | 
107 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
108 |      verbose=FALSE, layout=c(3, 3))
109 | 
110 | y_pred <- predict(model, newx=newx, s=lambda)
111 | plot(y_true, y_pred)
112 | 
113 | ## With the result of cv.aglm()
114 | model <- cv.aglm(x, y)
115 | lambda <- model@lambda.min
116 | 
117 | plot(model, s=lambda, resid=TRUE, add_rug=TRUE,
118 |      verbose=FALSE, layout=c(3, 3))
119 | 
120 | y_pred <- predict(model, newx=newx, s=lambda)
121 | plot(y_true, y_pred)
122 | 
123 | 
124 | }
125 | \references{
126 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
127 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
128 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
129 | \emph{Actuarial Colloquium Paris 2020}
130 | }
131 | \author{
132 | \itemize{
133 | \item Kenji Kondo,
134 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
135 | }
136 | }
137 | 


--------------------------------------------------------------------------------
/R/residuals-aglm.R:
--------------------------------------------------------------------------------
  1 | #' Get residuals of various types
  2 | #'
  3 | #' @param object
  4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
  5 | #'
  6 | #' @param x
  7 | #'   A design matrix.
  8 | #'   If not given, `x` for fitting is used.
  9 | #'
 10 | #' @param y
 11 | #'   A response variable.
 12 | #'   If not given, `y` for fitting is used.
 13 | #'
 14 | #' @param offset
 15 | #'   An offset values.
 16 | #'   If not given, `offset` for fitting is used.
 17 | #'
 18 | #' @param weights
 19 | #'   Sample weights.
 20 | #'   If not given, `weights` for fitting is used.
 21 | #'
 22 | #' @param type
 23 | #'   \loadmathjax
 24 | #'   A string representing type of deviance:
 25 | #'   * `"working"` get working residual
 26 | #'      \mjsdeqn{r^W_i = (y_i - \mu_i) \left(\frac{\partial \eta}{\partial \mu}\right)_{\mu=\mu_i},}
 27 | #'      where \eqn{y_i} is a response value, \eqn{\mu} is GLM mean, and \eqn{\eta=g^{-1}(\mu)} with the link function \eqn{g}.
 28 | #'   * `"pearson"` get Pearson residuals
 29 | #'      \mjsdeqn{r^P_i = \frac{y_i - \mu_i}{\sqrt{V(\mu_i)}},}
 30 | #'      where \eqn{V} is the variance function.
 31 | #'   * `"deviance"` get deviance residuals
 32 | #'      \mjsdeqn{r^D_i = {\rm sign}(y_i - \mu_i) \sqrt{d_i},}
 33 | #'      where \eqn{d_i} is the contribution to deviance.
 34 | #'
 35 | #' @param s
 36 | #'   A numeric value specifying \eqn{\lambda} at which residuals are calculated.
 37 | #'
 38 | #' @param ...
 39 | #'   Other arguments are currently not used and just discarded.
 40 | #'
 41 | #' @return
 42 | #'   A numeric vector representing calculated residuals.
 43 | #'
 44 | #'
 45 | #' @author
 46 | #' Kenji Kondo
 47 | #'
 48 | #'
 49 | #' @export
 50 | #' @importFrom assertthat assert_that
 51 | #' @importFrom stats predict
 52 | #' @importFrom stats getCall
 53 | residuals.AccurateGLM <- function(object,
 54 |                                   x=NULL,
 55 |                                   y=NULL,
 56 |                                   offset=NULL,
 57 |                                   weights=NULL,
 58 |                                   type=c("working", "pearson", "deviance"),
 59 |                                   s=NULL,
 60 |                                   ...) {
 61 |   # It's necessary to use same names for some arguments as the original methods,
 62 |   # because devtools::check() issues warnings when using inconsistent names.
 63 |   # As a result, we sometimes should accept uncomfortable argument names,
 64 |   # but still have rights to use preferable names internally.
 65 |   model <- object
 66 | 
 67 |   # Check and set `type`
 68 |   type <- match.arg(type)
 69 | 
 70 |   # Get x and y from model@call
 71 |   call.orig <- getCall(model)
 72 |   if (is.null(x)) {
 73 |     x <- eval.parent(call.orig$x)
 74 |     if (class(x)[1] != "data.frame") x <- data.frame(x)
 75 |   }
 76 |   if (is.null(y)) {
 77 |     y <- as.numeric(drop(eval.parent(call.orig$y)))
 78 |   }
 79 |   if (!is.null(call.orig$offset) & is.null(offset)) {
 80 |     offset <- as.numeric(drop(eval.parent(call.orig$offset)))
 81 |   }
 82 |   if (is.null(weights)) {
 83 |     weights <- as.numeric(drop(eval.parent(call.orig$weights)))
 84 |     if (is.null(weights) || length(weights) == 0) weights <- rep(1, length(y))
 85 |   }
 86 |   if (class(x)[1] != "data.frame") x <- data.frame(x)
 87 |   assert_that(dim(x)[1] == length(y))
 88 |   assert_that(length(y) == length(weights))
 89 | 
 90 |   # Calculate residuals
 91 |   cl <- class(model@backend_models[[1]])
 92 | 
 93 |   if (type == "working") {
 94 |     yhat <- as.numeric(drop(predict(model, newx=x, newoffset=offset, s=s, type="response")))
 95 |     resids <- sqrt(weights) * (y - yhat)
 96 |     if ("fishnet" %in% cl)
 97 |       resids <- resids / yhat  # Poisson case
 98 |     else if ("lognet" %in% cl)
 99 |       resids <- resids / (yhat * (1 - yhat))  # binomial case
100 |   } else if (type == "pearson") {
101 |     yhat <- as.numeric(drop(predict(model, newx=x, newoffset=offset, s=s, type="response")))
102 |     resids <- sqrt(weights) * (y - yhat)
103 |     if ("fishnet" %in% cl)
104 |       resids <- resids / sqrt(yhat) # Poisson case
105 |     else if ("lognet" %in% cl)
106 |       resids <- resids / sqrt(yhat * (1 - yhat)) # binomial case
107 |   } else if (type == "deviance") {
108 |     if ("fishnet" %in% cl){  # Poisson case
109 |       yhat <- as.numeric(drop(predict(model, newx=x, newoffset=offset, s=s, type="response")))
110 |       z <- 2 * (log((y / yhat)^y) - y + yhat)
111 |       resids <- sqrt(weights) * sign(y - yhat) * sqrt(abs(z))
112 |     } else if ("lognet" %in% cl) {  # binomial case
113 |       eta <- as.numeric(drop(predict(model, newx=x, newoffset=offset, s=s, type="link")))
114 |       z <- 2 * (log(1 + exp(eta) - y * eta))
115 |       resids <- sqrt(weights) * sign(z) * sqrt(abs(z))
116 |     } else {  # Gaussian case
117 |       yhat <- as.numeric(drop(predict(model, newx=x, newoffset=offset, s=s, type="response")))
118 |       resids <- sqrt(weights) * (y - yhat)
119 |     }
120 |   } else {
121 |     assert_that(FALSE)  # never comes here
122 |   }
123 | 
124 |   return(resids)
125 | }
126 | 


--------------------------------------------------------------------------------
/man/aglm-package.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/aglm-package.R
  3 | \docType{package}
  4 | \name{aglm-package}
  5 | \alias{aglm-package}
  6 | \title{aglm: Accurate Generalized Linear Model}
  7 | \description{
  8 | Provides functions to fit Accurate Generalized Linear Model (AGLM) models,
  9 | visualize them, and predict for new data. AGLM is defined as a regularized GLM
 10 | which applies a sort of feature transformations using a discretization of numerical
 11 | features and specific coding methodologies of dummy variables.
 12 | For more information on AGLM, see
 13 | \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa (2020)}.
 14 | }
 15 | \details{
 16 | The collection of functions provided by the \code{aglm} package has almost the same structure as the famous \code{glmnet} package,
 17 | so users familiar with the \code{glmnet} package will be able to handle it easily.
 18 | In fact, this structure is reasonable in implementation, because what the \code{aglm} package does is
 19 | applying appropriate transformations to the given data and passing it to the \code{glmnet} package as a backend.
 20 | }
 21 | \section{Fitting functions}{
 22 | 
 23 | The \code{aglm} package provides three different fitting functions, depending on how users want to handle hyper-parameters of AGLM models.
 24 | 
 25 | Because AGLM is based on regularized GLM, the regularization term of the loss function can be expressed as follows:
 26 | \loadmathjax
 27 | \mjsdeqn{
 28 |   R(\lbrace \beta_{jk} \rbrace; \lambda, \alpha)
 29 |    = \lambda \left\lbrace
 30 |      (1 - \alpha)\sum_{j=1}^{p} \sum_{k=1}^{m_j}|\beta_{jk}|^2 + \alpha \sum_{j=1}^{p} \sum_{k=1}^{m_j} |\beta_{jk}|
 31 |    \right\rbrace,
 32 | }
 33 | where \eqn{\beta_jk} is the k-th coefficient of auxiliary variables for the j-th column in data,
 34 | \eqn{\alpha} is a weight which controls how L1 and L2 regularization terms are mixed,
 35 | and \eqn{\lambda} determines the strength of the regularization.
 36 | 
 37 | Searching hyper-parameters \eqn{\alpha} and \eqn{\lambda} is often useful to get better results, but usually time-consuming.
 38 | That's why the \code{aglm} package provides three fitting functions with different strategies for specifying hyper-parameters as follows:
 39 | \itemize{
 40 | \item \link{aglm}: A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s).
 41 | \item \link{cv.aglm}: A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}.
 42 | \item \link{cva.aglm}: A fitting function with cross-validation for both \eqn{\alpha} and \eqn{\lambda}.
 43 | }
 44 | 
 45 | Generally speaking, setting an appropriate \eqn{\lambda} is often important to get meaningful results,
 46 | and using \code{cv.aglm()} with default \eqn{\alpha=1} (LASSO) is usually enough.
 47 | Since \code{cva.aglm()} is much time-consuming than \code{cv.aglm()}, it is better to use it only if particularly better results are needed.
 48 | 
 49 | The following S4 classes are defined to store results of the fitting functions.
 50 | \itemize{
 51 | \item \link{AccurateGLM-class}: A class for results of \code{aglm()} and \code{cv.aglm()}
 52 | \item \link{CVA_AccurateGLM-class}: A class for results of \code{cva.aglm()}
 53 | }
 54 | }
 55 | 
 56 | \section{Using the fitted model}{
 57 | 
 58 | Users can use models obtained from fitting functions in various ways, by passing them to following functions:
 59 | \itemize{
 60 | \item \link[=predict.AccurateGLM]{predict}: Make predictions for new data
 61 | \item \link[=plot.AccurateGLM]{plot}: Plot contribution of each variable and residuals
 62 | \item \link[=print.AccurateGLM]{print}: Display textual information of the model
 63 | \item \link[=coef.AccurateGLM]{coef}: Get coefficients
 64 | \item \link[=deviance.AccurateGLM]{deviance}: Get deviance
 65 | \item \link[=residuals.AccurateGLM]{residuals}: Get residuals of various types
 66 | }
 67 | 
 68 | We emphasize that \code{plot()} is particularly useful to understand the fitted model,
 69 | because it presents a visual representation of how variables in the original data are used by the model.
 70 | }
 71 | 
 72 | \section{Other functions}{
 73 | 
 74 | The following functions are basically for internal use, but exported as utility functions for convenience.
 75 | \itemize{
 76 | \item Functions for creating feature vectors
 77 | \itemize{
 78 | \item \link{getUDummyMatForOneVec}
 79 | \item \link{getODummyMatForOneVec}
 80 | \item \link{getLVarMatForOneVec}
 81 | }
 82 | \item Functions for binning
 83 | \itemize{
 84 | \item \link{createEqualWidthBins}
 85 | \item \link{createEqualFreqBins}
 86 | \item \link{executeBinning}
 87 | }
 88 | }
 89 | }
 90 | 
 91 | \references{
 92 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
 93 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
 94 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
 95 | \emph{Actuarial Colloquium Paris 2020}
 96 | }
 97 | \seealso{
 98 | Useful links:
 99 | \itemize{
100 |   \item \url{https://github.com/kkondo1981/aglm}
101 |   \item Report bugs at \url{https://github.com/kkondo1981/aglm/issues}
102 | }
103 | 
104 | }
105 | \author{
106 | \itemize{
107 | \item Kenji Kondo,
108 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
109 | }
110 | }
111 | 


--------------------------------------------------------------------------------
/R/get-dummies.R:
--------------------------------------------------------------------------------
  1 | #' Create a U-dummy matrix for one variable
  2 | #'
  3 | #' @param x_vec A vector representing original variable.
  4 | #'   The class of `x_vec` should be one of `integer`, `character`, or `factor`.
  5 | #' @param levels A character vector representing values of `x_vec` used to create U-dummies.
  6 | #'   If `NULL`, all the unique values of `x_vec` are used to create dummies.
  7 | #' @param drop_last If `TRUE`, the last column of the resulting matrix is dropped to avoid multicollinearity.
  8 | #' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned.
  9 | #'
 10 | #' @return A list with the following fields:
 11 | #' * `levels`: Same as input.
 12 | #' * `drop_last`: Same as input.
 13 | #' * `dummy_mat`: The created U-dummy matrix (only if `only_info=FALSE`).
 14 | #'
 15 | #' @author Kenji Kondo
 16 | #'
 17 | #' @export
 18 | #' @importFrom assertthat assert_that
 19 | getUDummyMatForOneVec <- function(x_vec, levels=NULL, drop_last=TRUE, only_info=FALSE) {
 20 |   # Check arguments. numerical vectors are not allowed.
 21 |   assert_that(is.integer(x_vec) | is.character(x_vec) | is.factor(x_vec))
 22 | 
 23 |   # Create factor. Note that if x_vec is itself factor and levels is not specified, do not need nothing.
 24 |   if (!is.factor(x_vec) | !is.null(levels)) {
 25 |     x_vec <- as.character(x_vec)
 26 |     if (is.null(levels)) x_vec <- factor(x_vec)
 27 |     else x_vec <- factor(x_vec, levels=levels)
 28 |   }
 29 | 
 30 |   # create dummy matrix for x_vec
 31 |   int_labels <- as.integer(x_vec)
 32 |   nrow <- length(x_vec)
 33 |   ncol <- length(levels(x_vec))
 34 |   if (drop_last) ncol <- ncol - 1
 35 |   dummy_mat <- 1 * (int_labels == t(matrix(1:ncol, ncol, nrow)))
 36 | 
 37 |   if (only_info) return(list(levels=levels(x_vec), drop_last=drop_last))
 38 |   else return(list(levels=levels(x_vec), drop_last=drop_last, dummy_mat=dummy_mat))
 39 | }
 40 | 
 41 | 
 42 | #' Create a O-dummy matrix for one variable
 43 | #'
 44 | #' @param x_vec A numeric vector representing original variable.
 45 | #' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated).
 46 | #' @param nbin.max The maximum number of bins (used only if `breaks=NULL`).
 47 | #' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned.
 48 | #' @param dummy_type Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).
 49 | #'
 50 | #' @return A list with the following fields:
 51 | #' * `breaks`: Same as input
 52 | #' * `dummy_mat`: The created O-dummy matrix (only if `only_info=FALSE`).
 53 | #'
 54 | #' @author Kenji Kondo
 55 | #'
 56 | #' @export
 57 | #' @importFrom assertthat assert_that
 58 | getODummyMatForOneVec <- function(x_vec, breaks=NULL, nbin.max=100, only_info=FALSE, dummy_type=NULL) {
 59 |   # Check arguments. only integer or numerical or ordered vectors are allowed.
 60 |   assert_that(is.integer(x_vec) | is.numeric(x_vec) | is.ordered(x_vec))
 61 |   if (is.null(dummy_type)) {
 62 |     if (is.ordered(x_vec)) dummy_type <- "J"
 63 |     else dummy_type <- "C"
 64 |   }
 65 | 
 66 |   # Execute binning
 67 |   binned_x <- executeBinning(x_vec, breaks=breaks, nbin.max=nbin.max)
 68 | 
 69 |   # create dummy matrix for x_vec
 70 |   if (dummy_type == "C") {
 71 |     nrow <- length(x_vec)
 72 |     ncol <- length(binned_x$breaks) - 1
 73 |     X <- matrix(x_vec, nrow, ncol)
 74 |     B0 <- t(matrix(binned_x$breaks[1:ncol], ncol, nrow))
 75 |     B1 <- t(matrix(binned_x$breaks[-1], ncol, nrow))
 76 |     dummy_mat <- (X - B0) / (B1 - B0)
 77 |     dummy_mat[dummy_mat <= 0] <- 0
 78 |     dummy_mat[dummy_mat >= 1] <- 1
 79 |   } else if (dummy_type == "J") {
 80 |     nrow <- length(x_vec)
 81 |     ncol <- length(binned_x$breaks)
 82 |     dummy_mat <- 1 * (binned_x$labels > t(matrix(1:ncol, ncol, nrow)))
 83 |   } else {
 84 |     assert_that(FALSE, msg="dummy_type must be \"C\" or \"J\".")
 85 |   }
 86 | 
 87 |   if (only_info) return(list(breaks=binned_x$breaks))
 88 |   else return(list(breaks=binned_x$breaks, dummy_mat=dummy_mat))
 89 | }
 90 | 
 91 | 
 92 | #' Create L-variable matrix for one variable
 93 | #'
 94 | #' @param x_vec A numeric vector representing original variable.
 95 | #' @param breaks A numeric vector representing breaks of bins (If `NULL`, automatically generated).
 96 | #' @param nbin.max The maximum number of bins (used only if `breaks=NULL`).
 97 | #' @param only_info If `TRUE`, only information fields of returned values are filled and no dummy matrix is returned.
 98 | #'
 99 | #' @return A list with the following fields:
100 | #' * `breaks`: Same as input
101 | #' * `dummy_mat`: The created L-variable matrix (only if `only_info=FALSE`).
102 | #'
103 | #' @author Kenji Kondo
104 | #'
105 | #' @export
106 | #' @importFrom assertthat assert_that
107 | getLVarMatForOneVec <- function(x_vec, breaks=NULL, nbin.max=100, only_info=FALSE) {
108 |   # Check arguments. only integer or numerical or ordered vectors are allowed.
109 |   assert_that(is.integer(x_vec) | is.numeric(x_vec) | is.ordered(x_vec))
110 | 
111 |   # Execute binning
112 |   binned_x <- executeBinning(x_vec, breaks=breaks, nbin.max=nbin.max)
113 | 
114 |   # create dummy matrix for x_vec
115 |   nrow <- length(x_vec)
116 |   ncol <- length(binned_x$breaks) - 2
117 |   if (ncol < 1) {
118 |     dummy_mat <- NULL
119 |   } else {
120 |     X <- matrix(x_vec, nrow, ncol)
121 |     B0 <- t(matrix(binned_x$breaks[2:(ncol + 1)], ncol, nrow))
122 |     dummy_mat <- abs(X - B0)
123 |   }
124 | 
125 |   if (only_info) return(list(breaks=binned_x$breaks))
126 |   else return(list(breaks=binned_x$breaks, dummy_mat=dummy_mat))
127 | }
128 | 


--------------------------------------------------------------------------------
/tests/testthat/test_aglm.R:
--------------------------------------------------------------------------------
  1 | context("test_aglm")
  2 | library(aglm)
  3 | 
  4 | 
  5 | test_that("Check the types and forms of return value of aglm() and predict.aglm().", {
  6 |   set.seed(12345)
  7 | 
  8 |   # Generates random data
  9 |   nobs <- 500
 10 |   n_quan_vars <- 20
 11 |   quan_var <- matrix(runif(nobs * n_quan_vars), nobs, n_quan_vars)
 12 |   qual_var <- factor(paste0("level_", sample(1:5, nobs, replace=TRUE)))
 13 |   x <- data.frame(quan_var, qual_var)
 14 |   colnames(x) <- c(lapply(1:n_quan_vars, function(i){sprintf("quan_var%d", i)}), "qual_var")
 15 |   y <- sign(quan_var[,1]) * (abs(quan_var[,2]) ** 4) * (qual_var != "level_1")
 16 |   y <- y + 0.1 * rnorm(length(y))
 17 | 
 18 |   # Generates new variables
 19 |   n_new_obs <- 100
 20 |   new_quan_var <- matrix(runif(n_new_obs * n_quan_vars), n_new_obs, n_quan_vars)
 21 |   new_qual_var <- factor(paste0("level_", sample(1:5, n_new_obs, replace=TRUE)))
 22 |   newx <- data.frame(new_quan_var, new_qual_var)
 23 |   colnames(newx) <- c(lapply(1:n_quan_vars, function(i){sprintf("quan_var%d", i)}), "qual_var")
 24 |   y_true <- sign(new_quan_var[,1]) * (abs(new_quan_var[,2]) ** 4) * (new_qual_var != "level_1")
 25 | 
 26 | 
 27 |   # cv.aglm
 28 |   cv_results <- cv.aglm(x, y, family="gaussian")
 29 |   lambda.min <- cv_results@lambda.min
 30 |   res <- aglm(x, y, family="gaussian", lambda=lambda.min)
 31 |   y_pred <- predict(res, newx)
 32 |   RMSE1 <- sqrt(mean((y_pred - y_true)^2))
 33 | 
 34 |   expect_true("AccurateGLM" %in% class(res))
 35 |   expect_true("glmnet" %in% class(res@backend_models[[1]]))
 36 |   expect_true("matrix" %in% class(y_pred))
 37 |   expect_equal(length(y_pred), n_new_obs)
 38 | 
 39 |   # cva.aglm
 40 |   cva_results <- cva.aglm(x, y, family="gaussian")
 41 |   alpha.min <- cva_results@alpha.min
 42 |   lambda.min <- cva_results@lambda.min
 43 |   res <- aglm(x, y, family="gaussian", alpha=alpha.min, lambda=lambda.min)
 44 |   y_pred <- predict(res, newx)
 45 |   RMSE2 <- sqrt(mean((y_pred - y_true)^2))
 46 | 
 47 |   # The result of cva.aglm() must be better than that of cv.aglm() for trainning data.
 48 |   expect_true(RMSE2 <= RMSE1)
 49 | })
 50 | 
 51 | test_that("Check for predict.AGLM_CV().", {
 52 |   set.seed(12345)
 53 | 
 54 |   # size of observations
 55 |   nobs <- 1000
 56 | 
 57 |   # Randomly generates a numeric vector (as quantitative data)
 58 |   quan_var <- runif(nobs)
 59 | 
 60 |   # Randomly generates a character vector (as qualitative data with 5 levels)
 61 |   qual_var <- factor(paste0("level_", sample(1:5, nobs, replace=TRUE)))
 62 | 
 63 |   # Create the whole input data frame
 64 |   x <- data.frame(quan_var, qual_var)
 65 |   colnames(x) <- c("quan_var", "qual_var")
 66 | 
 67 |   # Generates non-linear reponse
 68 |   y <- sign(quan_var) * (abs(quan_var) ** 4) * (qual_var != "level_1")
 69 |   y <- y + 0.1 * rnorm(length(y))
 70 | 
 71 |   res <- cv.aglm(x, y, family="gaussian", keep=TRUE)
 72 | 
 73 |   expect_true("AccurateGLM" %in% class(res))
 74 |   expect_true("glmnet" %in% class(res@backend_models[[1]]))
 75 |   expect_true("matrix" %in% class(res@fit.preval))
 76 |   expect_true(class(res@foldid) == "integer")
 77 | 
 78 |   # Generates new predictive variables
 79 |   n_new_obs <- 100
 80 |   new_quan_var <- runif(n_new_obs)
 81 |   new_qual_var <- factor(paste0("level_", sample(1:5, n_new_obs, replace=TRUE)))
 82 |   newx <- data.frame(new_quan_var, new_qual_var)
 83 |   colnames(newx) <- c("quan_var", "qual_var")
 84 | 
 85 |   # Predict values of y for newx
 86 |   #y_true <- sign(new_quan_var) * (abs(new_quan_var) ** 4) * (new_qual_var != "level_1")
 87 |   y_pred <- predict(res, newx, s=res@lambda.min)
 88 |   #plot(new_quan_var, y_pred)
 89 |   #points(new_quan_var, y_true, col="red")
 90 |   expect_true("matrix" %in% class(y_pred))
 91 |   expect_equal(length(y_pred), n_new_obs)
 92 | })
 93 | 
 94 | 
 95 | test_that("Check for logical features", {
 96 |   set.seed(12345)
 97 | 
 98 |   # size of observations and variables
 99 |   nobs <- 1000
100 |   nvars <- 2
101 | 
102 |   # a random generated logical variables
103 |   x <- matrix(sample(c(TRUE, FALSE), nobs * nvars, replace=TRUE), nobs, nvars)
104 | 
105 |   # Generates non-linear reponse
106 |   y <- xor(x[, 1], x[, 2])
107 | 
108 |   res <- cv.aglm(x, y, family="gaussian", keep=TRUE)
109 | 
110 |   expect_true("AccurateGLM" %in% class(res))
111 |   expect_true("glmnet" %in% class(res@backend_models[[1]]))
112 |   expect_true("matrix" %in% class(res@fit.preval))
113 |   expect_true(class(res@foldid) == "integer")
114 | 
115 |   # Generates new predictive variables
116 |   n_new_obs <- 100
117 | 
118 |   newx <- matrix(sample(c(TRUE, FALSE), n_new_obs * nvars, replace=TRUE), n_new_obs, nvars)
119 | 
120 |   # Predict values of y for newx
121 |   y_pred <- predict(res, newx, s=res@lambda.min)
122 |   #y_true <- xor(newx[, 1], newx[, 2])
123 |   #plot(y_true, y_pred)
124 |   expect_true("matrix" %in% class(y_pred))
125 |   expect_equal(length(y_pred), n_new_obs)
126 | })
127 | 
128 | test_that("Check for binomial family", {
129 |   set.seed(12345)
130 | 
131 |   nobs <- 1000
132 |   x1 <- rnorm(nobs); x2 <- rnorm(nobs); x <- cbind(x1, x2)
133 |   y <- 1 * ((atan(0.25 * x1 - 0.5 * x2) / pi + 0.5) > 0.5)
134 |   model <- aglm(x, y, family="binomial", alpha=1, lambda=0.003)
135 | 
136 |   newx1 <- rnorm(100); newx2 <- rnorm(100); newx <- cbind(newx1, newx2)
137 |   aglm.pred <- predict(model, newx)
138 | 
139 |   expect_equal(length(aglm.pred), 100)
140 | })
141 | 
142 | 
143 | test_that("Check for poisson family", {
144 |   set.seed(12345)
145 | 
146 |   nobs <- 100
147 |   x <- runif(nobs) * 20
148 |   y <- rpois(nobs, x)
149 |   cv_result <- cv.aglm(x, y, family = "poisson", alpha = 1)
150 | 
151 |   newx <- runif(100) * 20
152 |   y_pred <- predict(cv_result, newx, s=cv_result@lambda.min)
153 | 
154 |   expect_equal(length(y_pred), 100)
155 |   expect_equal(names(cv_result@name), "deviance")
156 | })
157 | 


--------------------------------------------------------------------------------
/R/cv-aglm.R:
--------------------------------------------------------------------------------
  1 | #' Fit an AGLM model with cross-validation for \eqn{\lambda}
  2 | #'
  3 | #' A fitting function with given \eqn{\alpha} and cross-validation for \eqn{\lambda}.
  4 | #' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
  5 | #'
  6 | #' @param x
  7 | #'   A design matrix.
  8 | #'   See \link{aglm} for more details.
  9 | #'
 10 | #' @param y
 11 | #'   A response variable.
 12 | #'
 13 | #' @param qualitative_vars_UD_only
 14 | #'   Same as in \link{aglm}.
 15 | #'
 16 | #' @param qualitative_vars_both
 17 | #'   Same as in \link{aglm}.
 18 | #'
 19 | #' @param qualitative_vars_OD_only
 20 | #'   Same as in \link{aglm}.
 21 | #'
 22 | #' @param quantitative_vars
 23 | #'   Same as in \link{aglm}.
 24 | #'
 25 | #' @param use_LVar
 26 | #'   Same as in \link{aglm}.
 27 | #'
 28 | #' @param extrapolation
 29 | #'   Same as in \link{aglm}.
 30 | #'
 31 | #' @param add_linear_columns
 32 | #'   Same as in \link{aglm}.
 33 | #'
 34 | #' @param add_OD_columns_of_qualitatives
 35 | #'   Same as in \link{aglm}.
 36 | #'
 37 | #' @param add_interaction_columns
 38 | #'   Same as in \link{aglm}.
 39 | #'
 40 | #' @param OD_type_of_quantitatives
 41 | #'   Same as in \link{aglm}.
 42 | #'
 43 | #' @param nbin.max
 44 | #'   Same as in \link{aglm}.
 45 | #'
 46 | #' @param bins_list
 47 | #'   Same as in \link{aglm}.
 48 | #'
 49 | #' @param bins_names
 50 | #'   Same as in \link{aglm}.
 51 | #'
 52 | #' @param family
 53 | #'   Same as in \link{aglm}.
 54 | #'
 55 | #' @param keep
 56 | #'   Set to `TRUE` if you need the `fit.preval` field in the returned value, as in `cv.glmnet()`.
 57 | #'
 58 | #' @param ...
 59 | #'   Other arguments are passed directly when calling `cv.glmnet()`.
 60 | #'
 61 | #' @return
 62 | #'   A model object fitted to the data with cross-validation results.
 63 | #'   Functions such as `predict` and `plot` can be applied to the returned object, same as the result of `aglm()`.
 64 | #'   See \link{AccurateGLM-class} for more details.
 65 | #'
 66 | #'
 67 | #' @example examples/cv-aglm-1.R
 68 | #'
 69 | #'
 70 | #' @author
 71 | #'   * Kenji Kondo,
 72 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
 73 | #'
 74 | #'
 75 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
 76 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
 77 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
 78 | #' \emph{Actuarial Colloquium Paris 2020}
 79 | #'
 80 | #'
 81 | #' @export
 82 | #' @importFrom assertthat assert_that
 83 | #' @importFrom glmnet cv.glmnet
 84 | #' @importFrom methods new
 85 | cv.aglm <- function(x, y,
 86 |                     qualitative_vars_UD_only=NULL,
 87 |                     qualitative_vars_both=NULL,
 88 |                     qualitative_vars_OD_only=NULL,
 89 |                     quantitative_vars=NULL,
 90 |                     use_LVar=FALSE,
 91 |                     extrapolation="default",
 92 |                     add_linear_columns=TRUE,
 93 |                     add_OD_columns_of_qualitatives=TRUE,
 94 |                     add_interaction_columns=FALSE,
 95 |                     OD_type_of_quantitatives="C",
 96 |                     nbin.max=NULL,
 97 |                     bins_list=NULL,
 98 |                     bins_names=NULL,
 99 |                     family=c("gaussian","binomial","poisson"),
100 |                     keep=FALSE,
101 |                     ...) {
102 |   # Create an input object
103 |   x <- newInput(x,
104 |                 qualitative_vars_UD_only=qualitative_vars_UD_only,
105 |                 qualitative_vars_both=qualitative_vars_both,
106 |                 qualitative_vars_OD_only=qualitative_vars_OD_only,
107 |                 quantitative_vars=quantitative_vars,
108 |                 use_LVar=use_LVar,
109 |                 extrapolation=extrapolation,
110 |                 add_linear_columns=add_linear_columns,
111 |                 add_OD_columns_of_qualitatives=add_OD_columns_of_qualitatives,
112 |                 add_interaction_columns=add_interaction_columns,
113 |                 OD_type_of_quantitatives=OD_type_of_quantitatives,
114 |                 nbin.max,
115 |                 bins_list,
116 |                 bins_names)
117 | 
118 |   # Check y
119 |   y <- drop(y)
120 |   #assert_that(class(y) == "integer" | class(y) == "numeric")
121 |   y <- as.numeric(y)
122 |   assert_that(length(y) == dim(x@data)[1])
123 | 
124 |   # Check family
125 |   if (is.character(family))
126 |     family <- match.arg(family)
127 | 
128 |   # Create a design matrix which is passed to backend API
129 |   x_for_backend <- getDesignMatrix(x)
130 | 
131 |   # Data size
132 |   nobs <- dim(x_for_backend)[1]
133 |   nvars <- dim(x_for_backend)[2]
134 |   assert_that(length(y) == nobs)
135 | 
136 |   # Call backend
137 |   args <- list(x=x_for_backend,
138 |                y=y,
139 |                family=family,
140 |                keep=keep,
141 |                ...)
142 | 
143 |   cv.glmnet_result <- do.call(cv.glmnet, args)
144 | 
145 |   if (!keep) {
146 |     cv.glmnet_result$fit.preval <- matrix(0)
147 |     cv.glmnet_result$foldid <- integer(0)
148 |   }
149 | 
150 |   return(new("AccurateGLM", backend_models=list(cv.glmnet=cv.glmnet_result$glmnet.fit),
151 |              lambda=cv.glmnet_result$lambda,
152 |              cvm=cv.glmnet_result$cvm,
153 |              cvsd=cv.glmnet_result$cvsd,
154 |              cvup=cv.glmnet_result$cvup,
155 |              cvlo=cv.glmnet_result$cvlo,
156 |              nzero=cv.glmnet_result$nzero,
157 |              name=cv.glmnet_result$name,
158 |              lambda.min=cv.glmnet_result$lambda.min,
159 |              lambda.1se=cv.glmnet_result$lambda.1se,
160 |              fit.preval=cv.glmnet_result$fit.preval,
161 |              foldid=cv.glmnet_result$foldid,
162 |              vars_info=x@vars_info,
163 |              call=match.call()))
164 | }
165 | 


--------------------------------------------------------------------------------
/tests/testthat/test_aglm-input.R:
--------------------------------------------------------------------------------
  1 | context("aglm-input")
  2 | library(aglm)
  3 | 
  4 | 
  5 | createX <- function(nobs, nvar_int, nvar_numeric, nvar_ordered, nvar_factor, seed=12345) {
  6 |   set.seed(seed)
  7 |   nobs <- nobs
  8 |   nvar <- nvar_int + nvar_numeric + nvar_ordered + nvar_factor
  9 | 
 10 |   data <- list()
 11 |   if (nvar_int > 0) for (i in 1:nvar_int) data[[paste0("Int", i)]] <- sample(1:10, size=nobs, replace=TRUE)
 12 |   if (nvar_numeric > 0) for (i in 1:nvar_numeric) data[[paste0("Num", i)]] <- rnorm(nobs)
 13 |   if (nvar_ordered > 0) for (i in 1:nvar_ordered) data[[paste0("Ord", i)]] <- ordered(sample(1:5, size=nobs, replace=TRUE))
 14 |   if (nvar_factor > 0) for (i in 1:nvar_factor) data[[paste0("Fac", i)]] <- factor(sample(c("A", "B", "C"), nobs, replace=TRUE))
 15 | 
 16 |   return(data.frame(data))
 17 | }
 18 | 
 19 | test_that("Check returned values of newInput() for each input type", {
 20 |   x <- newInput(createX(10, 1, 1, 1, 1))
 21 | 
 22 |   expect_equal(x@vars_info[[1]]$id, 1)
 23 |   expect_equal(x@vars_info[[1]]$data_column_idx, 1)
 24 |   expect_equal(x@vars_info[[1]]$type, "quan")
 25 |   expect_equal(x@vars_info[[1]]$use_linear, TRUE)
 26 |   expect_equal(x@vars_info[[1]]$use_UD, FALSE)
 27 |   expect_equal(x@vars_info[[1]]$use_OD, TRUE)
 28 |   expect_true(!is.null(x@vars_info[[1]]$OD_info))
 29 |   expect_true(is.null(x@vars_info[[1]]$UD_info))
 30 | 
 31 |   expect_equal(x@vars_info[[2]]$id, 2)
 32 |   expect_equal(x@vars_info[[2]]$data_column_idx, 2)
 33 |   expect_equal(x@vars_info[[2]]$type, "quan")
 34 |   expect_equal(x@vars_info[[2]]$use_linear, TRUE)
 35 |   expect_equal(x@vars_info[[2]]$use_UD, FALSE)
 36 |   expect_equal(x@vars_info[[2]]$use_OD, TRUE)
 37 |   expect_true(!is.null(x@vars_info[[2]]$OD_info))
 38 |   expect_true(is.null(x@vars_info[[2]]$UD_info))
 39 | 
 40 |   expect_equal(x@vars_info[[3]]$id, 3)
 41 |   expect_equal(x@vars_info[[3]]$data_column_idx, 3)
 42 |   expect_equal(x@vars_info[[3]]$type, "qual")
 43 |   expect_equal(x@vars_info[[3]]$use_linear, FALSE)
 44 |   expect_equal(x@vars_info[[3]]$use_UD, TRUE)
 45 |   expect_equal(x@vars_info[[3]]$use_OD, TRUE)
 46 |   expect_true(!is.null(x@vars_info[[3]]$UD_info))
 47 |   expect_true(!is.null(x@vars_info[[3]]$OD_info))
 48 | 
 49 |   expect_equal(x@vars_info[[4]]$id, 4)
 50 |   expect_equal(x@vars_info[[4]]$data_column_idx, 4)
 51 |   expect_equal(x@vars_info[[4]]$type, "qual")
 52 |   expect_equal(x@vars_info[[4]]$use_linear, FALSE)
 53 |   expect_equal(x@vars_info[[4]]$use_UD, TRUE)
 54 |   expect_equal(x@vars_info[[4]]$use_OD, FALSE)
 55 |   expect_true(!is.null(x@vars_info[[4]]$UD_info))
 56 |   expect_true(is.null(x@vars_info[[4]]$OD_info))
 57 | })
 58 | 
 59 | 
 60 | test_that("Check add_xxx flags of newInput()", {
 61 |   x <- newInput(createX(10, 1, 1, 1, 1), add_interaction_columns=FALSE)
 62 |   expect_equal(length(x@vars_info), 4)
 63 | 
 64 |   x <- newInput(createX(10, 1, 1, 1, 1), add_linear_columns=FALSE, add_interaction_columns=FALSE)
 65 |   expect_true(all(sapply(x@vars_info, function(var) {!var$use_linear})))
 66 | 
 67 |   x <- newInput(createX(10, 1, 1, 1, 1), add_OD_columns_of_qualitatives=FALSE, add_interaction_columns=FALSE)
 68 |   expect_true(all(sapply(x@vars_info, function(var) {var$type=="quan" | !var$use_OD})))
 69 | })
 70 | 
 71 | test_that("Check bins_list of newInput()", {
 72 |   bins_list <- list(c(0, 1, 2))
 73 |   x <- newInput(createX(10, 0, 5, 0, 0), bins_list=bins_list)
 74 |   expect_equal(x@vars_info[[1]]$OD_info$breaks, bins_list[[1]])
 75 | 
 76 |   bins_names <- list(3)
 77 |   x <- newInput(createX(10, 0, 5, 0, 0), bins_list=bins_list, bins_names=bins_names)
 78 |   expect_equal(x@vars_info[[3]]$OD_info$breaks, bins_list[[1]])
 79 | 
 80 |   bins_names <- list("Num5")
 81 |   x <- newInput(createX(10, 0, 5, 0, 0), bins_list=bins_list, bins_names=bins_names)
 82 |   #expect_equal(x@vars_info[[5]]$OD_info$breaks, bins_list[[1]])
 83 | })
 84 | 
 85 | test_that("Check return values of getDesignMatrix()", {
 86 |   x_int <- newInput(createX(10, 1, 0, 0, 0), add_interaction_columns=FALSE)
 87 |   mat_int <- getDesignMatrix(x_int)
 88 |   #print("")
 89 |   #print(t(x_int@data))
 90 |   #print(mat_int)
 91 |   expect_equal(mat_int[,1], x_int@data[,1])
 92 |   expect_equal(dim(mat_int), c(10, dim(getODummyMatForOneVec(mat_int[,1])$dummy_mat)[2] + 1))
 93 | 
 94 |   x_num <- newInput(createX(10, 0, 1, 0, 0), add_interaction_columns=FALSE)
 95 |   mat_num <- getDesignMatrix(x_num)
 96 |   #print("")
 97 |   #print(t(x_num@data))
 98 |   #print(mat_num)
 99 |   expect_equal(mat_num[,1], x_num@data[,1])
100 |   expect_equal(dim(mat_num), c(10, dim(getODummyMatForOneVec(mat_num[,1])$dummy_mat)[2] + 1))
101 | 
102 |   x_ord <- newInput(createX(10, 0, 0, 1, 0), add_interaction_columns=FALSE)
103 |   mat_ord <- getDesignMatrix(x_ord)
104 |   #print("")
105 |   #print(t(x_ord@data))
106 |   #print(mat_ord)
107 |   expect_equal(dim(mat_ord), c(10,
108 |                                dim(getODummyMatForOneVec(x_ord@data[,1])$dummy_mat)[2]
109 |                                + dim(getUDummyMatForOneVec(x_ord@data[,1], drop_last=FALSE)$dummy_mat)[2]))
110 | 
111 |   x_fac <- newInput(createX(10, 0, 0, 0, 1), add_interaction_columns=FALSE)
112 |   mat_fac <- getDesignMatrix(x_fac)
113 |   #print("")
114 |   #print(t(x_fac@data))
115 |   #print(mat_fac)
116 |   expect_equal(dim(mat_fac), c(10, dim(getUDummyMatForOneVec(x_fac@data[,1], drop_last=FALSE)$dummy_mat)[2]))
117 | 
118 |   x_all <- newInput(data.frame(x_int@data, x_num@data, x_ord@data, x_fac@data), add_interaction_columns=FALSE)
119 |   mat_all <- getDesignMatrix(x_all)
120 |   expect_equal(mat_all, cbind(mat_int, mat_num, mat_ord, mat_fac))
121 | 
122 | 
123 |   ## Check interaction columns
124 |   x_inter <- newInput(data.frame(x_int@data,  x_fac@data), add_interaction_columns=TRUE)
125 |   mat_inter <- getDesignMatrix(x_inter)
126 |   a <- dim(mat_int)[2] + dim(mat_fac)[2]
127 |   b <- dim(x_int@data)[2] + dim(mat_fac)[2]
128 |   expect_equal(dim(mat_inter), c(10, a + b * (b - 1) / 2))
129 | })
130 | 


--------------------------------------------------------------------------------
/cran-comments_old.md:
--------------------------------------------------------------------------------
  1 | # Test environments
  2 | - local
  3 |   - x86_64-w64-mingw32
  4 | - r-hub
  5 |   - ubuntu-gcc-release
  6 |   - fedora-clang-devel
  7 | - win-builder (for R-release)
  8 |   - x86_64-w64-mingw32
  9 | 
 10 | 
 11 | # Results of `R CMD check --as-cran`
 12 | 
 13 | Results are identical in all the environments.
 14 | 
 15 | ```
 16 | Status: 1 NOTE
 17 | ```
 18 | 
 19 | ## Note #1
 20 | ```
 21 | * checking CRAN incoming feasibility ... NOTE
 22 | Maintainer: 'Kenji Kondo <kkondo.odnokk@gmail.com>'
 23 | 
 24 | New submission
 25 | ```
 26 | 
 27 | This note just says this is the first submission of me, and there is actually no problem.
 28 | 
 29 | 
 30 | # `revdepcheck` results
 31 | There are currently no downstream dependencies for this package.
 32 | 
 33 | 
 34 | # Resubmission
 35 | 
 36 | I got some instructions from CRAN and fixed them as below.
 37 | 
 38 | ## Instruction #1
 39 | 
 40 | ```
 41 | The Description field is intended to be a (one paragraph) description
 42 | of what the package does and why it may be useful.
 43 | Please add more details about the package functionality and implemented
 44 | methods in your Description text.
 45 | 
 46 | If there are references describing the methods in your package, please
 47 | add these in the description field of your DESCRIPTION file in the form
 48 | authors (year) <doi:...>
 49 | authors (year) <arXiv:...>
 50 | authors (year, ISBN:...)
 51 | or if those are not available: <https:...>
 52 | with no space after 'doi:', 'arXiv:', 'https:' and angle brackets for
 53 | auto-linking.
 54 | (If you want to add a title as well please put it in quotes: "Title")
 55 | ```
 56 | 
 57 | I fixed the description field of my DESCRIPTION file.
 58 | For consistency, I also fixed the beginning of 'aglm-package.Rd'.
 59 | 
 60 | I got a new note like below as a result, and just ignore it because these words are names of us and our method.
 61 | ```
 62 | Possibly mis-spelled words in DESCRIPTION:
 63 |   AGLM (10:75, 10:131, 10:339)
 64 |   Fujita (10:356)
 65 |   Hirokazu (10:395)
 66 |   Iwasawa (10:404)
 67 |   Kenji (10:379)
 68 |   Kondo (10:385)
 69 |   Suguru (10:349)
 70 |   Tanaka (10:371)
 71 |   Toyoto (10:364)
 72 | ```
 73 | 
 74 | Additionally, I noticed 'URL' and 'BugReports' fields are useful and added them as below:
 75 | ```
 76 | URL: https://github.com/kkondo1981/aglm
 77 | BugReports: https://github.com/kkondo1981/aglm/issues
 78 | ```
 79 | 
 80 | 
 81 | ## Instruction #2
 82 | ```
 83 | Please rather use the Authors@R field and declare Maintainer, Authors
 84 | and Contributors with their appropriate roles with person() calls.
 85 | e.g. something like:
 86 | Authors@R: c(person("Alice", "Developer", role = c("aut", "cre","cph"),
 87 |                       email = "alice.developer@some.domain.net"),
 88 |                person("Bob", "Dev", role = "aut") )
 89 | 
 90 | Please always add all authors, contributors and copyright holders in the
 91 | Authors@R field with the appropriate roles, instead of writing "others".
 92 | ```
 93 | 
 94 | I removed the 'Authors' and 'Maintainer' fields, added an 'Authors@R' field, and wrote down all the contributors instead of writing 'others' as below:
 95 | ```
 96 | Authors@R: c(
 97 |     person("Kenji", "Kondo", role=c("aut", "cre", "cph"), email="kkondo.odnokk@gmail.com"),
 98 |     person("Kazuhisa", "Takahashi", role=c("ctb")),
 99 |     person("Hikari", "Banno", role=c("ctb"))
100 |     )
101 | ```
102 | 
103 | I also fixed \\author tags in the following files for consistency:
104 | - aglm-package.Rd
105 | - aglm-Rd
106 | - cv.aglm.Rd
107 | - cva.aglm.Rd
108 | - plot.AccurateGLM.Rd
109 | - predict.AccurateGLM.Rd
110 | 
111 | 
112 | ## Instruction #3
113 | ```
114 | Please add \value to .Rd files regarding exported methods and explain
115 | the functions results in the documentation. Please write about the
116 | structure of the output (class) and also what the output means. (If a
117 | function does not return a value, please document that too, e.g.
118 | \value{No return value, called for side effects} or similar)
119 | Missing Rd-tags:
120 |       coef.AccurateGLM.Rd: \value
121 |       deviance.AccurateGLM.Rd: \value
122 |       plot.AccurateGLM.Rd: \value
123 |       print.AccurateGLM.Rd: \value
124 | ```
125 | 
126 | I added \\value tags to the abovementioned Rd files.
127 | I also checked all the explanations in \\value tags, and believe that they are enough.
128 | 
129 | 
130 | ## Instruction #4
131 | ```
132 | \dontrun{} should only be used if the example really cannot be executed
133 | (e.g. because of missing additional software, missing API keys, ...) by
134 | the user. That's why wrapping examples in \dontrun{} adds the comment
135 | ("# Not run:") as a warning for the user.
136 | Does not seem necessary.
137 | Please unwrap the examples if they are executable in < 5 sec, or replace
138 | \dontrun{} with \donttest{}.
139 | ```
140 | 
141 | I replaced \\dontrun{} in examples of the following functions to \\donttest{} because they take time > 5 sec:
142 | - `cv.aglm()`
143 | - `cva.aglm()`
144 | 
145 | 
146 | ## Instruction #5
147 | ```
148 | Please make sure that you do not change the user's options, par or
149 | working directory. If you really have to do so within functions, please
150 | ensure with an *immediate* call of on.exit() that the settings are reset
151 | when the function is exited. e.g.:
152 | ...
153 | oldpar <- par(no.readonly = TRUE)    # code line i
154 | on.exit(par(oldpar))            # code line i + 1
155 | ...
156 | par(mfrow=c(2,2))            # somewhere after
157 | ...
158 | e.g.: plot-aglm.R
159 | If you're not familiar with the function, please check ?on.exit. This
160 | function makes it possible to restore options before exiting a function
161 | even if the function breaks. Therefore it needs to be called immediately
162 | after the option change within a function.
163 | ```
164 | 
165 | I added two immidiate calls of `on.exit()` in 'plot-aglm.R' to restore `par` and `devAskNewPage`.
166 | Because there are possibly two calls `on.exit()` in one function, I set `add=TRUE` when call `on.exit()`.
167 | 


--------------------------------------------------------------------------------
/R/aglm.R:
--------------------------------------------------------------------------------
  1 | #' Fit an AGLM model with no cross-validation
  2 | #'
  3 | #' A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s).
  4 | #' See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
  5 | #'
  6 | #' @param x
  7 | #'   A design matrix.
  8 | #'   Usually a `data.frame` object is expected, but a `matrix` object is fine if all columns are of a same class.
  9 | #'   Each column may have one of the following classes, and `aglm` will automatically determine how to handle it:
 10 | #'   * `numeric`: interpreted as a quantitative variable. `aglm` performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies/L-variables).
 11 | #'   * `factor` (unordered) or `logical` : interpreted as a qualitative variable without order. `aglm` creates dummy variables suitable for unordered values (named U-dummies).
 12 | #'   * `ordered`: interpreted as a qualitative variable with order. `aglm` creates both O-dummies and U-dummies.
 13 | #'
 14 | #'   These dummy variables are added to `x` and form a larger matrix, which is used internally as an actual design matrix.
 15 | #'   See \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper} for more details on O-dummies, U-dummies, and L-variables.
 16 | #'
 17 | #'   If you need to change the default behavior, use the following options: `qualitative_vars_UD_only`, `qualitative_vars_both`, `qualitative_vars_OD_only`, and `quantitative_vars`.
 18 | #'
 19 | #' @param y
 20 | #'   A response variable.
 21 | #'
 22 | #' @param qualitative_vars_UD_only
 23 | #'   Used to change the default behavior of `aglm` for given variables.
 24 | #'   Variables specified by this parameter are considered as qualitative variables and only U-dummies are created as auxiliary columns.
 25 | #'   This parameter may have one of the following classes:
 26 | #'   * `integer`: specifying variables by index.
 27 | #'   * `character`: specifying variables by name.
 28 | #'
 29 | #' @param qualitative_vars_both
 30 | #'   Same as `qualitative_vars_UD_only`, except that both O-dummies and U-dummies are created for specified variables.
 31 | #'
 32 | #' @param qualitative_vars_OD_only
 33 | #'   Same as `qualitative_vars_UD_only`, except that both only O-dummies are created for specified variables.
 34 | #'
 35 | #' @param quantitative_vars
 36 | #'   Same as `qualitative_vars_UD_only`, except that specified variables are considered as quantitative variables.
 37 | #'
 38 | #' @param use_LVar
 39 | #'   Set to use L-variables.
 40 | #'   By default, `aglm` uses O-dummies as the representation of a quantitative variable.
 41 | #'   If `use_LVar=TRUE`, L-variables are used instead.
 42 | #'
 43 | #' @param extrapolation
 44 | #'   Used to control values of linear combination for quantitative variables, outside where the data exists.
 45 | #'   By default, values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists.
 46 | #'   You can set `extrapolation="flat"` to get constant values outside the data instead.
 47 | #'
 48 | #' @param add_linear_columns
 49 | #'   By default, for quantitative variables, `aglm` expands them by adding dummies and the original columns, i.e. the linear effects, are remained in the resulting model.
 50 | #'   You can set `add_linear_columns=FALSE` to drop linear effects.
 51 | #'
 52 | #' @param add_OD_columns_of_qualitatives
 53 | #'   Set to `FALSE` if you do not want to use O-dummies for qualitative variables with order (usually, columns with `ordered` class).
 54 | #'
 55 | #' @param add_interaction_columns
 56 | #'   If this parameter is set to `TRUE`, `aglm` creates an additional auxiliary variable `x_i * x_j` for each pair `(x_i, x_j)` of variables.
 57 | #'
 58 | #' @param OD_type_of_quantitatives
 59 | #'   Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).
 60 | #'
 61 | #' @param family
 62 | #'   A `family` object or a string representing the type of the error distribution.
 63 | #'   Currently `aglm` supports `gaussian`, `binomial`, and `poisson`.
 64 | #'
 65 | #' @param nbin.max
 66 | #'   An integer representing the maximum number of bins when `aglm` perform binning for quantitative variables.
 67 | #'
 68 | #' @param bins_list
 69 | #'   Used to set custom bins for variables with O-dummies.
 70 | #'
 71 | #' @param bins_names
 72 | #'   Used to set custom bins for variables with O-dummies.
 73 | #'
 74 | #' @param ...
 75 | #'   Other arguments are passed directly when calling `glmnet()`.
 76 | #'
 77 | #' @return
 78 | #'   A model object fitted to the data.
 79 | #'   Functions such as `predict` and `plot` can be applied to the returned object.
 80 | #'   See \link{AccurateGLM-class} for more details.
 81 | #'
 82 | #'
 83 | #' @example examples/aglm-1.R
 84 | #' @example examples/aglm-2.R
 85 | #' @example examples/lvar-and-extrapolation.R
 86 | #'
 87 | #'
 88 | #' @author
 89 | #'   * Kenji Kondo,
 90 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
 91 | #'
 92 | #'
 93 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
 94 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
 95 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
 96 | #' \emph{Actuarial Colloquium Paris 2020}
 97 | #'
 98 | #'
 99 | #' @export
100 | #' @importFrom assertthat assert_that
101 | #' @importFrom glmnet glmnet
102 | #' @importFrom methods new
103 | aglm <- function(x, y,
104 |                  qualitative_vars_UD_only=NULL,
105 |                  qualitative_vars_both=NULL,
106 |                  qualitative_vars_OD_only=NULL,
107 |                  quantitative_vars=NULL,
108 |                  use_LVar=FALSE,
109 |                  extrapolation="default",
110 |                  add_linear_columns=TRUE,
111 |                  add_OD_columns_of_qualitatives=TRUE,
112 |                  add_interaction_columns=FALSE,
113 |                  OD_type_of_quantitatives="C",
114 |                  nbin.max=NULL,
115 |                  bins_list=NULL,
116 |                  bins_names=NULL,
117 |                  family=c("gaussian","binomial","poisson"),
118 |                  ...) {
119 |   # Create an input object
120 |   x <- newInput(x,
121 |                 qualitative_vars_UD_only=qualitative_vars_UD_only,
122 |                 qualitative_vars_both=qualitative_vars_both,
123 |                 qualitative_vars_OD_only=qualitative_vars_OD_only,
124 |                 quantitative_vars=quantitative_vars,
125 |                 use_LVar=use_LVar,
126 |                 extrapolation=extrapolation,
127 |                 add_linear_columns=add_linear_columns,
128 |                 add_OD_columns_of_qualitatives=add_OD_columns_of_qualitatives,
129 |                 add_interaction_columns=add_interaction_columns,
130 |                 OD_type_of_quantitatives=OD_type_of_quantitatives,
131 |                 nbin.max,
132 |                 bins_list,
133 |                 bins_names)
134 | 
135 |   # Check y
136 |   y <- drop(y)
137 |   y <- as.numeric(y)
138 |   #assert_that(class(y) == "integer" | class(y) == "numeric")
139 |   assert_that(length(y) == dim(x@data)[1])
140 | 
141 |   # Check family
142 |   if (is.character(family))
143 |     family <- match.arg(family)
144 | 
145 |   # Create a design matrix which is passed to backend API
146 |   x_for_backend <- getDesignMatrix(x)
147 | 
148 |   # Data size
149 |   nobs <- dim(x_for_backend)[1]
150 |   nvars <- dim(x_for_backend)[2]
151 |   assert_that(length(y) == nobs)
152 | 
153 |   # Call backend
154 |   args <- list(x=x_for_backend,
155 |                y=y,
156 |                family=family,
157 |                ...)
158 |   glmnet_result <- do.call(glmnet, args)
159 | 
160 |   return(new("AccurateGLM", backend_models=list(glmnet=glmnet_result), vars_info=x@vars_info, call=match.call()))
161 | }
162 | 


--------------------------------------------------------------------------------
/man/aglm.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/aglm.R
  3 | \name{aglm}
  4 | \alias{aglm}
  5 | \title{Fit an AGLM model with no cross-validation}
  6 | \usage{
  7 | aglm(
  8 |   x,
  9 |   y,
 10 |   qualitative_vars_UD_only = NULL,
 11 |   qualitative_vars_both = NULL,
 12 |   qualitative_vars_OD_only = NULL,
 13 |   quantitative_vars = NULL,
 14 |   use_LVar = FALSE,
 15 |   extrapolation = "default",
 16 |   add_linear_columns = TRUE,
 17 |   add_OD_columns_of_qualitatives = TRUE,
 18 |   add_interaction_columns = FALSE,
 19 |   OD_type_of_quantitatives = "C",
 20 |   nbin.max = NULL,
 21 |   bins_list = NULL,
 22 |   bins_names = NULL,
 23 |   family = c("gaussian", "binomial", "poisson"),
 24 |   ...
 25 | )
 26 | }
 27 | \arguments{
 28 | \item{x}{A design matrix.
 29 | Usually a \code{data.frame} object is expected, but a \code{matrix} object is fine if all columns are of a same class.
 30 | Each column may have one of the following classes, and \code{aglm} will automatically determine how to handle it:
 31 | \itemize{
 32 | \item \code{numeric}: interpreted as a quantitative variable. \code{aglm} performs discretization by binning, and creates dummy variables suitable for ordered values (named O-dummies/L-variables).
 33 | \item \code{factor} (unordered) or \code{logical} : interpreted as a qualitative variable without order. \code{aglm} creates dummy variables suitable for unordered values (named U-dummies).
 34 | \item \code{ordered}: interpreted as a qualitative variable with order. \code{aglm} creates both O-dummies and U-dummies.
 35 | }
 36 | 
 37 | These dummy variables are added to \code{x} and form a larger matrix, which is used internally as an actual design matrix.
 38 | See \href{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1}{our paper} for more details on O-dummies, U-dummies, and L-variables.
 39 | 
 40 | If you need to change the default behavior, use the following options: \code{qualitative_vars_UD_only}, \code{qualitative_vars_both}, \code{qualitative_vars_OD_only}, and \code{quantitative_vars}.}
 41 | 
 42 | \item{y}{A response variable.}
 43 | 
 44 | \item{qualitative_vars_UD_only}{Used to change the default behavior of \code{aglm} for given variables.
 45 | Variables specified by this parameter are considered as qualitative variables and only U-dummies are created as auxiliary columns.
 46 | This parameter may have one of the following classes:
 47 | \itemize{
 48 | \item \code{integer}: specifying variables by index.
 49 | \item \code{character}: specifying variables by name.
 50 | }}
 51 | 
 52 | \item{qualitative_vars_both}{Same as \code{qualitative_vars_UD_only}, except that both O-dummies and U-dummies are created for specified variables.}
 53 | 
 54 | \item{qualitative_vars_OD_only}{Same as \code{qualitative_vars_UD_only}, except that both only O-dummies are created for specified variables.}
 55 | 
 56 | \item{quantitative_vars}{Same as \code{qualitative_vars_UD_only}, except that specified variables are considered as quantitative variables.}
 57 | 
 58 | \item{use_LVar}{Set to use L-variables.
 59 | By default, \code{aglm} uses O-dummies as the representation of a quantitative variable.
 60 | If \code{use_LVar=TRUE}, L-variables are used instead.}
 61 | 
 62 | \item{extrapolation}{Used to control values of linear combination for quantitative variables, outside where the data exists.
 63 | By default, values of a linear combination outside the data is extended based on the slope of the edges of the region where the data exists.
 64 | You can set \code{extrapolation="flat"} to get constant values outside the data instead.}
 65 | 
 66 | \item{add_linear_columns}{By default, for quantitative variables, \code{aglm} expands them by adding dummies and the original columns, i.e. the linear effects, are remained in the resulting model.
 67 | You can set \code{add_linear_columns=FALSE} to drop linear effects.}
 68 | 
 69 | \item{add_OD_columns_of_qualitatives}{Set to \code{FALSE} if you do not want to use O-dummies for qualitative variables with order (usually, columns with \code{ordered} class).}
 70 | 
 71 | \item{add_interaction_columns}{If this parameter is set to \code{TRUE}, \code{aglm} creates an additional auxiliary variable \code{x_i * x_j} for each pair \verb{(x_i, x_j)} of variables.}
 72 | 
 73 | \item{OD_type_of_quantitatives}{Used to control the shape of linear combinations obtained by O-dummies for quantitative variables (deprecated).}
 74 | 
 75 | \item{nbin.max}{An integer representing the maximum number of bins when \code{aglm} perform binning for quantitative variables.}
 76 | 
 77 | \item{bins_list}{Used to set custom bins for variables with O-dummies.}
 78 | 
 79 | \item{bins_names}{Used to set custom bins for variables with O-dummies.}
 80 | 
 81 | \item{family}{A \code{family} object or a string representing the type of the error distribution.
 82 | Currently \code{aglm} supports \code{gaussian}, \code{binomial}, and \code{poisson}.}
 83 | 
 84 | \item{...}{Other arguments are passed directly when calling \code{glmnet()}.}
 85 | }
 86 | \value{
 87 | A model object fitted to the data.
 88 | Functions such as \code{predict} and \code{plot} can be applied to the returned object.
 89 | See \link{AccurateGLM-class} for more details.
 90 | }
 91 | \description{
 92 | A basic fitting function with given \eqn{\alpha} and \eqn{\lambda} (s).
 93 | See \link{aglm-package} for more details on \eqn{\alpha} and \eqn{\lambda}.
 94 | }
 95 | \examples{
 96 | 
 97 | #################### Gaussian case ####################
 98 | 
 99 | library(MASS) # For Boston
100 | library(aglm)
101 | 
102 | ## Read data
103 | xy <- Boston # xy is a data.frame to be processed.
104 | colnames(xy)[ncol(xy)] <- "y" # Let medv be the objective variable, y.
105 | 
106 | ## Split data into train and test
107 | n <- nrow(xy) # Sample size.
108 | set.seed(2018) # For reproducibility.
109 | test.id <- sample(n, round(n/4)) # ID numbders for test data.
110 | test <- xy[test.id,] # test is the data.frame for testing.
111 | train <- xy[-test.id,] # train is the data.frame for training.
112 | x <- train[-ncol(xy)]
113 | y <- train$y
114 | newx <- test[-ncol(xy)]
115 | y_true <- test$y
116 | 
117 | ## Fit the model
118 | model <- aglm(x, y)  # alpha=1 (the default value)
119 | 
120 | ## Predict for various alpha and lambda
121 | lambda <- 0.1
122 | y_pred <- predict(model, newx=newx, s=lambda)
123 | rmse <- sqrt(mean((y_true - y_pred)^2))
124 | cat(sprintf("RMSE for lambda=\%.2f: \%.5f \n\n", lambda, rmse))
125 | 
126 | lambda <- 1.0
127 | y_pred <- predict(model, newx=newx, s=lambda)
128 | rmse <- sqrt(mean((y_true - y_pred)^2))
129 | cat(sprintf("RMSE for lambda=\%.2f: \%.5f \n\n", lambda, rmse))
130 | 
131 | alpha <- 0
132 | model <- aglm(x, y, alpha=alpha)
133 | 
134 | lambda <- 0.1
135 | y_pred <- predict(model, newx=newx, s=lambda)
136 | rmse <- sqrt(mean((y_true - y_pred)^2))
137 | cat(sprintf("RMSE for alpha=\%.2f and lambda=\%.2f: \%.5f \n\n", alpha, lambda, rmse))
138 | 
139 | #################### Binomial case ####################
140 | 
141 | library(aglm)
142 | library(faraway)
143 | 
144 | ## Read data
145 | xy <- nes96
146 | 
147 | ## Split data into train and test
148 | n <- nrow(xy) # Sample size.
149 | set.seed(2018) # For reproducibility.
150 | test.id <- sample(n, round(n/5)) # ID numbders for test data.
151 | test <- xy[test.id,] # test is the data.frame for testing.
152 | train <- xy[-test.id,] # train is the data.frame for training.
153 | x <- train[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
154 | y <- train$vote
155 | newx <- test[, c("popul", "TVnews", "selfLR", "ClinLR", "DoleLR", "PID", "age", "educ", "income")]
156 | 
157 | ## Fit the model
158 | model <- aglm(x, y, family="binomial")
159 | 
160 | ## Make the confusion matrix
161 | lambda <- 0.1
162 | y_true <- test$vote
163 | y_pred <- levels(y_true)[as.integer(predict(model, newx, s=lambda, type="class"))]
164 | 
165 | print(table(y_true, y_pred))
166 | 
167 | #################### use_LVar and extrapolation ####################
168 | 
169 | library(MASS) # For Boston
170 | library(aglm)
171 | 
172 | ## Randomly created train and test data
173 | set.seed(2021)
174 | sd <- 0.2
175 | x <- 2 * runif(1000) + 1
176 | f <- function(x){x^3 - 6 * x^2 + 13 * x}
177 | y <- f(x) + rnorm(1000, sd = sd)
178 | xy <- data.frame(x=x, y=y)
179 | x_test <- seq(0.75, 3.25, length.out=101)
180 | y_test <- f(x_test) + rnorm(101, sd=sd)
181 | xy_test <- data.frame(x=x_test, y=y_test)
182 | 
183 | ## Plot
184 | nbin.max <- 10
185 | models <- c(cv.aglm(x, y, use_LVar=FALSE, extrapolation="default", nbin.max=nbin.max),
186 |             cv.aglm(x, y, use_LVar=FALSE, extrapolation="flat", nbin.max=nbin.max),
187 |             cv.aglm(x, y, use_LVar=TRUE, extrapolation="default", nbin.max=nbin.max),
188 |             cv.aglm(x, y, use_LVar=TRUE, extrapolation="flat", nbin.max=nbin.max))
189 | 
190 | titles <- c("O-Dummies with extrapolation=\"default\"",
191 |             "O-Dummies with extrapolation=\"flat\"",
192 |             "L-Variables with extrapolation=\"default\"",
193 |             "L-Variables with extrapolation=\"flat\"")
194 | 
195 | par.old <- par(mfrow=c(2, 2))
196 | for (i in 1:4) {
197 |   model <- models[[i]]
198 |   title <- titles[[i]]
199 | 
200 |   pred <- predict(model, newx=x_test, s=model@lambda.min, type="response")
201 | 
202 |   plot(x_test, y_test, pch=20, col="grey", main=title)
203 |   lines(x_test, f(x_test), lty="dashed", lwd=2)  # the theoretical line
204 |   lines(x_test, pred, col="blue", lwd=3)  # the smoothed line by the model
205 | }
206 | par(par.old)
207 | }
208 | \references{
209 | Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
210 | \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
211 | \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
212 | \emph{Actuarial Colloquium Paris 2020}
213 | }
214 | \author{
215 | \itemize{
216 | \item Kenji Kondo,
217 | \item Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
218 | }
219 | }
220 | 


--------------------------------------------------------------------------------
/R/plot-aglm.R:
--------------------------------------------------------------------------------
  1 | #' Plot contribution of each variable and residuals
  2 | #'
  3 | #' @param x
  4 | #'   A model object obtained from `aglm()` or `cv.aglm()`.
  5 | #'
  6 | #' @param vars
  7 | #'   Used to specify variables to be plotted (`NULL` means all the variables).
  8 | #'   This parameter may have one of the following classes:
  9 | #'   * `integer`: specifying variables by index.
 10 | #'   * `character`: specifying variables by name.
 11 | #'
 12 | #' @param verbose
 13 | #'   Set to `FALSE` if textual outputs are not needed.
 14 | #'
 15 | #' @param s
 16 | #'   A numeric value specifying \eqn{\lambda} at which plotting is required.
 17 | #'   Note that plotting for multiple \eqn{\lambda}'s are not allowed and `s` always should be a single value.
 18 | #'   When the model is trained with only a single \eqn{\lambda} value, just set it to `NULL` to plot for that value.
 19 | #'
 20 | #' @param resid
 21 | #'   Used to display residuals in plots.
 22 | #'   This parameter may have one of the following classes:
 23 | #'   * `logical`(single value): If `TRUE`, working residuals are plotted.
 24 | #'   * `character`(single value): type of residual to be plotted. See \link{residuals.AccurateGLM} for more details on types of residuals.
 25 | #'   * `numerical`(vector): residual values to be plotted.
 26 | #'
 27 | #' @param smooth_resid
 28 | #'   Used to display smoothing lines of residuals for quantitative variables.
 29 | #'   This parameter may have one of the following classes:
 30 | #'   * `logical`: If `TRUE`, smoothing lines are drawn.
 31 | #'   * `character`:
 32 | #'     * `smooth_resid="both"`: Balls and smoothing lines are drawn.
 33 | #'     * `smooth_resid="smooth_only"`: Only smoothing lines are drawn.
 34 | #'
 35 | #' @param smooth_resid_fun
 36 | #'   Set if users need custom smoothing functions.
 37 | #'
 38 | #' @param ask
 39 | #'   By default, `plot()` stops and waits inputs each time plotting for each variable is completed.
 40 | #'   Users can set `ask=FALSE` to avoid this.
 41 | #'   It is useful, for example, when using devices as `bmp` to create image files.
 42 | #'
 43 | #' @param layout
 44 | #'   Plotting multiple variables for each page is allowed.
 45 | #'   To achieve this, set it to a pair of integer, which indicating number of rows and columns, respectively.
 46 | #'
 47 | #' @param only_plot
 48 | #'   Set to `TRUE` if no automatic graphical configurations are needed.
 49 | #'
 50 | #' @param main
 51 | #'   Used to specify the title of plotting.
 52 | #'
 53 | #' @param add_rug
 54 | #'   Set to `TRUE` for rug plots.
 55 | #'
 56 | #' @param ...
 57 | #'   Other arguments are currently not used and just discarded.
 58 | #'
 59 | #' @return
 60 | #'   No return value, called for side effects.
 61 | #'
 62 | #'
 63 | #' @example examples/predict-and-plot-1.R
 64 | #'
 65 | #'
 66 | #' @author
 67 | #'   * Kenji Kondo,
 68 | #'   * Kazuhisa Takahashi and Hikari Banno (worked on L-Variable related features)
 69 | #'
 70 | #'
 71 | #' @references Suguru Fujita, Toyoto Tanaka, Kenji Kondo and Hirokazu Iwasawa. (2020)
 72 | #' \emph{AGLM: A Hybrid Modeling Method of GLM and Data Science Techniques}, \cr
 73 | #' \url{https://www.institutdesactuaires.com/global/gene/link.php?doc_id=16273&fg=1} \cr
 74 | #' \emph{Actuarial Colloquium Paris 2020}
 75 | #'
 76 | #'
 77 | #' @export
 78 | #' @importFrom assertthat assert_that
 79 | #' @importFrom utils str
 80 | #' @importFrom utils flush.console
 81 | #' @importFrom stats getCall
 82 | #' @importFrom stats residuals
 83 | #' @importFrom stats coef
 84 | #' @importFrom stats IQR
 85 | #' @importFrom stats smooth.spline
 86 | #' @importFrom stats ksmooth
 87 | #' @importFrom graphics par
 88 | #' @importFrom graphics points
 89 | #' @importFrom graphics lines
 90 | #' @importFrom graphics rug
 91 | #' @importFrom graphics mtext
 92 | #' @importFrom graphics boxplot
 93 | #' @importFrom graphics barplot
 94 | #' @importFrom grDevices devAskNewPage
 95 | plot.AccurateGLM <- function(x,
 96 |                              vars=NULL,
 97 |                              verbose=TRUE,
 98 |                              s=NULL,
 99 |                              resid=FALSE,
100 |                              smooth_resid=TRUE,
101 |                              smooth_resid_fun=NULL,
102 |                              ask=TRUE,
103 |                              layout=c(2,2),
104 |                              only_plot=FALSE,
105 |                              main="",
106 |                              add_rug=FALSE,
107 |                              ...) {
108 |   # It's necessary to use same names for some arguments as the original methods,
109 |   # because devtools::check() issues warnings when using inconsistent names.
110 |   # As a result, we sometimes should accept uncomfortable argument names,
111 |   # but still have rights to use preferable names internally.
112 |   model <- x
113 | 
114 |   nvars <- length(model@vars_info)
115 | 
116 |   if (is.null(vars)) {
117 |     inds <- seq(nvars)
118 |   } else if (is.numeric(vars)) {
119 |     inds <- unique(sort(vars))
120 |   } else if (is.character(vars)) {
121 |     inds <- NULL
122 |     for (i in seq(nvars)) {
123 |       var_name <- model@vars_info[[i]]$name
124 |       if (var_name %in% vars) inds <- c(inds, i)
125 |     }
126 |   } else {
127 |     assert_that(FALSE)
128 |   }
129 | 
130 |   ## Calculates residuals
131 |   use_x.orig <- if (is.logical(resid)) resid else TRUE
132 |   if (use_x.orig) {
133 |     call.orig <- getCall(model)
134 |     x.orig <- eval.parent(call.orig$x)
135 |     if (class(x.orig)[1] != "data.frame")
136 |       x.orig <- data.frame(x.orig)
137 | 
138 |     if (is.numeric(resid)) {
139 |       resids <- resid
140 |       resid <- TRUE
141 |     } else if (is.character(resid)) {
142 |       resids <- residuals(model, x=x.orig, s=s, type=resid)
143 |       resid <- TRUE
144 |     } else {
145 |       resids <- residuals(model, x=x.orig, s=s, type="working")
146 |     }
147 |     assert_that(nrow(x.orig) == length(resids))
148 |   }
149 | 
150 |   ## set flags for smoothing
151 |   if (resid) {
152 |     if (is.character(smooth_resid)) {
153 |       draws_balls <- smooth_resid == "both"
154 |       draws_lines <- TRUE
155 |     } else {
156 |       draws_balls <- TRUE
157 |       draws_lines <- smooth_resid
158 |     }
159 |   }
160 | 
161 |   ## set par
162 |   if (!only_plot) {
163 |     old.par <- par(no.readonly=TRUE)
164 |     on.exit(par(old.par), add=TRUE)
165 |     par(oma=c(0, 0, 2, 0))
166 |     if (length(inds) == 1) layout <- c(1,1)
167 |     par(mfrow=layout)
168 |   }
169 | 
170 |   ask.old <- devAskNewPage()
171 |   on.exit(devAskNewPage(ask.old), add=TRUE)
172 |   devAskNewPage(FALSE)
173 | 
174 |   ## Plotting
175 |   for (i in inds) {
176 |     var_info <- model@vars_info[[i]]
177 |     if (var_info$type == "inter") break ## no plot for interactions
178 | 
179 |     coefs <- coef(model, index=var_info$idx, s=s)
180 | 
181 |     if (resid) {
182 |       xlab <- var_info$name
183 |       ylab <- "Comp + Resid"
184 |     } else {
185 |       xlab <- var_info$name
186 |       ylab <- "Comp"
187 |     }
188 | 
189 |     first <- TRUE
190 |     if (var_info$type == "quan") {
191 |       # Plot for numeric features
192 | 
193 |       ## Calculates range of x to be plotted
194 |       if (var_info$use_LV) {
195 |         breaks <- var_info$LV_info$breaks
196 |       } else {
197 |         breaks <- var_info$OD_info$breaks
198 |       }
199 |       breaks <- breaks[abs(breaks) < Inf]  # get rid of -Inf and Inf
200 |       x.min <- min(breaks)
201 |       x.max <- max(breaks)
202 |       x.d <- x.max - x.min
203 |       assert_that(x.d > 0)
204 | 
205 |       x.min <- x.min - 0.05 * x.d
206 |       x.max <- x.max + 0.05 * x.d
207 |       x.d <- x.max - x.min
208 | 
209 |       ## Extract x values to be plotted
210 |       x <- x.min + (0:2000) / 2000 * x.d
211 | 
212 |       ## Calculates component values of x
213 |       x.mat <- getMatrixRepresentationByVector(x, var_info)
214 |       if (var_info$use_LV) {
215 |         b <- matrix(c(coefs$coef.linear, coefs$coef.LV), ncol=1)
216 |       } else {
217 |         b <- matrix(c(coefs$coef.linear, coefs$coef.OD), ncol=1)
218 |       }
219 |       comp <- drop(x.mat %*% b)
220 | 
221 |       ## Calculates component and residual values of samples
222 |       x.sample <- NULL
223 |       c_and_r.sample <- NULL
224 |       if (resid) {
225 |         x.sample <- x.orig[, var_info$data_column_idx]
226 |         x.sample.mat <- getMatrixRepresentationByVector(x.sample, var_info)
227 |         c_and_r.sample <- drop(x.sample.mat %*% b) + resids
228 | 
229 |         if (draws_lines) {
230 |           ord <- order(x.sample)
231 |           f <- smooth_resid_fun
232 |           if (is.null(f)) {
233 |             if (length(unique(x.sample)) >= 4 & IQR(x.sample) > 0)
234 |               f <- smooth.spline
235 |             else
236 |               f <- ksmooth
237 |           }
238 |           smoothed_c_and_r.sample <- f(x.sample[ord], c_and_r.sample[ord])
239 |         }
240 |       }
241 | 
242 |       ## Plotting
243 |       x.all <- c(x, x.sample)
244 |       xlim <- c(min(x.all), max(x.all))
245 |       xlim[1] <- xlim[1] - 0.05 * (xlim[2] - xlim[1])
246 |       xlim[2] <- xlim[2] + 0.05 * (xlim[2] - xlim[1])
247 | 
248 |       y.all <- comp
249 |       if (resid) {
250 |         if (draws_balls)
251 |           y.all <- c(y.all, c_and_r.sample)
252 |         if (draws_lines)
253 |           y.all <- c(y.all, smoothed_c_and_r.sample$y[!is.na(smoothed_c_and_r.sample$y)])
254 |       }
255 |       ylim <- c(min(y.all), max(y.all))
256 |       ylim[1] <- ylim[1] - 0.05 * (ylim[2] - ylim[1])
257 |       ylim[2] <- ylim[2] + 0.05 * (ylim[2] - ylim[1])
258 | 
259 |       plot(x=x,
260 |            y=comp,
261 |            type="n",
262 |            main=main,
263 |            xlab=xlab,
264 |            ylab=ylab,
265 |            xlim=xlim,
266 |            ylim=ylim)
267 | 
268 |       if (resid) {
269 |         if (draws_balls) {
270 |           points(x=x.sample,
271 |                  y=c_and_r.sample,
272 |                  pch=".")
273 |         }
274 | 
275 |         if (draws_lines) {
276 |           lines(smoothed_c_and_r.sample,
277 |                 col="blue",
278 |                 lty=5)
279 |         }
280 | 
281 |         if (add_rug) {
282 |           rug(x=x.sample,
283 |               col="gray")
284 |         }
285 |       }
286 | 
287 |       lines(x=x,
288 |             y=comp)
289 |     } else if (var_info$type == "qual") {
290 |       # Plot for factorial features
291 | 
292 |       ## All levels to be plotted
293 |       lv <- var_info$UD_info$levels
294 |       x <- if (var_info$use_OD) {
295 |         ordered(lv, levels=lv)
296 |       } else {
297 |         factor(lv, levels=lv)
298 |       }
299 | 
300 |       ## Calculate component values of x
301 |       x.mat <- getMatrixRepresentationByVector(x, var_info)
302 |       b <- matrix(c(coefs$coef.OD, coefs$coef.UD), ncol=1)
303 |       comp <- drop(x.mat %*% b)
304 | 
305 |       # Calculates component and residual values of samples
306 |       x.sample <- NULL
307 |       c_and_r.sample <- NULL
308 |       if (resid) {
309 |         x.sample <- x.orig[, var_info$data_column_idx]
310 |         x.sample <- if (var_info$use_OD) {
311 |           ordered(x.sample, levels=lv)
312 |         } else {
313 |           factor(x.sample, levels=lv)
314 |         }
315 |         x.sample.mat <- getMatrixRepresentationByVector(x.sample, var_info)
316 |         c_and_r.sample <- drop(x.sample.mat %*% b) + resids
317 |       }
318 | 
319 |       if (resid) {
320 |         y.all <- c(comp, c_and_r.sample)
321 | 
322 |         boxplot(c_and_r.sample ~ x.sample,
323 |                 main=main,
324 |                 xlab=xlab,
325 |                 ylab=ylab,
326 |                 outline=FALSE)
327 |       } else {
328 |         barplot(comp,
329 |                 names=lv,
330 |                 main=main,
331 |                 xlab=xlab,
332 |                 ylab=ylab)
333 |       }
334 |     }
335 | 
336 |     if (verbose) {
337 |       cat(sprintf("Plotting for %s", var_info$name))
338 |       cat("Variable Informations:\n"); str(var_info); cat("\n")
339 |       cat("Coefficients:\n"); str(coefs); cat("\n")
340 |     }
341 | 
342 |     flush.console() # this makes sure that the display is current
343 | 
344 | 
345 |     if (first) {
346 |       if (!only_plot) {
347 |         if (resid) mtext(line=0, outer=TRUE, text="Component + Residual Plot")
348 |         else mtext(line=0, outer=TRUE, text="Component Plot")
349 |       }
350 |       devAskNewPage(ask)
351 |       first <- FALSE
352 |     }
353 |   }
354 | }
355 | 


--------------------------------------------------------------------------------
/R/aglm-input.R:
--------------------------------------------------------------------------------
  1 | #' S4 class for input
  2 | #'
  3 | #' @slot vars_info A list, each of whose element is information of one variable.
  4 | #' @slot data The original data.
  5 | setClass("AGLM_Input",
  6 |          representation=representation(vars_info="list", data="data.frame"))
  7 | 
  8 | 
  9 | # An inner-use function for creating a new AGLM_Input object
 10 | #' @importFrom assertthat assert_that
 11 | #' @importFrom methods new
 12 | newInput <- function(x,
 13 |                      qualitative_vars_UD_only=NULL,
 14 |                      qualitative_vars_both=NULL,
 15 |                      qualitative_vars_OD_only=NULL,
 16 |                      quantitative_vars=NULL,
 17 |                      use_LVar=FALSE,
 18 |                      extrapolation="default",
 19 |                      add_linear_columns=TRUE,
 20 |                      add_OD_columns_of_qualitatives=TRUE,
 21 |                      add_interaction_columns=TRUE,
 22 |                      OD_type_of_quantitatives="C",
 23 |                      nbin.max=NULL,
 24 |                      bins_list=NULL,
 25 |                      bins_names=NULL) {
 26 |   # Check and process arguments
 27 |   assert_that(!is.null(x))
 28 |   if (class(x)[1] != "data.frame") x <- data.frame(x)
 29 |   assert_that(dim(x)[2] > 0)
 30 | 
 31 |   # Calculate data size
 32 |   nobs <- dim(x)[1]
 33 |   nvar <- dim(x)[2]
 34 |   assert_that(nobs > 0 & nvar > 0)
 35 | 
 36 | 
 37 |   # Create a list  (explanatory) variables' information.
 38 |   # Firstly create without considering qualitative_vars_UD_only, qualitative_vars_both, ...
 39 |   vars_info <- list()
 40 | 
 41 |   for (i in seq(nvar)) {
 42 |     var <- list()
 43 |     var$idx <- i
 44 |     var$name <- names(x)[i]
 45 |     var$data_column_idx <- i
 46 | 
 47 |     var$type <- if (is.factor(x[, i]) | is.logical(x[, i])) {"qual"} else {"quan"}
 48 |     is_ordered <- (var$type == "qual" & is.ordered(x[, i])) | (var$type == "quan")
 49 | 
 50 |     var$use_linear <- var$type == "quan" & add_linear_columns
 51 |     var$use_UD <- var$type == "qual"
 52 |     var$use_OD <- (var$type == "quan" & OD_type_of_quantitatives != "N") |
 53 |       (var$type == "qual" & is_ordered & add_OD_columns_of_qualitatives)
 54 |     if (var$use_OD) {
 55 |       if (var$type == "quan") var$OD_type <- OD_type_of_quantitatives
 56 |       else var$OD_type <- "J"
 57 |     } else {
 58 |       if (var$type == "quan") {
 59 |         # Even cases not using O-dummies for quantitatives,
 60 |         # we should store minimum and maximum values to calculate plotting range
 61 |         # in plot.AccurateGLM()
 62 |         x_vec <- x[, i]
 63 |         var$OD_info$breaks <- c(min(x_vec), max(x_vec))
 64 |       }
 65 |     }
 66 |     var$use_LV <- var$type == "quan" & use_LVar
 67 |     if (var$use_LV) {
 68 |       var$use_OD <- FALSE
 69 |     }
 70 |     var$extrapolation <- extrapolation
 71 | 
 72 |     vars_info[[i]] <- var
 73 |   }
 74 | 
 75 | 
 76 |   # Define a utility function
 77 |   var_names <- sapply(vars_info, function(var) {return(var$name)})
 78 |   get_idx <- function(idxs_or_names) {
 79 |     if (is.null(idxs_or_names)) {
 80 |       return(integer(0))
 81 |     }
 82 | 
 83 |     cl <- class(idxs_or_names)
 84 |     idxs <- seq(length(var_names))
 85 |     if (cl == "integer") {
 86 |       is_hit_i <- function(idx) {return(idx %in% idxs_or_names)}
 87 |       idxs <- idxs[sapply(idxs, is_hit_i)]
 88 |     } else if (cl == "character") {
 89 |       is_hit_c <- function(var_name) {return(var_name %in% idxs_or_names)}
 90 |       idxs <- idxs[sapply(var_names, is_hit_c)]
 91 |     } else {
 92 |       assert_that(FALSE, msg="qualitative_vars_UD_only, qualitative_vars_both, qualitative_vars_both, quantitative_vars should be integer or character vectors.")
 93 |     }
 94 |   }
 95 | 
 96 | 
 97 |   # Get indices of variables specified by qualitative_vars_UD_only, qualitative_vars_both, ...
 98 |   qual_UD <- get_idx(qualitative_vars_UD_only)
 99 |   qual_OD <- get_idx(qualitative_vars_both)
100 |   qual_both <- get_idx(qualitative_vars_both)
101 |   quan <- get_idx(quantitative_vars)
102 | 
103 |   # Check if no variables are doubly counted.
104 |   msg <- paste0("Each pair of qualitative_vars_UD_only, qualitative_vars_both, qualitative_vars_both, ",
105 |                 "and quantitative_vars shouldn't be overlapped.")
106 |   assert_that(length(intersect(qual_UD, qual_OD)) == 0, msg=msg)
107 |   assert_that(length(intersect(qual_UD, qual_both)) == 0, msg=msg)
108 |   assert_that(length(intersect(qual_UD, quan)) == 0, msg=msg)
109 |   assert_that(length(intersect(qual_OD, qual_both)) == 0, msg=msg)
110 |   assert_that(length(intersect(qual_OD, quan)) == 0, msg=msg)
111 |   assert_that(length(intersect(qual_both, quan)) == 0, msg=msg)
112 | 
113 | 
114 |   # Modify vars_info using qualitative_vars_UD_only, qualitative_vars_both, ...
115 |   for (i in qual_UD) {
116 |     var <- vars_info[[i]]
117 |     var$type <- "qual"
118 | 
119 |     var$use_linear <- FALSE
120 |     var$use_UD <- TRUE
121 |     var$use_OD <- FALSE
122 |     var$use_LV <- FALSE
123 | 
124 |     vars_info[[i]] <- var
125 |   }
126 |   for (i in qual_OD) {
127 |     var <- vars_info[[i]]
128 |     var$type <- "qual"
129 | 
130 |     var$use_linear <- FALSE
131 |     var$use_UD <- FALSE
132 |     var$use_OD <- TRUE
133 |     var$use_LV <- FALSE
134 | 
135 |     vars_info[[i]] <- var
136 |   }
137 |   for (i in qual_both) {
138 |     var <- vars_info[[i]]
139 |     var$type <- "qual"
140 | 
141 |     var$use_linear <- FALSE
142 |     var$use_UD <- TRUE
143 |     var$use_OD <- TRUE
144 |     var$use_LV <- FALSE
145 | 
146 |     vars_info[[i]] <- var
147 |   }
148 |   for (i in quan) {
149 |     var <- vars_info[[i]]
150 |     var$type <- "quan"
151 | 
152 |     var$use_linear <- add_linear_columns
153 |     var$use_UD <- FALSE
154 |     var$use_OD <- !use_LVar
155 |     var$use_LV <- use_LVar
156 | 
157 |     vars_info[[i]] <- var
158 |   }
159 | 
160 | 
161 |   # Retypes columns
162 |   for (i in seq(nvar)) {
163 |     # For quantitative variables, holds numeric data
164 |     if (vars_info[[i]]$type == "quan") {
165 |       data_idx <- vars_info[[i]]$data_column_idx
166 |       x[, data_idx] <- as.numeric(x[, data_idx])
167 |     }
168 | 
169 |     # For qualitative variables, holds ordered or unordered factor data
170 |     if (vars_info[[i]]$type == "qual") {
171 |       data_idx <- vars_info[[i]]$data_column_idx
172 |       if (vars_info[[i]]$use_OD & !is.ordered(x[, data_idx])) {
173 |         x[, data_idx] <- ordered(x[, data_idx])
174 |       } else if (!is.factor(x[, data_idx])) {
175 |         x[, data_idx] <- factor(x[, data_idx])
176 |       }
177 |     }
178 |   }
179 | 
180 |   # Set binning informations from bins_list
181 |   if (!is.null(bins_list)) {
182 |     if (is.null(bins_names)) {
183 |       idx_list <- list()
184 |       for (i in seq(nvar)) {
185 |         v <- vars_info[[i]]
186 |         if (v$use_OD | v$use_LV) idx_list <- c(idx_list, v$idx)
187 |       }
188 | 
189 |       for (i in seq(length(bins_list))) {
190 |         idx <- idx_list[[i]]
191 |         breaks <- bins_list[[i]]
192 |         if (vars_info[[idx]]$use_OD) vars_info[[idx]]$OD_info$breaks <- unique(sort(breaks[is.finite(breaks)]))
193 |         if (vars_info[[idx]]$use_LV) vars_info[[idx]]$LV_info$breaks <- unique(sort(breaks[is.finite(breaks)]))
194 |       }
195 |     } else {
196 |       idx_map <- list()
197 |       if (all(sapply(bins_names, is.character))) {
198 |         for (i in seq(nvar)) {
199 |           v <- vars_info[[i]]
200 |           if (v$use_OD | v$use_LV) idx_map[[v$name]] <- v$idx
201 |         }
202 |       } else {
203 |         for (i in seq(nvar)) {
204 |           v <- vars_info[[i]]
205 |           if (v$use_OD | v$use_LV) idx_map[[v$idx]] <- v$idx
206 |         }
207 |       }
208 | 
209 |       for (i in seq(length(bins_list))) {
210 |         name <- bins_names[[i]]
211 |         idx <- idx_map[[name]]
212 |         breaks <- bins_list[[i]]
213 |         if (vars_info[[idx]]$use_OD) vars_info[[idx]]$OD_info$breaks <- unique(sort(breaks[is.finite(breaks)]))
214 |         if (vars_info[[idx]]$use_LV) vars_info[[idx]]$LV_info$breaks <- unique(sort(breaks[is.finite(breaks)]))
215 |       }
216 |     }
217 |   }
218 | 
219 |   # For variables using dummies, set informations of the way dummies are generated
220 |   for (i in seq(nvar)) {
221 |     if (vars_info[[i]]$use_UD) {
222 |       vars_info[[i]]$UD_info <- getUDummyMatForOneVec(x[, i], only_info=TRUE, drop_last=FALSE)
223 |     }
224 |     if (vars_info[[i]]$use_OD & is.null(vars_info[[i]]$OD_info)) {
225 |       args <- list(x_vec=x[, i], dummy_type=vars_info[[i]]$OD_type, only_info=TRUE)
226 |       if(!is.null(nbin.max)) args <- c(args, nbin.max=nbin.max)
227 |       vars_info[[i]]$OD_info <- do.call(getODummyMatForOneVec, args)
228 |     }
229 |     if (vars_info[[i]]$use_LV & is.null(vars_info[[i]]$LV_info)) {
230 |       args <- list(x_vec=x[, i], only_info=TRUE)
231 |       if(!is.null(nbin.max)) args <- c(args, nbin.max=nbin.max)
232 |       vars_info[[i]]$LV_info <- do.call(getLVarMatForOneVec, args)
233 |     }
234 |   }
235 | 
236 | 
237 |   # Append informations of interaction effects
238 |   if (add_interaction_columns & nvar > 1) {
239 |     idx <- length(vars_info)
240 |     for (i in 1:nvar) {
241 |       for (j in i:nvar) {
242 |         idx <- idx + 1
243 |         var_info1 <- vars_info[[i]]
244 |         var_info2 <- vars_info[[j]]
245 |         var_info <- list(idx=idx,
246 |                          name=paste0(var_info1$name, "_x_", var_info2$name),
247 |                          type="inter",
248 |                          var_idx1=i,
249 |                          var_idx2=j)
250 |         vars_info[[idx]] <- var_info
251 |       }
252 |     }
253 |   }
254 | 
255 |   new("AGLM_Input", vars_info=vars_info, data=x)
256 | }
257 | 
258 | 
259 | # Functions for inner use
260 | getMatrixRepresentationByVector <- function(raw_vec, var_info, drop_OD=FALSE) {
261 |   assert_that(var_info$type != "inter")
262 | 
263 |   x_vec <- raw_vec
264 |   if (var_info$extrapolation == "flat") {
265 |     breaks <- if (var_info$use_LV) var_info$LV_info$breaks else var_info$OD_info$breaks
266 |     assert_that(!is.null(breaks), msg=paste0("No breaks are found for var ", var_info$name, "."))
267 | 
268 |     # It's expected that min is the first and max is the last, but calculate them for safety.
269 |     x_vec <- pmax(pmin(x_vec, max(breaks)), min(breaks))
270 |   } else {
271 |     assert_that(var_info$extrapolation == "default", msg="extrapolation must be \"default\" or \"flat\".")
272 |   }
273 | 
274 |   z <- NULL
275 | 
276 |   if (var_info$use_linear) {
277 |     z <- matrix(x_vec, ncol=1)
278 |     colnames(z) <- var_info$name
279 |   }
280 | 
281 |   if (var_info$use_LV & !drop_OD) {
282 |     z_LV <- getLVarMatForOneVec(x_vec, breaks=var_info$LV_info$breaks)$dummy_mat
283 |     if (!is.null(z_LV)) {
284 |       colnames(z_LV) <- paste0(var_info$name, "_LV_", seq(dim(z_LV)[2]))
285 |       z <- cbind(z, z_LV)
286 |     }
287 |   }
288 | 
289 |   if (var_info$use_OD & !drop_OD) {
290 |     z_OD <- getODummyMatForOneVec(x_vec, breaks=var_info$OD_info$breaks, dummy_type=var_info$OD_type)$dummy_mat
291 |     if (!is.null(z_OD)) {
292 |       colnames(z_OD) <- paste0(var_info$name, "_OD_", seq(dim(z_OD)[2]))
293 |       z <- cbind(z, z_OD)
294 |     }
295 |   }
296 | 
297 |   if (var_info$use_UD) {
298 |     z_UD <- getUDummyMatForOneVec(x_vec, levels=var_info$UD_info$levels,
299 |                                   drop_last=var_info$UD_info$drop_last)$dummy_mat
300 |     if (!is.null(z_UD)) {
301 |       colnames(z_UD) <- paste0(var_info$name, "_UD_", seq(dim(z_UD)[2]))
302 |       z <- cbind(z, z_UD)
303 |     }
304 |   }
305 | 
306 |   return(z)
307 | }
308 | 
309 | 
310 | #' @importFrom assertthat assert_that
311 | getMatrixRepresentation <- function(x, idx, drop_OD=FALSE) {
312 |   var_info <- x@vars_info[[idx]]
313 |   z <- NULL
314 | 
315 |   if (var_info$type == "quan" | var_info$type == "qual") {
316 |     z_raw <- x@data[, var_info$data_column_idx]
317 |     z <- getMatrixRepresentationByVector(z_raw, var_info, drop_OD)
318 |   } else if (var_info$type == "inter") {
319 |     # Interaction effects between columns of one variable itself
320 |     self_interaction <- var_info$var_idx1 == var_info$var_idx2
321 | 
322 |     # Get matrix representations of two variables
323 |     z1 <- getMatrixRepresentation(x, var_info$var_idx1, drop_OD=TRUE)
324 |     z2 <- getMatrixRepresentation(x, var_info$var_idx2, drop_OD=TRUE)
325 |     if (is.null(z1) | is.null(z2))
326 |       return(NULL)
327 | 
328 |     # Create matrix representation of intarction
329 |     nrow <- dim(z1)[1]
330 |     ncol1 <- dim(z1)[2]
331 |     ncol2 <- dim(z2)[2]
332 |     ncol_res <- ifelse(self_interaction, ncol1 * (ncol1 - 1) / 2, ncol1 * ncol2)
333 |     assert_that(dim(z2)[1] == nrow)
334 |     z <- matrix(0, nrow, ncol_res)
335 |     nm <- character(ncol_res)
336 |     ij <- 0
337 |     for (i in 1:ncol1) {
338 |       js <- 1:ncol2
339 |       if (self_interaction)
340 |         js <- js[js > i]
341 |       for (j in js) {
342 |         ij <- ij + 1
343 |         z[, ij] <- z1[, i] * z2[, j]
344 |         nm[ij] <- paste0(var_info$name, "_", i, "_", j)
345 |       }
346 |     }
347 |     colnames(z) <- nm
348 |   } else {
349 |     assert_that(FALSE)  # never expects to come here
350 |   }
351 | 
352 |   return(z)
353 | }
354 | 
355 | 
356 | #' @importFrom assertthat assert_that
357 | getDesignMatrix <- function(x) {
358 |   # Check arguments
359 |   assert_that(class(x) == "AGLM_Input")
360 | 
361 |   # Data size
362 |   nobs <- dim(x@data)[1]
363 |   nvar <- length(x@vars_info)
364 |   x_mat <- NULL
365 | 
366 |   for (i in 1:nvar) {
367 |     z <- getMatrixRepresentation(x, i)
368 |     if (i == 1) x_mat <- z
369 |     else x_mat <- cbind(x_mat, z)
370 |   }
371 | 
372 |   return(x_mat)
373 | }
374 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------