├── .gitignore ├── MATLAB ├── README.md └── knockoffs_matlab │ ├── +knockoffs │ ├── +create │ │ ├── Contents.m │ │ ├── divideSDP.m │ │ ├── fixed.m │ │ ├── fixed_MinCondCov.m │ │ ├── fixed_SDP.m │ │ ├── fixed_equi.m │ │ ├── gaussian.m │ │ ├── gaussian_sample.m │ │ ├── solveASDP.m │ │ ├── solveEqui.m │ │ └── solveSDP.m │ ├── +private │ │ ├── Contents.m │ │ ├── canonicalSVD.m │ │ ├── cvxBuild.m │ │ ├── decompose.m │ │ └── normc.m │ ├── +stats │ │ ├── +private │ │ │ ├── Contents.m │ │ │ ├── forwardSelection.m │ │ │ ├── forwardSelectionOMP.m │ │ │ ├── forwardSelectionSlow.m │ │ │ ├── forwardSelectionSlowOMP.m │ │ │ ├── lassoMaxLambda.m │ │ │ ├── lassoMaxLambda_binom.m │ │ │ ├── lassoMaxLambda_probit.m │ │ │ └── sequentialfs.m │ │ ├── Contents.m │ │ ├── SqrtLassoIterative_WebPage.m │ │ ├── bvsProbDiff.m │ │ ├── bvsProbDiff2.m │ │ ├── forwardSelection.m │ │ ├── forwardSelectionOMP.m │ │ ├── lassoCoefDiff.m │ │ ├── lassoCoefDiff_bin.m │ │ ├── lassoCoefDiff_bin_refit.m │ │ ├── lassoLambdaDifference.m │ │ ├── lassoLambdaDifference_bin.m │ │ ├── lassoLambdaSignedMax.m │ │ ├── lassoLambdaSignedMax_bin.m │ │ ├── olsCoefDiff.m │ │ ├── randomForest.m │ │ ├── ridgeCoefDiff.m │ │ ├── ridgeCoefDiff_bin.m │ │ ├── sqrtLassoCoefDiff.m │ │ ├── stabilitySignedMax.m │ │ └── stabilitySignedMax_bin.m │ ├── +tests │ │ ├── CreateASDPTest.m │ │ ├── CreateEquiTest.m │ │ ├── CreateGaussian.m │ │ ├── CreateSDPTest.m │ │ ├── FSBenchmark.m │ │ ├── FilterTest.m │ │ ├── ForwardSelectionTest.m │ │ ├── KnockoffTestCase.m │ │ └── LassoTest.m │ ├── Contents.m │ ├── create.m │ ├── filter.m │ ├── select.m │ └── threshold.m │ ├── LICENSE.txt │ ├── README.txt │ └── examples │ ├── examples_advanced.m │ ├── examples_basic.m │ └── examples_fixed.m ├── R ├── README.md └── knockoff │ ├── .Rbuildignore │ ├── DESCRIPTION │ ├── NAMESPACE │ ├── NEWS │ ├── R │ ├── create_fixed.R │ ├── create_gaussian.R │ ├── create_second_order.R │ ├── knockoff.R │ ├── knockoff_filter.R │ ├── solve_asdp.R │ ├── solve_equi.R │ ├── 
solve_sdp.R │ ├── stats_forward_selection.R │ ├── stats_glmnet.R │ ├── stats_glmnet_cv.R │ ├── stats_lasso.R │ ├── stats_lasso_bin.R │ ├── stats_lasso_cv.R │ ├── stats_lasso_cv_bin.R │ ├── stats_random_forest.R │ ├── stats_sqrt_lasso.R │ ├── stats_stability_selection.R │ └── util.R │ ├── knockoff.Rproj │ ├── man │ ├── create.fixed.Rd │ ├── create.gaussian.Rd │ ├── create.second_order.Rd │ ├── create.solve_asdp.Rd │ ├── create.solve_equi.Rd │ ├── create.solve_sdp.Rd │ ├── create_equicorrelated.Rd │ ├── create_sdp.Rd │ ├── decompose.Rd │ ├── divide.sdp.Rd │ ├── fs.Rd │ ├── knockoff.Rd │ ├── knockoff.filter.Rd │ ├── knockoff.threshold.Rd │ ├── lasso_max_lambda.Rd │ ├── merge.clusters.Rd │ ├── print.knockoff.result.Rd │ ├── stability_selection_importance.Rd │ ├── stat.forward_selection.Rd │ ├── stat.glmnet_coefdiff.Rd │ ├── stat.glmnet_lambdadiff.Rd │ ├── stat.glmnet_lambdasmax.Rd │ ├── stat.lasso_coefdiff.Rd │ ├── stat.lasso_coefdiff_bin.Rd │ ├── stat.lasso_lambdadiff.Rd │ ├── stat.lasso_lambdadiff_bin.Rd │ ├── stat.lasso_lambdasmax.Rd │ ├── stat.lasso_lambdasmax_bin.Rd │ ├── stat.random_forest.Rd │ ├── stat.sqrt_lasso.Rd │ ├── stat.stability_selection.Rd │ ├── vectorize_matrix.Rd │ └── verify_stat_depends.Rd │ ├── tests │ ├── testthat.R │ └── testthat │ │ ├── test_create.R │ │ ├── test_filter.R │ │ ├── test_stats.R │ │ └── test_util.R │ └── vignettes │ ├── advanced.Rmd │ ├── fixed.Rmd │ ├── hiv.Rmd │ ├── knockoff.Rmd │ └── references.bib └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | *.DS_Store 4 | .Rproj.user 5 | .Rhistory 6 | *.tar.gz 7 | *.zip 8 | *.o 9 | *.so 10 | *.pdf 11 | *.html 12 | -------------------------------------------------------------------------------- /MATLAB/README.md: -------------------------------------------------------------------------------- 1 | The Knockoff Filter for MATLAB 2 | ========================== 3 | 4 | This package provides a versatile MATLAB 
interface to the knockoff methodology. 5 | 6 | # Installation 7 | 8 | ## Requirements 9 | 10 | Basic software requirements: 11 | 12 | - MATLAB 2012 or later 13 | - Statistics Toolbox 14 | - CVX (optional; required for creating SDP knockoffs) 15 | 16 | Warning: your CVX release must be from April 2014 or later. Earlier 17 | versions of CVX contain a serious bug. 18 | 19 | Additional software requirements: 20 | 21 | - glmnet for MATLAB 22 | 23 | Many statistics used by the knockoff filter are computed with glmnet, 24 | which can be downloaded from: 25 | http://web.stanford.edu/~hastie/glmnet_matlab/download.html 26 | 27 | If you have not already installed the glmnet package for MATLAB, please download it using the link provided above 28 | and unpack the archive into the current directory. 29 | This should create a subdirectory named "glmnet_matlab". 30 | 31 | ## Installation from source 32 | 33 | You can install the latest development version by cloning this repository, or downloading it as an archive, and then loading the source code from MATLAB. 34 | 35 | To install the package, save the directory "knockoffs_matlab" of this repository anywhere on your machine, say to `<path>`, then add the lines 36 | ```Matlab 37 | addpath('<path>/knockoffs_matlab') 38 | addpath('<path>/knockoffs_matlab/glmnet_matlab') 39 | ``` 40 | to your MATLAB startup script. 41 | To test your installation, you can run one of the demo files in the "examples" subdirectory. Alternatively, if you have MATLAB 2013a or newer, you can execute the test suite by typing 42 | ```Matlab 43 | runtests('knockoffs.tests') 44 | ``` 45 | 46 | in your MATLAB command window. All the tests should pass.
47 | 48 | ## Documentation 49 | 50 | For an overview of the functions in this package, run 51 | 52 | ```Matlab 53 | help knockoffs 54 | ``` 55 | 56 | For a list of included knockoff statistics, run 57 | 58 | ```Matlab 59 | help knockoffs.stats 60 | ``` 61 | 62 | Besides the documentation associated with individual functions 63 | (accessible via the "help" function), the main source of documentation 64 | is the collection of examples in the "examples" subdirectory. 65 | 66 | The file "examples_basic.m" is a good place to start. 67 | 68 | ## Resources 69 | For more information, visit: https://web.stanford.edu/group/candes/knockoffs 70 | 71 | ## Credits 72 | 73 | This package was developed by Matteo Sesia, Lucas Janson, Emmanuel Candès, Yingying Fan and Jinchi Lv. 74 | 75 | An earlier version of this package was developed by Evan Patterson, based on code originally written by Rina Foygel Barber, Emmanuel Candès and Evan Patterson: https://bitbucket.org/epatters/knockoff-filter. 76 | 77 | ## License 78 | 79 | This software is distributed under the [GPLv3 license](https://www.gnu.org/licenses/gpl-3.0.en.html) and comes with ABSOLUTELY NO WARRANTY.
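
Beyond running the shipped demos, a quick end-to-end smoke test can be performed from the command window. The sketch below is illustrative, not part of the package: the synthetic data and parameter values are made-up assumptions, and only the call signature of `knockoffs.create.gaussian` (documented in `+knockoffs/+create`) is taken from this repository; `mvnrnd` requires the Statistics Toolbox.

```Matlab
% Illustrative smoke test: sample Model-X Gaussian knockoffs for a
% small synthetic design (data and parameter choices are made up).
n = 200; p = 20;
mu = zeros(1, p);
Sigma = 0.3*ones(p) + 0.7*eye(p);   % toy covariance matrix (assumption)
X = mvnrnd(mu, Sigma, n);           % requires the Statistics Toolbox
X_k = knockoffs.create.gaussian(X, mu, Sigma, 'equi');
disp(size(X_k))                     % knockoffs have the same size as X
```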
-------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/Contents.m: -------------------------------------------------------------------------------- 1 | % +CREATE 2 | % 3 | % Files 4 | % gaussian.m - Samples Model-X Gaussian knockoffs 5 | % gaussian_sample.m - Samples Model-X Gaussian knockoffs, given a pre-computed diagonal matrix 'diag_s' 6 | % solveEqui.m - Computes the diagonal matrix 'diag_s', for Gaussian equi-correlated knockoffs 7 | % solveSDP.m - Computes the diagonal matrix 'diag_s', for Gaussian SDP knockoffs 8 | % solveASDP.m - Computes an approximation of the diagonal matrix 'diag_s', for Gaussian SDP knockoffs 9 | % fixed.m - Samples Fixed-X knockoffs 10 | % fixed_equi.m - Computes the diagonal matrix 'diag_s', for Fixed-X equi-correlated knockoffs 11 | % fixed_SDP.m - Computes the diagonal matrix 'diag_s', for Fixed-X SDP knockoffs 12 | % fixed_MinCondCov.m - Computes the diagonal matrix 'diag_s', for Fixed-X SDP knockoffs (different SDP) 13 | % divideSDP.m - Approximate a covariance matrix by a block diagonal matrix 14 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/divideSDP.m: -------------------------------------------------------------------------------- 1 | function [clusters_sdp,subSigma] = divideSDP(Sigma) 2 | %KNOCKOFFS.CREATE.DIVIDESDP Approximate a covariance matrix by a block diagonal matrix, 3 | % in order to efficiently construct approximate SDP knockoffs. 4 | % 5 | % This function is used to create approximate SDP knockoffs. 6 | % The full covariance matrix is approximated by a block diagonal matrix 7 | % constructed by clustering the columns of the original covariance matrix. 8 | % The clusters are created from a single linkage dendrogram by joining the 9 | % leaves greedily in such a way that no cluster contains more than 10 10 | % percent of all variables.
11 | % 12 | % [clusters_sdp,subSigma] = KNOCKOFFS.CREATE.DIVIDESDP(Sigma) 13 | % 14 | % Inputs: 15 | % Sigma - p x p covariance matrix for the marginal distribution of X 16 | % 17 | % Outputs: 18 | % clusters_sdp - a vector of length p assigning each variable to a cluster 19 | % subSigma - a cell array of smaller covariance matrices 20 | % See also KNOCKOFFS.CREATE.SOLVEASDP, KNOCKOFFS.CREATE.SOLVESDP 21 | 22 | p = length(Sigma); 23 | 24 | % Parameters for the max-size clustering algorithm 25 | linkmeth = 'single'; %average, complete 26 | maxclust = floor(p/10); 27 | 28 | % Compute the clustering dendrogram 29 | Z = linkage(1-abs(Sigma(tril(true(p),-1)))',linkmeth); 30 | 31 | % Create clusters adaptively from the dendrogram, making sure that no 32 | % cluster contains more than 'maxclust' elements 33 | clusters_sdp = (1:p)'; 34 | clustersizes = zeros(max(max(Z(:,1:2))),1); 35 | clustersizes(1:p) = 1; 36 | for j = 1:size(Z,1) 37 | if sum(clustersizes(Z(j,1:2)))<=maxclust 38 | clusters_sdp(ismember(clusters_sdp,Z(j,1:2))) = p+j; 39 | clustersizes(p+j) = sum(clustersizes(Z(j,1:2))); 40 | clustersizes(Z(j,1:2)) = 0; 41 | end 42 | end 43 | uclusters_sdp = unique(clusters_sdp); 44 | nclust_sdp = length(uclusters_sdp); 45 | 46 | % Create the block matrices and rename the unique clusters 47 | maxsubp = 0; 48 | subSigma = cell(1,nclust_sdp); 49 | for ksub = 1:nclust_sdp 50 | k = uclusters_sdp(ksub); 51 | k_indices = clusters_sdp==k; 52 | subp = sum(k_indices); 53 | maxsubp = max(maxsubp,subp); 54 | subSigma{ksub} = Sigma(k_indices,k_indices); 55 | clusters_sdp(k_indices) = ksub; 56 | end 57 | 58 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/fixed.m: -------------------------------------------------------------------------------- 1 | function X_k = fixed(X, method, randomize) 2 | %KNOCKOFFS.CREATE.FIXED Creates fixed-X knockoffs.
3 | % 4 | % X_k = KNOCKOFFS.CREATE.FIXED(X) 5 | % X_k = KNOCKOFFS.CREATE.FIXED(X, method) 6 | % X_k = KNOCKOFFS.CREATE.FIXED(X, method, randomize) 7 | % 8 | % Inputs: 9 | % X - n x p scaled covariate matrix 10 | % method - either 'equi' (for equi-correlated knockoffs), 'sdp' 11 | % (for knockoffs optimized using semi-definite programming), 12 | % 'asdp' (for approximate SDP knockoffs) or 'mincondcov' (for knockoffs minimizing conditional covariances) 13 | % Default: 'sdp' 14 | % randomize - whether to use randomization in the construction of the 15 | % knockoff variables. 16 | % 17 | % Outputs: 18 | % X_k - n x p matrix of knockoff variables 19 | % 20 | % See also KNOCKOFFS.CREATE.GAUSSIAN 21 | 22 | if ~exist('method', 'var') || isempty(method), method = 'sdp'; end; 23 | if ~exist('randomize', 'var'), randomize = []; end; 24 | 25 | % Create the knockoffs 26 | method = lower(method); 27 | switch method 28 | case 'equi' 29 | X_k = knockoffs.create.fixed_equi(X, randomize); 30 | case 'sdp' 31 | X_k = knockoffs.create.fixed_SDP(X, randomize, false); 32 | case 'asdp' 33 | X_k = knockoffs.create.fixed_SDP(X, randomize, true); 34 | case 'mincondcov' 35 | X_k = knockoffs.create.fixed_MinCondCov(X, randomize); 36 | otherwise 37 | error('Invalid Fixed-X knockoff creation method %s', method) 38 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/fixed_MinCondCov.m: -------------------------------------------------------------------------------- 1 | function X_ko = fixed_MinCondCov(X, randomize) 2 | % KNOCKOFFS.CREATE.FIXED_MINCONDCOV Create knockoff variables using an SDP 3 | % X_ko = KNOCKOFFS.CREATE.FIXED_MINCONDCOV(X) create knockoffs deterministically 4 | % X_ko = KNOCKOFFS.CREATE.FIXED_MINCONDCOV(X, true) create knockoffs with randomization 5 | % 6 | % Creates knockoff variables using semi-definite programming (SDP) to 7 | % minimize conditional covariances.
8 | % 9 | % Inputs: 10 | % X - n x p scaled data matrix (n >= 2*p) 11 | % randomize - whether to use randomization in the construction of 12 | % the knockoff variables 13 | % 14 | % Outputs: 15 | % X_ko - n x p knockoff variable matrix 16 | % 17 | % See also KNOCKOFFS.CREATE.FIXED_EQUI, KNOCKOFFS.CREATE.FIXED_SDP. 18 | 19 | % Validate inputs. 20 | if ~exist('randomize', 'var'), randomize = []; end 21 | 22 | % Validate CVX version. 23 | % (Build 1079 was released on Apr 23, 2014). 24 | if ~exist('cvx_begin', 'file') 25 | error('knockoff:MissingCVX', ... 26 | 'CVX is not installed. To use SDP knockoffs, please install CVX.') 27 | elseif knockoffs.private.cvxBuild() < 1079 28 | error('knockoff:OldCVX', ... 29 | 'CVX is too old. To use MinCondCov knockoffs, please upgrade CVX.') 30 | end 31 | 32 | % Compute SVD and U_perp. 33 | [~,S,V,U_perp] = knockoffs.private.decompose(X, randomize); 34 | 35 | % Check for rank deficiency. 36 | tol = 1e-5; 37 | S_inv = 1 ./ diag(S); 38 | S_zeros = diag(S) <= tol*max(diag(S)); 39 | if any(S_zeros) 40 | warning('knockoff:RankDeficiency', ... 41 | ['Data matrix is rank deficient. ' ... 42 | 'Model is not identifiable, but proceeding with MinCondCov knockoffs.']) 43 | S_inv(S_zeros) = 0; 44 | end 45 | S_inv = diag(S_inv); 46 | 47 | % Compute the Gram matrix X'*X, conditional covariances, and its (pseudo)inverse. 48 | G = V * sparse(S.^2) * V'; 49 | sc = nan(size(G,1),1); 50 | for i = 1:size(G,1) 51 | noti = setdiff(1:size(G,1),i); 52 | G11 = G(i,i)*[1 1;1 1]; 53 | G12 = repmat(G(i,noti),2,1); 54 | G22 = G(noti,noti); 55 | SC = G11 - G12*(G22\G12'); 56 | sc(i) = SC(1,2); 57 | end 58 | G_inv = V * sparse(S_inv.^2) * V'; 59 | 60 | % Optimize the parameter s of Equation 1.3 according to the SDP 61 | % minimization problem of Equation 2.5. 
62 | s = solveMinCondCov(G,sc); 63 | s(s <= tol) = 0; 64 | diag_s = sparse(diag(s)); 65 | 66 | % Construct the knockoff according to Equation 1.4: 67 | % X_ko = X(I - (X'X)^{-1} * s) + U_perp * C 68 | % where 69 | % C'C = 2s - s * (X'X)^{-1} * s. 70 | [~,D,V] = knockoffs.private.canonicalSVD(2*diag_s - diag_s*G_inv*diag_s); 71 | d = sqrt(max(0, diag(D))); 72 | diag_d = sparse(diag(d)); 73 | X_ko = X - X * G_inv * diag_s + U_perp * diag_d * V'; 74 | 75 | end 76 | 77 | function y = solveMinCondCov(G,sc) %#ok 78 | % Solves 79 | % 80 | % minimize ||y - sc||_1 81 | % subject to 0 <= y <= 1 82 | % [G , G - diag(y); G - diag(y) , G] >= 0 83 | % 84 | % The LMI is equivalent to 2*G - diag(y) >= 0 and y >= 0 85 | % 86 | % Using CVX, we solve this via the dual SDP. 87 | 88 | p = length(G); 89 | 90 | cvx_begin quiet 91 | variable y(p); 92 | minimize(norm(y-sc,1)) 93 | 2*G-diag(y) == semidefinite(p); %#ok 94 | 0 <= y <= 1 %#ok 95 | cvx_end 96 | 97 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/fixed_SDP.m: -------------------------------------------------------------------------------- 1 | function X_ko = fixed_SDP(X, randomize, approximate) 2 | % KNOCKOFFS.CREATE.FIXED_SDP Create knockoff variables using SDP 3 | % X_ko = KNOCKOFFS.CREATE.FIXED_SDP(X) create knockoffs deterministically 4 | % X_ko = KNOCKOFFS.CREATE.FIXED_SDP(X, true) create knockoffs with randomization 5 | % X_ko = KNOCKOFFS.CREATE.FIXED_SDP(X, false, true) create approximate SDP knockoffs 6 | % 7 | % Creates knockoff variables using semi-definite programming (SDP).
8 | % 9 | % Inputs: 10 | % X - n x p scaled data matrix (n >= 2*p) 11 | % randomize - whether to use randomization in the construction of 12 | % the knockoff variables (default: false) 13 | % approximate - whether to use the approximate SDP construction 14 | % (default: false) 15 | % 16 | % Outputs: 17 | % X_ko - n x p knockoff variable matrix 18 | % 19 | % See also KNOCKOFFS.CREATE.FIXED_EQUI, KNOCKOFFS.CREATE.FIXED_MINCONDCOV. 20 | 21 | % Validate inputs. 22 | if ~exist('randomize', 'var'), randomize = []; end 23 | if ~exist('approximate', 'var'), approximate = false; end 24 | 25 | % Validate CVX version. 26 | % (Build 1079 was released on Apr 23, 2014). 27 | if ~exist('cvx_begin', 'file') 28 | error('knockoff:MissingCVX', ... 29 | 'CVX is not installed. To use SDP knockoffs, please install CVX.') 30 | elseif knockoffs.private.cvxBuild() < 1079 31 | error('knockoff:OldCVX', ... 32 | 'CVX is too old. To use SDP knockoffs, please upgrade CVX.') 33 | end 34 | 35 | % Compute SVD and U_perp. 36 | [~,S,V,U_perp] = knockoffs.private.decompose(X, randomize); 37 | 38 | % Check for rank deficiency. 39 | tol = 1e-5; 40 | S_inv = 1 ./ diag(S); 41 | S_zeros = diag(S) <= tol*max(diag(S)); 42 | if any(S_zeros) 43 | warning('knockoff:RankDeficiency', ... 44 | ['Data matrix is rank deficient. ' ... 45 | 'Model is not identifiable, but proceeding with SDP knockoffs.']) 46 | S_inv(S_zeros) = 0; 47 | end 48 | S_inv = diag(S_inv); 49 | 50 | % Compute the Gram matrix X'*X and its (pseudo)inverse. 51 | G = V * sparse(S.^2) * V'; 52 | G_inv = V * sparse(S_inv.^2) * V'; 53 | 54 | % Optimize the parameter s of Equation 1.3 according to the SDP 55 | % minimization problem of Equation 2.5.
56 | if approximate 57 | diag_s = sparse(diag(knockoffs.create.solveASDP(G))); 58 | else 59 | diag_s = sparse(diag(knockoffs.create.solveSDP(G))); 60 | end 61 | 62 | % Construct the knockoff according to Equation 1.4: 63 | % X_ko = X(I - (X'X)^{-1} * s) + U_perp * C 64 | % where 65 | % C'C = 2s - s * (X'X)^{-1} * s. 66 | [~,D,V] = knockoffs.private.canonicalSVD(2*diag_s - diag_s*G_inv*diag_s); 67 | d = sqrt(max(0, diag(D))); 68 | diag_d = sparse(diag(d)); 69 | X_ko = X - X * G_inv * diag_s + U_perp * diag_d * V'; 70 | 71 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/fixed_equi.m: -------------------------------------------------------------------------------- 1 | function X_ko = fixed_equi(X, randomize) 2 | % KNOCKOFFS.CREATE.FIXED_EQUI Create equi-correlated knockoff variables for a fixed design 3 | % X_ko = KNOCKOFFS.CREATE.FIXED_EQUI(X) create knockoffs deterministically 4 | % X_ko = KNOCKOFFS.CREATE.FIXED_EQUI(X, true) create knockoffs with randomization 5 | % 6 | % Inputs: 7 | % X - n x p scaled data matrix (n >= 2*p) 8 | % randomize - whether to use randomization in the construction of 9 | % the knockoff variables 10 | % 11 | % Outputs: 12 | % X_ko - n x p knockoff variable matrix 13 | % 14 | % See also KNOCKOFFS.CREATE.FIXED_SDP, KNOCKOFFS.CREATE.FIXED_MINCONDCOV. 15 | 16 | if ~exist('randomize', 'var'), randomize = []; end 17 | 18 | % Compute SVD and U_perp. 19 | [U,S,V,U_perp] = knockoffs.private.decompose(X, randomize); 20 | 21 | % Set s = min(2 * smallest eigenvalue of X'X, 1), so that the 22 | % correlations between variables and their knockoffs all equal X_j'*X_ko_j = 1 - s. 23 | if any(diag(S) <= 1e-5 * max(diag(S))) 24 | error('knockoff:RankDeficiency', ... 25 | ['Data matrix is rank deficient. ' ... 26 | 'Equicorrelated knockoffs will have no power. '
27 | 'If you must proceed, use SDP knockoffs instead.']) 28 | end 29 | lambda_min = min(diag(S))^2; 30 | s = min(2*lambda_min, 1); 31 | 32 | % Construct the knockoff according to Equation 1.4: 33 | % X_ko = X(I - (X'X)^{-1} * s) + U_perp * C 34 | % where 35 | % C'C = 2s - s * (X'X)^{-1} * s. 36 | X_ko = U * sparse(diag(diag(S) - s./diag(S))) * V' + ... 37 | U_perp * sparse(diag(sqrt(2*s - s^2./diag(S).^2))) * V'; 38 | X_ko = real(X_ko); 39 | 40 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/gaussian.m: -------------------------------------------------------------------------------- 1 | function X_k = gaussian(X, mu, Sigma, method) 2 | %KNOCKOFFS.CREATE.GAUSSIAN Samples Model-X multivariate normal knockoffs according 3 | % to the classical regression formulas, after computing the diagonal 4 | % matrix 'diag_s' according to the specified method. 5 | % 6 | % X_k = KNOCKOFFS.CREATE.GAUSSIAN(X, mu, Sigma) 7 | % X_k = KNOCKOFFS.CREATE.GAUSSIAN(X, mu, Sigma, method) 8 | % 9 | % Inputs: 10 | % X - n x p scaled data matrix 11 | % mu - 1 x p mean vector for the marginal distribution of X 12 | % Sigma - p x p covariance matrix for the marginal distribution of X 13 | % 14 | % method - either 'equi' (for equi-correlated knockoffs), 'sdp' 15 | % (for knockoffs optimized using semi-definite programming) or 16 | % 'asdp' (for approximate SDP knockoffs) 17 | % Default: 'sdp' 18 | % Outputs: 19 | % X_k - n x p matrix of knockoff variables 20 | % 21 | % See also KNOCKOFFS.CREATE.GAUSSIAN_SAMPLE, KNOCKOFFS.CREATE.FIXED 22 | 23 | if ~exist('method', 'var') || isempty(method), method = 'sdp'; end; 24 | 25 | % Compute the diagonal matrix diag_s 26 | method = lower(method); 27 | switch method 28 | case 'equi' 29 | diag_s = sparse(diag(knockoffs.create.solveEqui(Sigma))); 30 | case 'sdp' 31 | diag_s = sparse(diag(knockoffs.create.solveSDP(Sigma))); 32 | case
'asdp' 33 | diag_s = sparse(diag(knockoffs.create.solveASDP(Sigma))); 34 | otherwise 35 | error('Invalid Model-X Gaussian knockoff creation method %s', method) 36 | end 37 | 38 | % Sample the knockoffs 39 | X_k = knockoffs.create.gaussian_sample(X, mu, Sigma, diag_s); 40 | 41 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/gaussian_sample.m: -------------------------------------------------------------------------------- 1 | function X_k = gaussian_sample(X, mu, Sigma, diag_s) 2 | %KNOCKOFFS.CREATE.GAUSSIAN_SAMPLE Samples Model-X multivariate normal knockoffs according 3 | % to the classical regression formulas, for a pre-computed diagonal matrix 4 | % 'diag_s'. 5 | % 6 | % X_k = KNOCKOFFS.CREATE.GAUSSIAN_SAMPLE(X, mu, Sigma, diag_s) 7 | % 8 | % Inputs: 9 | % X - n x p scaled data matrix 10 | % mu - 1 x p mean vector for the marginal distribution of X 11 | % Sigma - p x p covariance matrix for the marginal distribution of X 12 | % diag_s - p x p diagonal matrix (equation 3.2) 13 | % 14 | % Outputs: 15 | % X_k - n x p matrix of knockoff variables 16 | % 17 | % See also KNOCKOFFS.CREATE.GAUSSIAN 18 | 19 | [n,p] = size(X); 20 | 21 | % Compute the inverse covariance matrix of the original variables and 22 | % multiply it by the diagonal matrix diag_s 23 | SigmaInv_s = Sigma\diag_s; 24 | 25 | % Compute mean and covariance of the knockoffs 26 | mu_k = X-(X-repmat(mu,n,1))*SigmaInv_s; 27 | Sigma_k = 2*diag_s - diag_s*SigmaInv_s; 28 | 29 | % Sample the knockoffs 30 | X_k = mu_k + randn(n,p)*chol(Sigma_k); 31 | 32 | end 33 | 34 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/solveASDP.m: -------------------------------------------------------------------------------- 1 | function s = solveASDP(Sigma, parallel) 2 | %KNOCKOFFS.CREATE.SOLVEASDP Computes the diagonal matrix 'diag_s', used to 3 | % create Model-X or
Fixed-X approximate SDP knockoffs. 4 | % 5 | % This function approximates the covariance matrix Sigma with a 6 | % block-diagonal matrix constructed by clustering its columns. 7 | % The clusters are created from a single linkage dendrogram by joining the 8 | % leaves greedily in such a way that no cluster contains more than 10 9 | % percent of all variables. 10 | % 11 | % Then, the approximate SDP problem factors into a number of independent 12 | % subproblems that can be efficiently solved in parallel. 13 | % 14 | % s = KNOCKOFFS.CREATE.SOLVEASDP(Sigma) 15 | % s = KNOCKOFFS.CREATE.SOLVEASDP(Sigma, parallel) 16 | % 17 | % Inputs: 18 | % Sigma - p x p covariance matrix for the marginal distribution of X 19 | % parallel - whether to solve the subproblems in parallel (default: false) 20 | % 21 | % Outputs: 22 | % s - a vector of length p 23 | % 24 | % See also KNOCKOFFS.CREATE.SOLVESDP, KNOCKOFFS.CREATE.SOLVEEQUI, 25 | % KNOCKOFFS.CREATE.DIVIDESDP, KNOCKOFFS.CREATE.GAUSSIAN, KNOCKOFFS.CREATE.FIXED 26 | 27 | p = length(Sigma); 28 | if ~exist('parallel', 'var') || isempty(parallel), parallel = false; end; 29 | 30 | 31 | % Approximate the covariance matrix with a block diagonal matrix 32 | [clusters_sdp,subSigma] = knockoffs.create.divideSDP(Sigma); 33 | 34 | % Create the smaller SDP problems and solve them, possibly in parallel 35 | nclust_sdp = length(subSigma); 36 | sub_diag_s = cell(1,nclust_sdp); 37 | if parallel 38 | parfor ksub = 1:nclust_sdp 39 | sub_diag_s{ksub} = knockoffs.create.solveSDP(subSigma{ksub}); 40 | end 41 | else 42 | for ksub = 1:nclust_sdp 43 | sub_diag_s{ksub} = knockoffs.create.solveSDP(subSigma{ksub}); 44 | end 45 | end 46 | 47 | % Put the results of all subproblems back together 48 | s_asdp = nan(p,1); 49 | for ksub = 1:nclust_sdp 50 | s_asdp(clusters_sdp==ksub) = sub_diag_s{ksub}; 51 | end 52 | 53 | % Find the optimal shrinking factor to ensure positive-definiteness 54 | % via binary search 55 | iterations = 20; 56 | gamma_sdp = 1/2; 57 | for j =
2:iterations 58 | [~,psd] = chol(2*Sigma-gamma_sdp*diag(s_asdp)); 59 | if psd==0 60 | gamma_sdp = gamma_sdp + 1/2^j; 61 | else 62 | gamma_sdp = gamma_sdp - 1/2^j; 63 | end 64 | end 65 | [~,psd]=chol(2*Sigma-gamma_sdp*diag(s_asdp)); 66 | if psd~=0 67 | gamma_sdp = gamma_sdp - 1/2^j; 68 | end 69 | 70 | % Shrink the solution 71 | s = gamma_sdp*s_asdp; 72 | 73 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+create/solveEqui.m: -------------------------------------------------------------------------------- 1 | function s = solveEqui(Sigma) 2 | %KNOCKOFFS.CREATE.SOLVEEQUI Computes the diagonal matrix 'diag_s', used to sample 3 | % Model-X and Fixed-X equi-correlated knockoffs. 4 | % 5 | % s = KNOCKOFFS.CREATE.SOLVEEQUI(Sigma) 6 | % 7 | % Inputs: 8 | % Sigma - p x p covariance matrix for the marginal distribution of X 9 | % 10 | % Outputs: 11 | % s - a vector of length p 12 | % 13 | % See also KNOCKOFFS.CREATE.SOLVESDP, KNOCKOFFS.CREATE.SOLVEASDP, 14 | % KNOCKOFFS.CREATE.GAUSSIAN, KNOCKOFFS.CREATE.FIXED 15 | 16 | % Convert the covariance matrix into a correlation matrix 17 | [scaleSigma, corrSigma] = cov2corr(Sigma); 18 | 19 | opts.isreal = true; 20 | opts.tol = 1e-6; 21 | lambda_min = eigs(corrSigma,1,'sm',opts); 22 | s = ones(length(corrSigma),1) * min(2*lambda_min, min(diag(corrSigma))); 23 | 24 | % Compensate for numerical errors 25 | psd = 1; 26 | s_eps = 1e-8; 27 | while psd~=0 28 | % Compute knockoff conditional covariance matrix 29 | diag_s = sparse(diag(s.*(1-s_eps))); 30 | SigmaInv_s = corrSigma\diag_s; 31 | Sigma_k = 2*diag_s - diag_s*SigmaInv_s; 32 | [~,psd] = chol(Sigma_k); 33 | s_eps = s_eps*10; 34 | end 35 | s = s.*(1-s_eps/10); 36 | s(s < 0) = 0; 37 | 38 | % Scale back the results for a covariance matrix 39 | s = s(:) .* (scaleSigma(:).^2); 40 | 41 | end --------------------------------------------------------------------------------
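
For intuition about the equi-correlated construction in solveEqui above, the solution is analytic when the correlation matrix has a constant off-diagonal entry: its smallest eigenvalue is then 1 - rho, so every entry of s equals min(2*(1 - rho), 1) up to the small numerical-safety shrinkage. The sketch below is illustrative only; the value of rho is an arbitrary assumption.

```Matlab
% Worked example for solveEqui: a p x p correlation matrix with constant
% off-diagonal rho (0 <= rho < 1) has smallest eigenvalue 1 - rho, so the
% equi-correlated recipe gives s_j = min(2*(1 - rho), 1) for every j,
% up to the numerical-safety shrinkage applied in solveEqui.
p = 5; rho = 0.6;                         % illustrative values
Sigma = rho*ones(p) + (1 - rho)*eye(p);   % equi-correlated matrix
s = knockoffs.create.solveEqui(Sigma);
% Each entry of s should be close to min(2*(1 - rho), 1) = 0.8.
```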
/MATLAB/knockoffs_matlab/+knockoffs/+create/solveSDP.m: -------------------------------------------------------------------------------- 1 | function s = solveSDP(Sigma) 2 | %KNOCKOFFS.CREATE.SOLVESDP Computes the diagonal matrix 'diag_s', used to sample 3 | % Model-X and Fixed-X SDP knockoffs. 4 | % 5 | % Solves 6 | % 7 | % maximize 1' * s 8 | % subject to 0 <= s_i <= 1, (for all 1<=i<=p) 9 | % G = [Sigma , Sigma - diag(s); Sigma - diag(s) , Sigma] >= 0 10 | % 11 | % The LMI is equivalent to 2*Sigma - diag(s) >= 0 and s >= 0 12 | % 13 | % Using CVX, we solve this via the dual SDP. 14 | % 15 | % s = KNOCKOFFS.CREATE.SOLVESDP(Sigma) 16 | % 17 | % Inputs: 18 | % Sigma - p x p covariance matrix for the marginal distribution of X 19 | % 20 | % Outputs: 21 | % s - a vector of length p 22 | % 23 | % See also KNOCKOFFS.CREATE.SOLVEEQUI, KNOCKOFFS.CREATE.SOLVEASDP, 24 | % KNOCKOFFS.CREATE.GAUSSIAN, KNOCKOFFS.CREATE.FIXED 25 | 26 | 27 | % Validate CVX version. 28 | % (Build 1079 was released on Apr 23, 2014). 29 | if ~exist('cvx_begin', 'file') 30 | error('knockoff:MissingCVX', ... 31 | 'CVX is not installed. To use SDP knockoffs, please install CVX.') 32 | elseif knockoffs.private.cvxBuild() < 1079 33 | error('knockoff:OldCVX', ... 34 | 'CVX is too old. To use SDP knockoffs, please upgrade CVX.') 35 | end 36 | 37 | p = length(Sigma); 38 | 39 | % Convert the covariance matrix into a correlation matrix 40 | [scaleSigma, corrSigma] = cov2corr(Sigma); 41 | 42 | % Optimize the parameter s of Equation 3.2 according to the SDP 43 | % minimization problem of Equation 2.14. 44 | warning('off') 45 | cvx_begin quiet 46 | variable s(p); 47 | maximize(sum(s)) %#ok 48 | 2*corrSigma-diag(s) == semidefinite(p); %#ok 49 | 0 <= s <= 1 %#ok 50 | cvx_end 51 | warning('on') 52 | s(s < 0) = 0; 53 | 54 | % Raise an error if the solver failed 55 | if (any(isnan(s))) 56 | error('CVX failed to solve the SDP required to construct knockoffs. Please try a different solver.
To change the current solver, type: cvx_solver '); 57 | end 58 | 59 | % Compensate for numerical errors in CVX 60 | psd = 1; 61 | s_eps = 1e-8; 62 | while psd~=0 63 | % Compute knockoff conditional covariance matrix 64 | diag_s = sparse(diag(s.*(1-s_eps))); 65 | SigmaInv_s = corrSigma\diag_s; 66 | Sigma_k = 2*diag_s - diag_s*SigmaInv_s; 67 | 68 | [~,psd] = chol(Sigma_k); 69 | s_eps = s_eps*10; 70 | end 71 | s = s.*(1-s_eps/10); 72 | s(s < 0) = 0; 73 | 74 | % Scale back the results for a covariance matrix 75 | s = s(:) .* (scaleSigma(:).^2); 76 | 77 | end 78 | 79 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+private/Contents.m: -------------------------------------------------------------------------------- 1 | % +PRIVATE 2 | % 3 | % Files 4 | % cvxBuild.m - Returns the build number of the installed CVX package 5 | % normc.m - Center and normalize the columns of a matrix 6 | % canonicalSVD.m - Reduced SVD with canonical sign choice 7 | % decompose.m - Decompose design matrix X for knockoff creation -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+private/canonicalSVD.m: -------------------------------------------------------------------------------- 1 | function [U,S,V] = canonicalSVD(X) 2 | % KNOCKOFFS.PRIVATE.CANONICALSVD Reduced SVD with canonical sign choice 3 | % [U,S,V] = KNOCKOFFS.PRIVATE.CANONICALSVD(X) 4 | % 5 | % Computes a reduced SVD without sign ambiguity. Our convention is that 6 | % the sign of each vector in U is chosen such that the coefficient 7 | % with largest absolute value is positive.
8 | 9 | [U,S,V] = svd(X,0); 10 | 11 | for j = 1:min(size(X)) 12 | [~,i] = max(abs(U(:,j))); 13 | if U(i,j) < 0 14 | U(:,j) = -U(:,j); 15 | V(:,j) = -V(:,j); 16 | end 17 | end 18 | 19 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+private/cvxBuild.m: -------------------------------------------------------------------------------- 1 | function build = cvxBuild() 2 | % KNOCKOFFS.PRIVATE.CVXBUILD Returns the build number of the installed CVX package. 3 | 4 | % CVX provides no structured way to get this information, 5 | % so we resort to a hack. However, the natural hack--parsing the output 6 | % of 'cvx_version'--does not work, as described here: 7 | % 8 | % http://ask.cvxr.com/question/3058 9 | % 10 | % The following quite terrible hack is reported to work for CVX versions 11 | % 2 and 3 (and possibly even 1). 12 | 13 | cvx_version(1); 14 | global cvx___ 15 | 16 | fid = fopen([ cvx___.where, cvx___.fs, 'cvx_version.m' ]); 17 | source = fread(fid, Inf, 'uint8=>char')'; 18 | fclose(fid); 19 | 20 | buildStr = regexp(source, 'cvx_bld = ''(\d+)''', 'tokens'); 21 | build = str2double(buildStr{1}); 22 | 23 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+private/decompose.m: -------------------------------------------------------------------------------- 1 | function [U,S,V,U_perp] = decompose(X, randomize) 2 | % KNOCKOFFS.PRIVATE.DECOMPOSE Decompose design matrix X for knockoff creation 3 | % [U,S,V,U_perp] = KNOCKOFFS.PRIVATE.DECOMPOSE(X) 4 | % [U,S,V,U_perp] = KNOCKOFFS.PRIVATE.DECOMPOSE(X, randomize) 5 | 6 | if ~exist('randomize', 'var') || isempty(randomize) 7 | randomize = false; 8 | end 9 | 10 | % Check dimensions. 11 | [n, p] = size(X); 12 | if (n < 2*p) 13 | error('knockoff:DimensionError', 'Data matrix must have n >= 2p') 14 | end 15 | 16 | % Factorize X as X = USV' (reduced SVD).
17 | [U,S,V] = knockoffs.private.canonicalSVD(X); 18 | 19 | % Construct an orthogonal matrix U_perp such that U_perp'*X = 0. 20 | [Q,~] = qr([U zeros(n,p)], 0); % Skinny QR. 21 | U_perp = Q(:,p+1:2*p); 22 | if randomize 23 | [Q,~] = qr(randn(p),0); 24 | U_perp = U_perp * Q; 25 | end 26 | 27 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+private/normc.m: -------------------------------------------------------------------------------- 1 | function Y = normc(X) 2 | %KNOCKOFFS.PRIVATE.NORMC Normalize columns of a matrix. 3 | % A clone of NORMC from the Neural Network toolbox (except that this version also centers each column). 4 | 5 | n = size(X,1); 6 | X = bsxfun(@minus,X,mean(X,1)); 7 | factors = 1 ./ sqrt(sum(X.^2, 1)); 8 | Y = X .* factors(ones(1,n),:); 9 | 10 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/Contents.m: -------------------------------------------------------------------------------- 1 | % +PRIVATE 2 | % 3 | % Files 4 | % forwardSelection - Fast implementation of forward selection 5 | % forwardSelectionOMP - Fast implementation of forward selection with orthogonal matching pursuit (OMP) 6 | % forwardSelectionSlow - Slow reference implementation of forward selection 7 | % forwardSelectionSlowOMP - Slow reference implementation of forward selection with orthogonal matching pursuit (OMP) 8 | % lassoMaxLambda - Maximum lambdas at which variables enter the lasso model 9 | % lassoMaxLambda_binom - Maximum lambdas at which variables enter the lasso model (binomial response) 10 | % lassoMaxLambda_probit - Maximum lambdas at which variables enter the lasso model (probit response) 11 | % sequentialfs - Sequential feature selection 12 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/forwardSelection.m: -------------------------------------------------------------------------------- 1 | function varargout = forwardSelection(X,
y) 2 | % KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTION Fast implementation of forward selection 3 | % 4 | % Assumes that the columns of X are normalized to 1. 5 | % 6 | % See also KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTIONOMP 7 | 8 | X = knockoffs.private.normc(X); % Standardize the variables 9 | 10 | [varargout{1:nargout}] = ... 11 | knockoffs.stats.private.sequentialfs(@criterion, @target, X, y); 12 | 13 | end 14 | 15 | function c = criterion(~, x, residual) 16 | c = -abs(dot(x, residual)); 17 | end 18 | 19 | function nextResidual = target(~, x, residual) 20 | nextResidual = residual - dot(x, residual) .* x; 21 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/forwardSelectionOMP.m: -------------------------------------------------------------------------------- 1 | function varargout = forwardSelectionOMP(X, y) 2 | % KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTIONOMP Fast implementation of 3 | % forward selection with orthogonal matching pursuit (OMP) 4 | % 5 | % Assumes that the columns of X are normalized to 1. 6 | % 7 | % See also KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTION 8 | 9 | [n,p] = size(X); 10 | 11 | X = knockoffs.private.normc(X); % Standardize the variables 12 | 13 | Q = zeros(n,p); 14 | i = 1; 15 | 16 | function nextResidual = target(~, x, residual) 17 | % Orthonormalize using modified Gram-Schmidt. 18 | for j = 1:i-1 19 | x = x - dot(Q(:,j), x) .* Q(:,j); 20 | end 21 | q = x / norm(x); 22 | 23 | nextResidual = residual - dot(q,y) .* q; 24 | Q(:,i) = q; 25 | i = i+1; 26 | end 27 | 28 | [varargout{1:nargout}] = ...
29 | knockoffs.stats.private.sequentialfs(@criterion, @target, X, y); 30 | 31 | end 32 | 33 | function c = criterion(~, x, residual) 34 | c = -abs(dot(x, residual)); 35 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/forwardSelectionSlow.m: -------------------------------------------------------------------------------- 1 | function varargout = forwardSelectionSlow(X, y) 2 | % KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTIONSLOW Slow reference implementation of forward selection 3 | % 4 | % See also KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTION 5 | 6 | [varargout{1:nargout}] = ... 7 | knockoffs.stats.private.sequentialfs(@criterion, @target, X, y); 8 | 9 | end 10 | 11 | function c = criterion(~, x, residual) 12 | c = -abs(dot(x, residual)); 13 | end 14 | 15 | function nextResidual = target(~, x, residual) 16 | [~,~,nextResidual] = regress(residual, x); 17 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/forwardSelectionSlowOMP.m: -------------------------------------------------------------------------------- 1 | function varargout = forwardSelectionSlowOMP(X, y) 2 | % KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTIONSLOWOMP Slow reference implementation of forward 3 | % selection with orthogonal matching pursuit (OMP) 4 | % 5 | % See also KNOCKOFFS.STATS.PRIVATE.FORWARDSELECTIONOMP 6 | 7 | function residual = target(X, x, ~) 8 | warning_state = warning('off', 'stats:regress:RankDefDesignMat'); 9 | [~,~,residual] = regress(y, [X x]); 10 | warning(warning_state); 11 | end 12 | 13 | [varargout{1:nargout}] = ... 
14 | knockoffs.stats.private.sequentialfs(@criterion, @target, X, y); 15 | 16 | end 17 | 18 | function c = criterion(~, x, residual) 19 | c = -abs(dot(x, residual)); 20 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/lassoMaxLambda.m: -------------------------------------------------------------------------------- 1 | function first_lambda = lassoMaxLambda(X, y, nlambda) 2 | % KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA Maximum lambdas at which 3 | % variables enter the lasso model 4 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA(X, y) 5 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA(X, y, nlambda) 6 | % 7 | % 8 | % For each variable (column in X), computes the maximum value of lambda 9 | % at which the variable enters the lasso model. 10 | 11 | [n,p] = size(X); 12 | if ~exist('nlambda', 'var') || isempty(nlambda) 13 | nlambda = 200; 14 | end 15 | 16 | X = knockoffs.private.normc(X); % Standardize the variables 17 | 18 | options = glmnetSet(); 19 | options.standardize = false; 20 | options.intr = true; 21 | options.standardize_resp = false; 22 | 23 | lambda_max = max(abs(X'*y))/n; 24 | lambda_min = lambda_max/(2*1e3); 25 | k = (0:(nlambda-1))/nlambda; 26 | options.lambda = lambda_max .* (lambda_min/lambda_max).^k; 27 | 28 | fit = glmnet(X,y,[],options); 29 | first_lambda = zeros(1,p); 30 | for j = 1:p 31 | first_time = find(abs(fit.beta(j,:)) > 0, 1, 'first'); 32 | if isempty(first_time) 33 | first_lambda(j) = 0; 34 | else 35 | first_lambda(j) = fit.lambda(first_time); 36 | end 37 | end 38 | 39 | % glmnet uses non-standard scaling of lambda.
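The lambda grid built above is geometric: the exponents k/nlambda run from 0 up to (nlambda-1)/nlambda, so the grid starts exactly at lambda_max and decreases toward (but never quite reaches) lambda_min. A small Python sketch of just the grid construction (not a call into glmnet; `lambda_grid` is a hypothetical helper name):

```python
def lambda_grid(lambda_max, nlambda=200, ratio=1.0 / 2e3):
    """Geometric grid from lambda_max down toward lambda_min = ratio * lambda_max.

    Mirrors k = (0:(nlambda-1))/nlambda; lambda = lambda_max * ratio.^k
    from lassoMaxLambda.m. The first point is exactly lambda_max; the
    exponent never reaches 1, so lambda_min itself is never attained.
    """
    return [lambda_max * ratio ** (k / nlambda) for k in range(nlambda)]

grid = lambda_grid(1.0, nlambda=5)
print(grid[0])                                     # exactly lambda_max
print(all(a > b for a, b in zip(grid, grid[1:])))  # strictly decreasing
```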
40 | first_lambda = first_lambda * n; 41 | 42 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/lassoMaxLambda_binom.m: -------------------------------------------------------------------------------- 1 | function first_lambda = lassoMaxLambda_binom(X, y, nlambda) 2 | % KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_BINOM Maximum lambdas at which 3 | % variables enter the lasso model (binomial response) 4 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_BINOM(X, y) 5 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_BINOM(X, y, nlambda) 6 | % 7 | % For each variable (column in X), computes the maximum value of lambda 8 | % at which the variable enters the lasso logistic regression model. 9 | 10 | [n,p] = size(X); 11 | if ~exist('nlambda', 'var') || isempty(nlambda) 12 | nlambda = 200; 13 | end 14 | 15 | X = knockoffs.private.normc(X); % Standardize the variables 16 | 17 | options = glmnetSet(); 18 | options.standardize = false; 19 | options.intr = true; 20 | options.standardize_resp = false; 21 | 22 | lambda_max = max(abs(X'*(0.5-y)))/n; 23 | lambda_min = lambda_max/(2*1e3); 24 | k = (0:(nlambda-1))/nlambda; 25 | options.lambda = lambda_max .* (lambda_min/lambda_max).^k; 26 | 27 | fit = glmnet(X,y,'binomial',options); % binomial response 28 | first_lambda = zeros(1,p); 29 | for j = 1:p 30 | first_time = find(abs(fit.beta(j,:)) > 0, 1, 'first'); 31 | if isempty(first_time) 32 | first_lambda(j) = 0; 33 | else 34 | first_lambda(j) = fit.lambda(first_time); 35 | end 36 | end 37 | 38 | % glmnet uses non-standard scaling of lambda.
39 | first_lambda = first_lambda * n; 40 | 41 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/lassoMaxLambda_probit.m: -------------------------------------------------------------------------------- 1 | function first_lambda = lassoMaxLambda_probit(X, y, nlambda) 2 | % KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_PROBIT Maximum lambdas at which 3 | % variables enter the lasso model (probit response) 4 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_PROBIT(X, y) 5 | % maxLambda = KNOCKOFFS.STATS.PRIVATE.LASSOMAXLAMBDA_PROBIT(X, y, nlambda) 6 | % 7 | % For each variable (column in X), computes the maximum value of lambda 8 | % at which the variable enters the lasso probit regression model. 9 | % Note: the intercept cannot be removed when using LASSOGLM. 10 | 11 | [n,p] = size(X); 12 | if ~exist('nlambda', 'var') || isempty(nlambda) 13 | nlambda = 200; 14 | end 15 | lambdaratio = 1/(2e3); 16 | 17 | X = knockoffs.private.normc(X); % Standardize the variables 18 | 19 | [B,FitInfo] = lassoglm(X,y,'binomial','Link','probit','Standardize',false,'LambdaRatio',lambdaratio,'NumLambda',nlambda); 20 | if size(B,2) < nlambda, warning('lassoglm returned fewer lambda values than requested'); end 21 | % LASSOGLM returns the path in ascending order of lambda; flip to descending. 22 | B = fliplr(B); lambdas = fliplr(FitInfo.Lambda); 23 | first_lambda = zeros(1,p); 24 | for j = 1:p 25 | first_time = find(abs(B(j,:)) > 0, 1, 'first'); 26 | if isempty(first_time) 27 | first_lambda(j) = 0; 28 | else 29 | first_lambda(j) = lambdas(first_time); 30 | end 31 | end 32 | 33 | % lassoglm uses non-standard scaling of lambda. 34 | first_lambda = first_lambda * n; 35 | 36 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/+private/sequentialfs.m: -------------------------------------------------------------------------------- 1 | function [added, history] = sequentialfs(crit_fn, target_fn, X, y) 2 | % KNOCKOFFS.STATS.PRIVATE.SEQUENTIALFS Sequential feature selection 3 | % added = KNOCKOFFS.STATS.PRIVATE.SEQUENTIALFS(crit_fn, target_fn, X, y, ...) 4 | % [added, history] = KNOCKOFFS.STATS.PRIVATE.SEQUENTIALFS(crit_fn, target_fn, X, y, ...)
5 | % 6 | % This function is a variant of the standard MATLAB function of the same 7 | % name. It omits many features of that function, but it adds the ability 8 | % to change the target function at every step. This is useful for 9 | % computing residuals. 10 | 11 | [n,p] = size(X); 12 | assert(isequal(size(y), [n 1])); 13 | 14 | added = zeros(1,p); 15 | in = false(1,p); 16 | target = y; 17 | 18 | if nargout > 1 19 | history = struct('Crit', zeros(1,p), ... 20 | 'Target', zeros(p,n), ... 21 | 'In', false(p,p)); 22 | history.Target(1,:) = target; 23 | end 24 | 25 | for step = 1:p 26 | X_in = X(:,in); 27 | available = find(~in); 28 | 29 | % Find the best variable to add among the remaining variables. 30 | criteria = zeros(1, length(available)); 31 | for j = 1:length(available) 32 | criteria(j) = crit_fn(X_in, X(:,available(j)), target); 33 | end 34 | [best_crit, best_j] = min(criteria); 35 | best_var = available(best_j); 36 | added(step) = best_var; 37 | in(best_var) = true; 38 | 39 | % Compute the new target from the old. 40 | if step ~= p 41 | target = target_fn(X_in, X(:,best_var), target); 42 | end 43 | 44 | % Update history, if necessary. 
45 | if nargout > 1 46 | history.In(step,:) = in; 47 | history.Crit(step) = best_crit; 48 | if step ~= p 49 | history.Target(step+1,:) = target; 50 | end 51 | end 52 | 53 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/Contents.m: -------------------------------------------------------------------------------- 1 | % +STATS 2 | % 3 | % Files 4 | % bvsProbDiff - The Bayesian posterior probability difference statistic W 5 | % bvsProbDiff2 - The Bayesian posterior probability difference statistic W (with additional constraint) 6 | % forwardSelection - Forward selection statistic W 7 | % forwardSelectionOMP - Forward selection statistic with OMP 8 | % lassoCoefDiff - Coefficient difference lasso statistic W with cross-validation 9 | % lassoCoefDiff_bin - Coefficient difference lasso statistic W with cross-validation (binomial response) 10 | % lassoCoefDiff_bin_refit - Coefficient difference lasso statistic W with cross-validated lambda (binomial response) 11 | % lassoLambdaDifference - Lambda difference lasso statistic W 12 | % lassoLambdaDifference_bin - Lambda difference lasso statistic W (binomial response) 13 | % lassoLambdaSignedMax - Signed maximum lasso statistic W 14 | % lassoLambdaSignedMax_bin - Signed maximum lasso statistic W (binomial response) 15 | % olsCoefDiff - The coefficient difference OLS statistic W 16 | % randomForest - The random forest feature importance difference W 17 | % ridgeCoefDiff - Coefficient difference ridge statistic W 18 | % ridgeCoefDiff_bin - Coefficient difference ridge statistic W (binomial response) 19 | % sqrtLassoCoefDiff - Coefficient difference square-root lasso statistic W 20 | % stabilitySignedMax - Signed difference of stability selection W 21 | % stabilitySignedMax_bin - Signed difference of stability selection W (binomial response) -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/bvsProbDiff.m: 
-------------------------------------------------------------------------------- 1 | function [W,Z] = bvsProbDiff(X, X_ko, y, options) 2 | % KNOCKOFFS.STATS.BVSPROBDIFF The Bayesian posterior probability difference statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.BVSPROBDIFF(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.BVSPROBDIFF(X, X_ko, y, options) 5 | % 6 | % Computes the statistic 7 | % 8 | % W_j = |P_j| - |\tilde P_j|, 9 | % 10 | % where P_j and \tilde P_j are the posterior probabilities of the jth 11 | % variable and its knockoff being nonzero in the Bayesian variable 12 | % selection problem with conjugate priors (for multivariate normal X). 13 | % 14 | % See also KNOCKOFFS.STATS.BVSPROBDIFF2 15 | 16 | if ~exist('options', 'var') 17 | options = struct(); 18 | options.tau2 = 1; 19 | options.pi = 0.02; %this is a fraction of p, not 2p 20 | options.alpha = 2; %shape 21 | options.delta = 1; %scale=beta on Wikipedia 22 | options.burn = 1000; 23 | options.nit = 3000; 24 | end 25 | 26 | tau2 = options.tau2; 27 | pi = options.pi/2; %note division by 2 28 | alpha = options.alpha; 29 | delta = options.delta; 30 | burn = options.burn; 31 | nit = options.nit; 32 | 33 | [n,p] = size(X); 34 | Xall = [X X_ko]; 35 | 36 | sig2 = nan(burn+nit,1); 37 | g = nan(2*p,burn+nit); 38 | b = nan(2*p,burn+nit); 39 | 40 | sig2(1) = delta/(alpha-1); 41 | g(:,1) = zeros(2*p,1); 42 | b(:,1) = zeros(2*p,1); 43 | 44 | X2 = sum(Xall.^2,1); 45 | XtX = Xall'*Xall; 46 | r = y; %residual vector 47 | Xr = Xall'*r; 48 | tic 49 | for i = 2:(burn+nit) 50 | % beta updates 51 | for j = 1:(2*p) 52 | if g(j,i-1)==0 53 | b(j,i) = normrnd(0,sqrt(tau2)); 54 | else 55 | b(j,i) = normrnd((Xr(j)+X2(j)*b(j,i-1))/(X2(j)+sig2(i-1)/tau2), sqrt(sig2(i-1)/(X2(j)+sig2(i-1)/tau2))); 56 | Xr = Xr - XtX(:,j)*(b(j,i)-b(j,i-1)); 57 | end 58 | end 59 | r = r - Xall(:,g(:,i-1)==1)*(b(g(:,i-1)==1,i)-b(g(:,i-1)==1,i-1)); 60 | 61 | % gamma updates 62 | for j = 1:(2*p) 63 | if g(j,i-1)==1 64 | logpr1 = log(pi) - 
sum(r.^2)/(2*sig2(i-1)); 65 | logpr0 = log(1-pi) - sum((r+Xall(:,j)*b(j,i)).^2)/(2*sig2(i-1)); 66 | else 67 | logpr1 = log(pi) - sum((r-Xall(:,j)*b(j,i)).^2)/(2*sig2(i-1)); 68 | logpr0 = log(1-pi) - sum(r.^2)/(2*sig2(i-1)); 69 | end 70 | g(j,i) = binornd(1,1/(1+exp(logpr0-logpr1))); 71 | if g(j,i)~=g(j,i-1), r = r - Xall(:,j)*b(j,i)*(g(j,i)-g(j,i-1)); end 72 | end 73 | dg = g(:,i)~=g(:,i-1); 74 | Xr = Xr - XtX(:,dg)*(b(dg,i).*(g(dg,i)-g(dg,i-1))); 75 | 76 | % sigma2 updates 77 | sig2(i) = 1/gamrnd(alpha+n/2,1/(delta+sum(r.^2)/2)); 78 | 79 | subtime = toc; 80 | if i==burn, fprintf(['Burn-in finished after ' num2str(subtime) 'sec\n']); end 81 | if i>burn && mod(i-burn,nit/2)==0, fprintf(['Iteration ' num2str(i-burn) ' complete after ' num2str(subtime) '\n']); end 82 | end 83 | Z = mean(g(:,burn+(1:nit)),2); 84 | 85 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 86 | 87 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/bvsProbDiff2.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = bvsProbDiff2(X, X_ko, y, options) 2 | % KNOCKOFFS.STATS.BVSPROBDIFF2 The Bayesian posterior probability 3 | % difference statistic W (with additional constraint) 4 | % [W, Z] = KNOCKOFFS.STATS.BVSPROBDIFF2(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.BVSPROBDIFF2(X, X_ko, y, options) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = |P_j| - |\tilde P_j|, 10 | % 11 | % where P_j and \tilde P_j are the posterior probabilities of the jth 12 | % variable and its knockoff being nonzero in the Bayesian variable 13 | % selection problem with conjugate priors (for multivariate normal X). 
14 | % Requires Gamma_j + \tilde Gamma_j <= 1 15 | % 16 | % See also KNOCKOFFS.STATS.BVSPROBDIFF 17 | 18 | if ~exist('options', 'var') 19 | options = struct(); 20 | options.tau2 = 1; 21 | options.pi = 0.02; %this is a fraction of p, not 2p 22 | options.alpha = 2; %shape 23 | options.delta = 1; %scale=beta on Wikipedia 24 | options.burn = 1000; 25 | options.nit = 3000; 26 | end 27 | 28 | tau2 = options.tau2; 29 | pi = options.pi; 30 | alpha = options.alpha; 31 | delta = options.delta; 32 | burn = options.burn; 33 | nit = options.nit; 34 | 35 | [n,p] = size(X); 36 | Xall = [X X_ko]; 37 | 38 | sig2 = nan(burn+nit,1); 39 | g = nan(2*p,burn+nit); 40 | b = nan(2*p,burn+nit); 41 | 42 | sig2(1) = delta/(alpha-1); 43 | g(:,1) = zeros(2*p,1); 44 | b(:,1) = zeros(2*p,1); 45 | 46 | X2 = sum(Xall.^2,1); 47 | XtX = Xall'*Xall; 48 | r = y; %residual vector 49 | Xr = Xall'*r; 50 | %tic 51 | for i = 2:(burn+nit) 52 | % beta updates 53 | for j = 1:(2*p) 54 | if g(j,i-1)==0 55 | b(j,i) = normrnd(0,sqrt(tau2)); 56 | else 57 | b(j,i) = normrnd((Xr(j)+X2(j)*b(j,i-1))/(X2(j)+sig2(i-1)/tau2), sqrt(sig2(i-1)/(X2(j)+sig2(i-1)/tau2))); 58 | Xr = Xr - XtX(:,j)*(b(j,i)-b(j,i-1)); 59 | end 60 | end 61 | r = r - Xall(:,g(:,i-1)==1)*(b(g(:,i-1)==1,i)-b(g(:,i-1)==1,i-1)); 62 | 63 | % gamma updates 64 | for j = 1:p 65 | if g(j,i-1)==0 && g(j+p,i-1)==0 66 | r0 = r; r1 = r-Xall(:,j)*b(j,i); r2 = r-Xall(:,j+p)*b(j+p,i); 67 | elseif g(j,i-1)==1 && g(j+p,i-1)==0 68 | r0 = r+Xall(:,j)*b(j,i); r1 = r; r2 = r0-Xall(:,j+p)*b(j+p,i); 69 | else %g(j,i-1)==0 && g(j+p,i-1)==1 70 | r0 = r+Xall(:,j+p)*b(j+p,i); r1 = r0-Xall(:,j)*b(j,i); r2 = r; 71 | end 72 | logpr0 = log(1-pi) - sum(r0.^2)/(2*sig2(i-1)); 73 | logpr1 = log(pi/2) - sum(r1.^2)/(2*sig2(i-1)); 74 | logpr2 = log(pi/2) - sum(r2.^2)/(2*sig2(i-1)); 75 | choicej = find(mnrnd(1,[1/(1+exp(logpr1-logpr0)+exp(logpr2-logpr0)) 76 | 1/(exp(logpr0-logpr1)+1+exp(logpr2-logpr1)) 77 | 1/(exp(logpr0-logpr2)+exp(logpr1-logpr2)+1)])); 78 | switch choicej 79 | case 1 80 
| g(j,i) = 0; g(j+p,i) = 0; r = r0; 81 | case 2 82 | g(j,i) = 1; g(j+p,i) = 0; r = r1; 83 | case 3 84 | g(j,i) = 0; g(j+p,i) = 1; r = r2; 85 | end 86 | end 87 | dg = g(:,i)~=g(:,i-1); 88 | Xr = Xr - XtX(:,dg)*(b(dg,i).*(g(dg,i)-g(dg,i-1))); 89 | 90 | % sigma2 updates 91 | sig2(i) = 1/gamrnd(alpha+n/2,1/(delta+sum(r.^2)/2)); 92 | 93 | % subtime = toc; 94 | % if i==burn, fprintf(['Burn-in finished after ' num2str(subtime) 'sec\n']); end 95 | % if i>burn && mod(i-burn,nit/2)==0, fprintf(['Iteration ' num2str(i-burn) ' complete after ' num2str(subtime) '\n']); end 96 | end 97 | Z = mean(g(:,burn+(1:nit)),2); 98 | 99 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 100 | 101 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/forwardSelection.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = forwardSelection(X, X_ko, y) 2 | % KNOCKOFFS.STATS.FORWARDSELECTION Forward selection statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.FORWARDSELECTION(X, X_ko, y) 4 | % 5 | % Computes the statistic 6 | % 7 | % W_j = max(Z_j, Z_{j+p}) * sgn(Z_j - Z_{j+p}) 8 | % 9 | % where Z_1,\dots,Z_{2p} give the reverse order in which the 2p variables 10 | % (the originals and the knockoffs) enter the forward selection model. 11 | % 12 | % See also KNOCKOFFS.STATS.FORWARDSELECTIONOMP. 
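As a toy illustration of the statistic documented above (Python, hypothetical data; the real computation runs the forward-selection path in MATLAB): given the order in which the 2p variables entered the model, Z assigns higher scores to earlier entries, and W compares each original against its knockoff.

```python
def forward_selection_W(added, p):
    """Knockoff statistic W from a forward-selection entry order.

    added: the 2p variable indices (0-based) in the order they entered.
    Z_j = 2p + 1 - (entry position of variable j), so earlier entry
    means larger Z; W_j = max(Z_j, Z_{j+p}) * sgn(Z_j - Z_{j+p}).
    """
    entry = [0] * (2 * p)
    for pos, var in enumerate(added, start=1):
        entry[var] = pos
    Z = [2 * p + 1 - entry[j] for j in range(2 * p)]
    sgn = lambda x: (x > 0) - (x < 0)
    return [max(Z[j], Z[j + p]) * sgn(Z[j] - Z[j + p]) for j in range(p)]

# p = 2: variables 0,1 are originals; 2,3 are their knockoffs.
# Entry order: x0, then knockoff of x1, then x1, then knockoff of x0.
print(forward_selection_W([0, 3, 1, 2], p=2))  # → [4, -3]
```

A large positive W_j means the original entered well before its knockoff, which is evidence that variable j is non-null.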
13 | 14 | added = knockoffs.stats.private.forwardSelection([X X_ko], y); 15 | [~,order] = sort(added); 16 | 17 | p = size(X,2); 18 | Z = 2*p + 1 - order; 19 | orig = 1:p; ko = (p+1):(2*p); 20 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 21 | 22 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/forwardSelectionOMP.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = forwardSelectionOMP(X, X_ko, y) 2 | % KNOCKOFFS.STATS.FORWARDSELECTIONOMP Forward selection statistic with OMP 3 | % [W, Z] = KNOCKOFFS.STATS.FORWARDSELECTIONOMP(X, X_ko, y) 4 | % 5 | % This variant of forward selection uses orthogonal matching pursuit 6 | % (OMP). 7 | % 8 | % See also KNOCKOFFS.STATS.FORWARDSELECTION. 9 | 10 | added = knockoffs.stats.private.forwardSelectionOMP([X X_ko], y); 11 | [~,order] = sort(added); 12 | 13 | p = size(X,2); 14 | Z = 2*p + 1 - order; 15 | orig = 1:p; ko = (p+1):(2*p); 16 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 17 | 18 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoCoefDiff.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = lassoCoefDiff(X, X_ko, y, nfolds, cv) 2 | % KNOCKOFFS.STATS.LASSOCOEFDIFF Coefficient difference lasso statistic W 3 | % with cross-validation 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF(X, X_ko, y, nfolds) 6 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF(X, X_ko, y, nfolds, cv) 7 | % 8 | % Computes the statistic 9 | % 10 | % W_j = |Z_j| - |\tilde Z_j|, 11 | % 12 | % where Z_j and \tilde Z_j are the coefficient values of the 13 | % jth variable and its knockoff, respectively, resulting from 14 | % cross-validated lasso regression. 15 | % 16 | % See also KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN. 
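The coefficient-difference construction is the same across all the *CoefDiff statistics: fit one model on the augmented matrix [X X_ko], drop the intercept, and subtract the knockoff coefficient magnitude from the original's. A Python sketch with made-up coefficients (the real Z comes from the cross-validated fit):

```python
def coef_diff_W(Z, p):
    """W_j = |Z_j| - |Z_{j+p}| for 2p fitted coefficients, originals first."""
    assert len(Z) == 2 * p
    return [abs(Z[j]) - abs(Z[j + p]) for j in range(p)]

# Hypothetical fitted coefficients for p = 3 originals and their knockoffs:
print(coef_diff_W([1.5, -0.25, 0.0, 0.5, -0.75, 0.0], p=3))  # → [1.0, -0.5, 0.0]
```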
17 | 18 | if ~exist('nfolds', 'var'), nfolds = []; end 19 | if ~exist('cv', 'var'), cv = 'lambda_1se'; end 20 | 21 | p = size(X,2); 22 | 23 | options = glmnetSet(); 24 | options.standardize = true; 25 | options.intr = true; 26 | options.standardize_resp = false; 27 | options.alpha = 1; % lasso regression 28 | 29 | Z = cvglmnetCoef(cvglmnet([X X_ko],y,'gaussian',options,[],nfolds),cv); 30 | Z = Z(2:end); % drop intercept 31 | 32 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 33 | 34 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoCoefDiff_bin.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = lassoCoefDiff_bin(X, X_ko, y, nfolds) 2 | % KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN Coefficient difference lasso statistic 3 | % W with cross-validation (binomial response) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN(X, X_ko, y, nfolds) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = |Z_j| - |\tilde Z_j|, 10 | % 11 | % where Z_j and \tilde Z_j are the coefficient values of the 12 | % jth variable and its knockoff, respectively, resulting from 13 | % cross-validated logistic regression with L1 regularization. 14 | % 15 | % See also KNOCKOFFS.STATS.LASSOCOEFDIFF, KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN_REFIT. 
16 | 17 | if ~exist('nfolds', 'var'), nfolds = []; end 18 | 19 | p = size(X,2); 20 | 21 | options = glmnetSet(); 22 | options.standardize = true; 23 | options.intr = true; 24 | options.standardize_resp = false; 25 | options.alpha = 1; % lasso regression 26 | 27 | Z = cvglmnetCoef(cvglmnet([X X_ko],y,'binomial',options,[],nfolds)); %uses default 1se rule 28 | Z = Z(2:end); % drop intercept 29 | 30 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 31 | 32 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoCoefDiff_bin_refit.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = lassoCoefDiff_bin_refit(X, X_ko, y, nfolds) 2 | % KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN_REFIT Coefficient difference lasso 3 | % statistic W with cross-validated lambda (binomial response) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN_REFIT(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN_REFIT(X, X_ko, y, nfolds) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = |Z_j| - |\tilde Z_j|, 10 | % 11 | % where Z_j and \tilde Z_j are the coefficient values of the 12 | % jth variable and its knockoff, respectively, resulting from 13 | % cross-validated logistic regression with L1 regularization. 14 | % The coefficients are obtained from a re-fitted model (on the full data) 15 | % after lambda has been selected by cross-validation. 16 | % 17 | % See also KNOCKOFFS.STATS.LASSOCOEFDIFF, KNOCKOFFS.STATS.LASSOCOEFDIFF_BIN. 
18 | 19 | if ~exist('nfolds', 'var'), nfolds = []; end 20 | 21 | p = size(X,2); 22 | 23 | options = glmnetSet(); 24 | options.standardize = true; 25 | options.intr = true; 26 | options.standardize_resp = false; 27 | options.alpha = 1; % lasso regression 28 | 29 | fit = cvglmnet([X X_ko],y,'binomial',options,[],nfolds); 30 | lambda = max(fit.lambda(fit.cvm<=min(fit.cvup))); 31 | Z = cvglmnetCoef(fit,lambda); 32 | Z = Z(2:end); % drop intercept 33 | 34 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 35 | 36 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoLambdaDifference.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = lassoLambdaDifference(X, X_ko, y, nlambda) 2 | % KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE Lambda difference lasso statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE(X, X_ko, y, nlambda) 5 | % 6 | % Computes the statistic 7 | % 8 | % W_j = Z_j - \tilde Z_j, 9 | % 10 | % where Z_j and \tilde Z_j are the maximum values of the regularization 11 | % parameter lambda at which the jth variable and its knockoff, 12 | % respectively, enter the lasso model. 13 | % 14 | % See also KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX, KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE_BIN.
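A Python sketch of the statistic just described, with hypothetical entry lambdas (in practice Z comes from lassoMaxLambda): W_j is simply the entry lambda of each original minus that of its knockoff.

```python
def lambda_diff_W(Z, p):
    """W_j = Z_j - Z_{j+p}: entry-lambda difference, originals first.

    Z[j] is the largest lambda at which variable j is in the lasso model
    (0 if it never enters). A large positive W_j means the original
    entered much earlier on the path than its knockoff.
    """
    assert len(Z) == 2 * p
    return [Z[j] - Z[j + p] for j in range(p)]

# Original x0 enters at lambda = 0.75, its knockoff at 0.25;
# original x1 never enters (0.0), its knockoff enters at 0.5:
print(lambda_diff_W([0.75, 0.0, 0.25, 0.5], p=2))  # → [0.5, -0.5]
```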
15 | 16 | if ~exist('nlambda', 'var'), nlambda = []; end 17 | 18 | Z = knockoffs.stats.private.lassoMaxLambda([X X_ko], y, nlambda); 19 | 20 | p = size(X,2); 21 | orig = 1:p; ko = (p+1):(2*p); 22 | W = Z(orig) - Z(ko); 23 | 24 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoLambdaDifference_bin.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = lassoLambdaDifference_bin(X, X_ko, y, nlambda) 2 | % KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE_BIN Lambda difference lasso statistic W (binomial response) 3 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE_BIN(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE_BIN(X, X_ko, y, nlambda) 5 | % 6 | % Computes the statistic 7 | % 8 | % W_j = Z_j - \tilde Z_j, 9 | % 10 | % where Z_j and \tilde Z_j are the maximum values of the regularization 11 | % parameter lambda at which the jth variable and its knockoff, 12 | % respectively, enter the logistic regression model with L1 penalty. 13 | % 14 | % See also KNOCKOFFS.STATS.LASSOLAMBDADIFFERENCE. 15 | 16 | if ~exist('nlambda', 'var'), nlambda = []; end 17 | 18 | Z = knockoffs.stats.private.lassoMaxLambda_binom([X X_ko], y, nlambda); 19 | 20 | p = size(X,2); 21 | orig = 1:p; ko = (p+1):(2*p); 22 | W = Z(orig) - Z(ko); 23 | 24 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoLambdaSignedMax.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = lassoLambdaSignedMax(X, X_ko, y, nlambda) 2 | % KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX Signed maximum lasso statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX(X, X_ko, y, nlambda) 5 | % 6 | % Computes the lasso statistic of Equation 1.7: 7 | % 8 | % W_j = max(Z_j, \tilde Z_j) * sgn(Z_j - \tilde Z_j).
9 | % 10 | % Here Z_j and \tilde Z_j are the maximum values of the regularization 11 | % parameter lambda at which the jth variable and its knockoff, 12 | % respectively, enter the lasso model. 13 | % 14 | % Note that the lasso path is not computed exactly, but approximated by 15 | % a fine grid of lambda values. The optional parameter 'nlambda' controls 16 | % the number of points in this grid. The default value is 200. 17 | % If the lasso path contains closely spaced knots, it may be useful 18 | % to increase the value of 'nlambda'. 19 | % The demo 'FirstExamples' shows how to do this. 20 | % 21 | % See also KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX_BIN. 22 | 23 | if ~exist('nlambda', 'var'), nlambda = 200; end 24 | 25 | Z = knockoffs.stats.private.lassoMaxLambda([X X_ko], y, nlambda); 26 | 27 | p = size(X,2); 28 | orig = 1:p; ko = (p+1):(2*p); 29 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 30 | 31 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/lassoLambdaSignedMax_bin.m: -------------------------------------------------------------------------------- 1 | function [W, Z] = lassoLambdaSignedMax_bin(X, X_ko, y, nlambda) 2 | % KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX_BIN Signed maximum lasso statistic 3 | % W (binomial response) 4 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX_BIN(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX_BIN(X, X_ko, y, nlambda) 6 | % 7 | % Computes the lasso statistic of Equation 1.7: 8 | % 9 | % W_j = max(Z_j, \tilde Z_j) * sgn(Z_j - \tilde Z_j). 10 | % 11 | % Here Z_j and \tilde Z_j are the maximum values of the regularization 12 | % parameter lambda at which the jth variable and its knockoff, 13 | % respectively, enter the lasso logistic regression model. 14 | % 15 | % Note that the lasso path is not computed exactly, but approximated by 16 | % a fine grid of lambda values.
The optional parameter 'nlambda' controls 17 | % the number of points in this grid. The default value is 200. 18 | % If the lasso path contains closely spaced knots, it may be useful 19 | % to increase the value of 'nlambda'. 20 | % The demo 'FirstExamples' shows how to do this. 21 | % 22 | % See also KNOCKOFFS.STATS.LASSOLAMBDASIGNEDMAX. 23 | 24 | if ~exist('nlambda', 'var'), nlambda = 200; end 25 | 26 | Z = knockoffs.stats.private.lassoMaxLambda_binom([X X_ko], y, nlambda); %only diff 27 | 28 | p = size(X,2); 29 | orig = 1:p; ko = (p+1):(2*p); 30 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 31 | 32 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/olsCoefDiff.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = olsCoefDiff(X, X_ko, y) 2 | % KNOCKOFFS.STATS.OLSCOEFDIFF The coefficient difference OLS statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.OLSCOEFDIFF(X, X_ko, y) 4 | % 5 | % Computes the statistic 6 | % 7 | % W_j = |Z_j| - |\tilde Z_j|, 8 | % 9 | % where Z_j and \tilde Z_j are the coefficient values of the 10 | % jth variable and its knockoff, respectively, resulting from 11 | % OLS regression. 
12 | % 13 | % See also KNOCKOFFS.STATS.LASSOCOEFDIFF, KNOCKOFFS.STATS.RIDGECOEFDIFF 14 | 15 | p = size(X,2); 16 | 17 | Z = glmfit([X X_ko],y,'normal'); 18 | Z = Z(2:end); % drop intercept 19 | 20 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 21 | 22 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/randomForest.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = randomForest(X, X_ko, y, ntrees, plotTF) 2 | % KNOCKOFFS.STATS.RANDOMFOREST The random forest feature importance difference W 3 | % [W, Z] = KNOCKOFFS.STATS.RANDOMFOREST(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.RANDOMFOREST(X, X_ko, y, ntrees) 5 | % [W, Z] = KNOCKOFFS.STATS.RANDOMFOREST(X, X_ko, y, ntrees, plotTF) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = |Z_j| - |\tilde Z_j|, 10 | % 11 | % where Z_j and \tilde Z_j are the feature importances of the 12 | % jth variable and its knockoff, respectively, resulting from 13 | % fitting a random forest. 14 | % 15 | % The importance of a variable is measured as the increase in the 16 | % out-of-bag prediction error that results from randomly permuting the 17 | % values of that variable, averaged over all trees (the permutation 18 | % importance reported by TreeBagger's OOBPermutedPredictorDeltaError).
19 | 20 | if ~exist('ntrees', 'var'), ntrees = 1000; end 21 | if ~exist('plotTF', 'var'), plotTF = false; end 22 | 23 | p = size(X,2); 24 | 25 | %TODO: maybe parallelize this 26 | B = TreeBagger(ntrees,[X X_ko],y,'method','regression','OOBPredictorImportance','On');%,'NumPrint',10); 27 | Z = B.OOBPermutedPredictorDeltaError; 28 | if plotTF, plot(oobError(B)); end 29 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 30 | 31 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/ridgeCoefDiff.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = ridgeCoefDiff(X, X_ko, y, nfolds) 2 | % KNOCKOFFS.STATS.RIDGECOEFDIFF Coefficient difference ridge statistic W 3 | % [W, Z] = KNOCKOFFS.STATS.RIDGECOEFDIFF(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.RIDGECOEFDIFF(X, X_ko, y, nfolds) 5 | % 6 | % Computes the statistic 7 | % 8 | % W_j = |Z_j| - |\tilde Z_j|, 9 | % 10 | % where Z_j and \tilde Z_j are the coefficient values of the 11 | % jth variable and its knockoff, respectively, resulting from 12 | % cross-validated ridge regression.
13 | % 14 | % See also KNOCKOFFS.STATS.LASSOCOEFDIFF, KNOCKOFFS.STATS.OLSCOEFDIFF, KNOCKOFFS.STATS.RIDGECOEFDIFF_BIN 15 | 16 | if ~exist('nfolds', 'var'), nfolds = []; end 17 | 18 | p = size(X,2); 19 | 20 | options = glmnetSet(); 21 | options.standardize = true; 22 | options.intr = true; 23 | options.standardize_resp = false; 24 | options.alpha = 0; % ridge regression 25 | 26 | Z = cvglmnetCoef(cvglmnet([X X_ko],y,'gaussian',options,[],nfolds)); %uses default 1se rule 27 | Z = Z(2:end); % drop intercept 28 | 29 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 30 | 31 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/ridgeCoefDiff_bin.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = ridgeCoefDiff_bin(X, X_ko, y, nfolds) 2 | % KNOCKOFFS.STATS.RIDGECOEFDIFF_BIN Coefficient difference ridge statistic 3 | % W (binomial response) 4 | % [W, Z] = KNOCKOFFS.STATS.RIDGECOEFDIFF_BIN(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.RIDGECOEFDIFF_BIN(X, X_ko, y, nfolds) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = |Z_j| - |\tilde Z_j|, 10 | % 11 | % where Z_j and \tilde Z_j are the coefficient values of the 12 | % jth variable and its knockoff, respectively, resulting from 13 | % cross-validated logistic regression with L2 regularization.
14 | % 15 | % See also KNOCKOFFS.STATS.RIDGECOEFDIFF 16 | 17 | if ~exist('nfolds', 'var'), nfolds = []; end 18 | 19 | p = size(X,2); 20 | 21 | options = glmnetSet(); 22 | options.standardize = true; 23 | options.intr = true; 24 | options.standardize_resp = false; 25 | options.alpha = 0; % ridge regression 26 | lambda_max = max(abs(X'*(y-1/2)))/size(X,1); 27 | lambda_min = lambda_max/(2*1e3); 28 | nlambda = 100; 29 | k = (0:(nlambda-1))/nlambda; 30 | options.lambda = lambda_max .* (lambda_min/lambda_max).^k; 31 | 32 | Z = cvglmnetCoef(cvglmnet([X X_ko],y,'binomial',options,[],nfolds)); %uses default 1se rule 33 | Z = Z(2:end); % drop intercept 34 | 35 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 36 | 37 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/sqrtLassoCoefDiff.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = sqrtLassoCoefDiff(X, X_ko, y, lambda) 2 | % KNOCKOFFS.STATS.SQRTLASSOCOEFDIFF Coefficient difference SQRT-lasso statistic W 3 | % [W, Z] = SQRTLASSOCOEFDIFF(X, X_ko, y) 4 | % [W, Z] = SQRTLASSOCOEFDIFF(X, X_ko, y, lambda) 5 | % 6 | % Computes the statistic 7 | % 8 | % W_j = |Z_j| - |\tilde Z_j|, 9 | % 10 | % where Z_j and \tilde Z_j are the coefficient values 11 | % of the jth variable and its knockoff, respectively, resulting 12 | % from fitting the SQRT-lasso.
13 | % 14 | % See also KNOCKOFFS.STATS.STABILITYSIGNEDMAX 15 | 16 | m = 1000; 17 | [n,p] = size(X); 18 | alpha = 0.05; 19 | kappa = 0.7; 20 | 21 | if ~exist('lambda', 'var') 22 | eps = normrnd(0,1,n,m); 23 | S = [X X_ko]'*eps/n; 24 | Sinf = max(abs(S)); 25 | lambda = kappa*n*quantile(Sinf,1-alpha); 26 | end 27 | 28 | [Z,~] = SqrtLassoIterative_WebPage([X X_ko], y, lambda, ones(2*p,1)); 29 | 30 | W = abs(Z(1:p))-abs(Z((p+1):(2*p))); 31 | 32 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/stabilitySignedMax.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = stabilitySignedMax(X, X_ko, y, weakness, nRep) 2 | % KNOCKOFFS.STATS.STABILITYSIGNEDMAX Signed maximum of stability selection W 3 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX(X, X_ko, y) 4 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX(X, X_ko, y, weakness) 5 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX(X, X_ko, y, weakness, nRep) 6 | % 7 | % Computes the statistic 8 | % 9 | % W_j = max(Z_j, \tilde Z_j) * sgn(Z_j - \tilde Z_j), 10 | % 11 | % where Z_j and \tilde Z_j are the stability selection probabilities 12 | % of the jth variable and its knockoff, respectively, resulting 13 | % from repeated randomized lasso.
14 | % 15 | % See also KNOCKOFFS.STATS.STABILITYSIGNEDMAX_BIN 16 | 17 | if ~exist('weakness', 'var'), weakness = 0.2; end 18 | if ~exist('nRep', 'var'), nRep = 100; end 19 | tol = 1e-6; 20 | 21 | [n,p] = size(X); 22 | 23 | options = glmnetSet(); 24 | options.standardize = true; 25 | options.intr = true; 26 | options.standardize_resp = false; 27 | options.alpha = 1; % lasso regression 28 | 29 | nsub = floor(n/2); 30 | 31 | fit = glmnet(bsxfun(@times,[X(1:nsub,:) X_ko(1:nsub,:)],rand(1,2*p)*(1-weakness)+weakness), y(1:nsub), 'gaussian', options); 32 | options.lambda = fit.lambda; 33 | probs = zeros(2*p,length(options.lambda)); 34 | 35 | for i = 1:nRep 36 | indsub = randperm(n,nsub); 37 | Xsub = bsxfun(@times,[X(indsub,:) X_ko(indsub,:)],rand(1,2*p)*(1-weakness)+weakness); 38 | fit = glmnet(Xsub,y(indsub),'gaussian',options); 39 | probs = probs + (abs(fit.beta)>tol)/nRep; 40 | end 41 | Z = max(probs,[],2); 42 | orig = 1:p; ko = (p+1):(2*p); 43 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 44 | 45 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+stats/stabilitySignedMax_bin.m: -------------------------------------------------------------------------------- 1 | function [W,Z] = stabilitySignedMax_bin(X, X_ko, y, weakness, nRep) 2 | % KNOCKOFFS.STATS.STABILITYSIGNEDMAX_BIN Signed maximum of stability 3 | % selection W (binomial response) 4 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX_BIN(X, X_ko, y) 5 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX_BIN(X, X_ko, y, weakness) 6 | % [W, Z] = KNOCKOFFS.STATS.STABILITYSIGNEDMAX_BIN(X, X_ko, y, weakness, nRep) 7 | % 8 | % Computes the statistic 9 | % 10 | % W_j = max(Z_j, \tilde Z_j) * sgn(Z_j - \tilde Z_j), 11 | % 12 | % where Z_j and \tilde Z_j are the stability selection probabilities 13 | % of the jth variable and its knockoff, respectively, resulting 14 | % from repeated randomized logistic regression with L1 regularization.
15 | % 16 | % See also KNOCKOFFS.STATS.STABILITYSIGNEDMAX 17 | 18 | if ~exist('weakness', 'var'), weakness = 0.2; end 19 | if ~exist('nRep', 'var'), nRep = 100; end 20 | tol = 1e-6; 21 | 22 | [n,p] = size(X); 23 | 24 | options = glmnetSet(); 25 | options.standardize = true; 26 | options.intr = true; 27 | options.standardize_resp = false; 28 | options.alpha = 1; % lasso regression 29 | 30 | nsub = floor(n/2); 31 | 32 | fit = glmnet(bsxfun(@times,[X(1:nsub,:) X_ko(1:nsub,:)],rand(1,2*p)*(1-weakness)+weakness), y(1:nsub), 'binomial', options); 33 | options.lambda = fit.lambda; 34 | probs = zeros(2*p,length(options.lambda)); 35 | 36 | for i = 1:nRep 37 | indsub = randperm(n,nsub); 38 | Xsub = bsxfun(@times,[X(indsub,:) X_ko(indsub,:)],rand(1,2*p)*(1-weakness)+weakness); 39 | fit = glmnet(Xsub,y(indsub),'binomial',options); 40 | probs = probs + (abs(fit.beta)>tol)/nRep; 41 | end 42 | Z = max(probs,[],2); 43 | orig = 1:p; ko = (p+1):(2*p); 44 | W = max(Z(orig), Z(ko)) .* sign(Z(orig) - Z(ko)); 45 | 46 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/CreateASDPTest.m: -------------------------------------------------------------------------------- 1 | classdef CreateASDPTest < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | function testCovariances(self) 5 | X = knockoffs.private.normc(randn(20,10)); 6 | X_ko = knockoffs.create.fixed_SDP(X, false, true); 7 | self.verifyCovariances(X, X_ko); 8 | end 9 | 10 | function testRandomizedCovariances(self) 11 | X = knockoffs.private.normc(randn(20,10)); 12 | X_ko = knockoffs.create.fixed_SDP(X, true, true); 13 | self.verifyCovariances(X, X_ko); 14 | end 15 | 16 | function testDimensionCheck(self) 17 | X = knockoffs.private.normc(randn(10,10)); 18 | self.verifyError(@() knockoffs.create.fixed_SDP(X,false,true), ... 
19 | 'knockoff:DimensionError') 20 | end 21 | 22 | function testDiagonalSigmaCorrectness(self) 23 | diag_Sigma = 0.1:0.1:10; 24 | Sigma = diag(diag_Sigma); 25 | s_asdp = knockoffs.create.solveASDP(Sigma); 26 | s_asdp = reshape(s_asdp, size(diag_Sigma)); 27 | self.verifyAlmostEqual(diag_Sigma, s_asdp); 28 | end 29 | end 30 | 31 | methods 32 | function verifyCovariances(self, X, X_ko) 33 | G = X'*X; 34 | self.verifyAlmostEqual(X_ko'*X_ko, G); 35 | self.verifyAlmostEqual(offdiag(X'*X_ko), offdiag(G)) 36 | self.verifyLessThan(diag(X'*X_ko), 1 + 1e-5); 37 | end 38 | end 39 | 40 | end 41 | 42 | function B = offdiag(A) 43 | B = A - diag(diag(A)); 44 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/CreateEquiTest.m: -------------------------------------------------------------------------------- 1 | classdef CreateEquiTest < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | function testCovariances(self) 5 | X = knockoffs.private.normc(randn(20,10)); 6 | X_ko = knockoffs.create.fixed_equi(X, false); 7 | self.verifyCovariances(X, X_ko); 8 | end 9 | 10 | function testRandomizedCovariances(self) 11 | X = knockoffs.private.normc(randn(20,10)); 12 | X_ko = knockoffs.create.fixed_equi(X, true); 13 | self.verifyCovariances(X, X_ko); 14 | end 15 | 16 | function testDimensionCheck(self) 17 | X = knockoffs.private.normc(randn(10,10)); 18 | self.verifyError(... 19 | @() knockoffs.create.fixed_equi(X), ... 
20 | 'knockoff:DimensionError') 21 | end 22 | 23 | function testPermutationInvariance(self) 24 | X = knockoffs.private.normc(randn(20,10)); 25 | I = randperm(10); 26 | X_ko = knockoffs.create.fixed_equi(X, false); 27 | X_perm_ko = knockoffs.create.fixed_equi(X(:,I), false); 28 | self.verifyAlmostEqual(X_ko(:,I), X_perm_ko) 29 | end 30 | end 31 | 32 | methods 33 | function verifyCovariances(self, X, X_ko) 34 | G = X'*X; 35 | s = min(2*min(eig(G)), 1); 36 | s = repmat(s, [1, size(X,2)]); 37 | self.verifyAlmostEqual(X_ko'*X_ko, G); 38 | self.verifyAlmostEqual(X'*X_ko, G - diag(s)); 39 | end 40 | end 41 | 42 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/CreateGaussian.m: -------------------------------------------------------------------------------- 1 | classdef CreateGaussian < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | function testEquiIdentity(self) 5 | p = 100; 6 | sigma = 0.4; 7 | Sigma = sigma * eye(p); 8 | diag_s = knockoffs.create.solveEqui(Sigma); 9 | diag_s_expected = repmat(sigma, p, 1); 10 | self.verifyAlmostEqual(diag_s, diag_s_expected); 11 | end 12 | function testEquiToeplitz(self) 13 | p = 100; 14 | rho = 0.5; 15 | Sigma = toeplitz(rho.^(0:(p-1))); 16 | diag_s = knockoffs.create.solveEqui(Sigma); 17 | diag_s_expected = repmat(min(1,2*eigs(Sigma,1,'SM')), p, 1); 18 | self.verifyAlmostEqual(diag_s, diag_s_expected); 19 | end 20 | 21 | function testSDP(self) 22 | diag_Sigma = 0.1:0.1:4; 23 | Sigma = diag(diag_Sigma(randperm(length(diag_Sigma)))); 24 | diag_s = sparse(diag(knockoffs.create.solveSDP(Sigma))); 25 | diag_s_expected = sparse(Sigma); 26 | self.verifyAlmostEqual(diag_s, diag_s_expected); 27 | end 28 | 29 | function testEquiCov(self) 30 | p = 5; 31 | rho = 0.5; 32 | Sigma = toeplitz(rho.^(0:(p-1))); 33 | diag_s = sparse(diag(knockoffs.create.solveEqui(Sigma))); 34 | diag_s_expected = sparse(diag(repmat(min(1,2*eigs(Sigma,1,'SM')), p, 1))); 35 | n 
= 10000000; 36 | mu = randn(1,p); 37 | X = mvnrnd(mu, Sigma, n); 38 | X_k = knockoffs.create.gaussian_sample(X, mu, Sigma, diag_s); 39 | G = [Sigma, Sigma-diag_s_expected; Sigma-diag_s_expected, Sigma]; 40 | Delta = abs(G - cov([X,X_k])); 41 | self.verifyLessThan(max(Delta(:)),1e-2); 42 | end 43 | 44 | function testSDPCovSmall(self) 45 | p = 5; 46 | rho = 0.2; 47 | Sigma = toeplitz(rho.^(0:(p-1))); 48 | diag_s = sparse(diag(knockoffs.create.solveSDP(Sigma))); 49 | diag_s_expected = sparse(eye(p)); 50 | n = 10000000; 51 | mu = randn(1,p); 52 | X = mvnrnd(mu, Sigma, n); 53 | X_k = knockoffs.create.gaussian_sample(X, mu, Sigma, diag_s); 54 | G = [Sigma, Sigma-diag_s_expected; Sigma-diag_s_expected, Sigma]; 55 | Delta = abs(G - cov([X,X_k])); 56 | self.verifyLessThan(max(Delta(:)),1e-2); 57 | end 58 | 59 | function testSDPCovLarge(self) 60 | p = 10; 61 | Sigma = corr(randn(2*p,p)); 62 | diag_s = sparse(diag(knockoffs.create.solveSDP(Sigma))); 63 | n = 1000000; 64 | mu = randn(1,p); 65 | X = mvnrnd(mu, Sigma, n); 66 | X_k = knockoffs.create.gaussian_sample(X, mu, Sigma, diag_s); 67 | iden = diag(ones(2*p-abs(p),1),p) + diag(ones(2*p-abs(p),1),-p); 68 | G = [Sigma, Sigma; Sigma, Sigma]; 69 | G(iden~=0) = 0; 70 | Ghat = cov([X,X_k]); 71 | Ghat(iden~=0) = 0; 72 | Delta = abs(G - Ghat); 73 | self.verifyLessThan(max(Delta(:)),1e-2); 74 | end 75 | end 76 | 77 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/CreateSDPTest.m: -------------------------------------------------------------------------------- 1 | classdef CreateSDPTest < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | function testCovariances(self) 5 | X = knockoffs.private.normc(randn(20,10)); 6 | X_ko = knockoffs.create.fixed_SDP(X, false); 7 | self.verifyCovariances(X, X_ko); 8 | end 9 | 10 | function testRandomizedCovariances(self) 11 | X = knockoffs.private.normc(randn(20,10)); 12 | X_ko = knockoffs.create.fixed_SDP(X, 
true); 13 | self.verifyCovariances(X, X_ko); 14 | end 15 | 16 | function testDimensionCheck(self) 17 | X = knockoffs.private.normc(randn(10,10)); 18 | self.verifyError(@() knockoffs.create.fixed_SDP(X), ... 19 | 'knockoff:DimensionError') 20 | end 21 | 22 | function testPermutationInvariance(self) 23 | X = knockoffs.private.normc(randn(20,10)); 24 | I = randperm(10); 25 | X_ko = knockoffs.create.fixed_SDP(X, false); 26 | X_perm_ko = knockoffs.create.fixed_SDP(X(:,I), false); 27 | self.verifyEqual(X_ko(:,I), X_perm_ko, 'AbsTol', 1e-4); 28 | end 29 | 30 | function testDiagonalSigmaCorrectness(self) 31 | diag_Sigma = 0.1:0.1:10; 32 | Sigma = diag(diag_Sigma); 33 | s_sdp = knockoffs.create.solveSDP(Sigma); 34 | s_sdp = reshape(s_sdp, size(diag_Sigma)); 35 | self.verifyAlmostEqual(diag_Sigma, s_sdp); 36 | end 37 | end 38 | 39 | methods 40 | function verifyCovariances(self, X, X_ko) 41 | G = X'*X; 42 | self.verifyAlmostEqual(X_ko'*X_ko, G); 43 | self.verifyAlmostEqual(offdiag(X'*X_ko), offdiag(G)) 44 | self.verifyLessThan(diag(X'*X_ko), 1 + 1e-5); 45 | end 46 | end 47 | 48 | end 49 | 50 | function B = offdiag(A) 51 | B = A - diag(diag(A)); 52 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/FSBenchmark.m: -------------------------------------------------------------------------------- 1 | function FSBenchmark 2 | 3 | n = 800; p = 400; 4 | X = knockoffs.private.normc(randn(n, p)); 5 | y = X * randn(p,1) + 0.1 .* randn(n,1); 6 | 7 | D = table(); 8 | D.call = {'FS_Naive'; 'FS_Opt'; 'FS_OMP_Naive'; 'FS_OMP_Opt'}; 9 | D.time = [ 10 | timeit(@() knockoffs.stats.private.forwardSelectionSlow(X,y)); 11 | timeit(@() knockoffs.stats.private.forwardSelection(X,y)); 12 | timeit(@() knockoffs.stats.private.forwardSelectionSlowOMP(X,y)); 13 | timeit(@() knockoffs.stats.private.forwardSelectionOMP(X,y)); 14 | ]; 15 | D.Properties.VariableUnits = {'' 's'}; 16 | display(D); 17 | 18 | end 19 | 20 | 
-------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/FilterTest.m: -------------------------------------------------------------------------------- 1 | classdef FilterTest < knockoffs.tests.KnockoffTestCase 2 | % Test the main entry point for this package. 3 | 4 | methods (Test) 5 | % Test whether the filter is invariant under permutations of 6 | % the columns of the design matrix. 7 | function testPermutationInvarianceFixed(self) 8 | n = 100; p = 50; k = 5; q = 0.20; 9 | X = randn(n, p); 10 | beta = zeros(p, 1); 11 | beta(randsample(p,k)) = 3.5; 12 | y = X*beta + randn(n,1); 13 | 14 | X = array2table(X); 15 | I = randperm(p); 16 | S = knockoffs.filter(X, y, q, {'fixed'}); 17 | S_perm = knockoffs.filter(X(:,I), y, q, {'fixed'}); 18 | self.verifyEqual(sort(S), sort(S_perm)); 19 | end 20 | 21 | function testPermutationInvarianceGaussian(self) 22 | n = 100; p = 50; k = 5; q = 0.20; rho = 0.5; 23 | Sigma = toeplitz(rho.^(0:(p-1))); 24 | mu = randn(1,p); 25 | X = mvnrnd(mu, Sigma, n); 26 | beta = zeros(p, 1); 27 | beta(randsample(p,k)) = 3.5; 28 | y = X*beta + randn(n,1); 29 | 30 | stats = @knockoffs.stats.lassoLambdaSignedMax; 31 | X = array2table(X); 32 | I = randperm(p); 33 | seed = randi(100000); 34 | rng(seed); 35 | S = knockoffs.filter(X,y, q, {'gaussian',mu,Sigma},'Method','equi', 'Statistics',stats); 36 | rng(seed); 37 | S_perm = knockoffs.filter(X(:,I), y, q, {'gaussian',mu,Sigma},'Method','equi', 'Statistics',stats); 38 | self.verifyEqual(sort(S), sort(S_perm)); 39 | end 40 | 41 | end 42 | 43 | end 44 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/ForwardSelectionTest.m: -------------------------------------------------------------------------------- 1 | classdef ForwardSelectionTest < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | function testSequentialFS(self) 5 | X = [ 1 2 3 4 5 ]; 6 | y = 10; 7 
| added = knockoffs.stats.private.sequentialfs(... 8 | @(~,x,t) abs(x-t), @(~,~,t) t-1, X, y); 9 | self.verifyEqual(added, [ 5 4 3 2 1]); 10 | 11 | [added, history] = knockoffs.stats.private.sequentialfs(... 12 | @(~,x,t) (x-t)^2, @(~,~,t) t-1, X, y); 13 | self.verifyEqual(added, [ 5 4 3 2 1]); 14 | self.verifyEqual(history.Crit, [ 25 25 25 25 25 ]); 15 | self.verifyEqual(history.Target, [ 10 9 8 7 6 ]'); 16 | self.verifyEqual(history.In, logical(... 17 | [ 0 0 0 0 1; 18 | 0 0 0 1 1; 19 | 0 0 1 1 1; 20 | 0 1 1 1 1; 21 | 1 1 1 1 1; ])); 22 | end 23 | 24 | function testForwardSelectionSlow(self) 25 | X = [ 1 0 0; 26 | 1 0 0; 27 | 1 0 0; 28 | 0 1 0; 29 | 0 0 1; 30 | 0 0 1; ]; 31 | y = [ 1 1 1 1 1 1]'; 32 | added = knockoffs.stats.private.forwardSelectionSlow(X, y); 33 | self.verifyEqual(added, [ 1 3 2 ]); 34 | end 35 | 36 | function testForwardSelectionSlowOMP(self) 37 | X = [ 1 0 0; 38 | 1 0 0; 39 | 1 0 0; 40 | 0 1 0; 41 | 0 0 1; 42 | 0 0 1; ]; 43 | y = [ 1 1 1 1 1 1]'; 44 | added = knockoffs.stats.private.forwardSelectionSlowOMP(X, y); 45 | self.verifyEqual(added, [ 1 3 2 ]); 46 | end 47 | 48 | function testForwardSelectionFast(self) 49 | % Check the optimized implementation by comparing it to the 50 | % slow reference implementation. 51 | n = 200; p = 100; 52 | X = knockoffs.private.normc(randn(n, p)); 53 | y = X * randn(p,1) + 0.1 .* randn(n,1); 54 | self.verifyResiduals(... 55 | @knockoffs.stats.private.forwardSelection, ... 56 | @knockoffs.stats.private.forwardSelectionSlow, ... 57 | X, y); 58 | end 59 | 60 | function testForwardSelectionFastOmp(self) 61 | % Check the optimized implementation by comparing it to the 62 | % slow reference implementation. 63 | n = 200; p = 100; 64 | X = knockoffs.private.normc(randn(n, p)); 65 | y = X * randn(p,1) + 0.1 .* randn(n,1); 66 | self.verifyResiduals(... 67 | @knockoffs.stats.private.forwardSelectionOMP, ... 68 | @knockoffs.stats.private.forwardSelectionSlowOMP, ... 
69 | X, y); 70 | end 71 | end 72 | 73 | methods 74 | function verifyResiduals(self, actual_fn, expected_fn, varargin) 75 | [~,historyActual] = actual_fn(varargin{:}); 76 | [~,historyExpected] = expected_fn(varargin{:}); 77 | self.verifyAlmostEqual(historyActual.Target', ... 78 | historyExpected.Target'); 79 | end 80 | end 81 | 82 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/KnockoffTestCase.m: -------------------------------------------------------------------------------- 1 | classdef KnockoffTestCase < matlab.unittest.TestCase 2 | 3 | methods 4 | function verifyAlmostEqual(self, actual, expected, rtol) 5 | if ~exist('rtol', 'var') 6 | rtol = 1e-5; 7 | end 8 | self.verifyEqual(actual, expected, 'RelTol', rtol) 9 | end 10 | end 11 | 12 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/+tests/LassoTest.m: -------------------------------------------------------------------------------- 1 | classdef LassoTest < knockoffs.tests.KnockoffTestCase 2 | 3 | methods (Test) 4 | % Test the special case of an orthonormal design (X'X = I), 5 | % in which case the lasso minimization problem 6 | % 7 | % (1/2) ||y - X*beta||_2^2 + lambda * ||beta||_1 8 | % 9 | % has the closed form solution 10 | % 11 | % beta = sgn(beta_LS) max(0, abs(beta_LS) - lambda)), 12 | % 13 | % where beta_LS is the ordinary least-squares solution. 14 | function testOrthonormal(self) 15 | n = 10; p = 5; sigma = 1e-2; 16 | X = randn(n, p); 17 | X = orth(bsxfun(@minus,X,mean(X,1))); 18 | beta = randn(p,1); 19 | y = X*beta + sigma .* randn(n,1); 20 | 21 | betaLS = X'*y; 22 | nlambda = 10000; 23 | lambdaMax = knockoffs.stats.private.lassoMaxLambda(X, y, nlambda); 24 | self.verifyAlmostEqual(lambdaMax, abs(betaLS'), 1e-3); 25 | end 26 | 27 | % Test that lassoMaxLambda is invariant under permutation. 
28 | function testPermutationInvariance(self) 29 | n = 100; p = 30; k = 5; sigma = 1e-2; 30 | X = knockoffs.private.normc(randn(n, p)); 31 | beta = zeros(p,1); 32 | beta(randsample(p, k)) = 1; 33 | y = X*beta + sigma .* randn(n,1); 34 | 35 | I = randperm(p); 36 | nlambda = 10000; 37 | lambdaMax = knockoffs.stats.private.lassoMaxLambda(X, y, nlambda); 38 | lambdaMaxPerm = knockoffs.stats.private.lassoMaxLambda(X(:,I), y, nlambda); 39 | self.verifyAlmostEqual(lambdaMax(I), lambdaMaxPerm, 1e-2); 40 | 41 | [~,path] = sort(lambdaMax(I), 'descend'); 42 | [~,pathPerm] = sort(lambdaMaxPerm, 'descend'); 43 | self.verifyEqual(path, pathPerm); 44 | end 45 | end 46 | 47 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/Contents.m: -------------------------------------------------------------------------------- 1 | % KNOCKOFF The Model-Free Knockoff Filter for controlling the false discovery rate (FDR). 2 | % 3 | % The knockoffs framework constructs artificial 'knockoffs' for the 4 | % variables in a statistical model and then selects only variables that 5 | % are clearly better than their fake copies. Our model-free approach makes 6 | % knockoffs possible for data from any model, no matter how high-dimensional. 7 | % 8 | % Reference: http://statweb.stanford.edu/~candes/MF_Knockoffs/ 9 | % 10 | % Files 11 | % create - Create Model-Free knockoffs given the model parameters and a data matrix 12 | % filter - Run the knockoff filter on a data set.
13 | % select - Select variables based on the knockoff statistics 14 | % threshold - Compute the threshold for variable selection 15 | % 16 | % For more information, try typing: 17 | % - help knockoffs.create 18 | % - help knockoffs.stats 19 | % 20 | % For usage examples, see the scripts in the 'examples' directory -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/create.m: -------------------------------------------------------------------------------- 1 | function X_k = create(X, model, varargin) 2 | % KNOCKOFFS.CREATE Create knockoffs given the model parameters and a 3 | % matrix of observations for the original variables. 4 | % 5 | % X_k = KNOCKOFFS.CREATE(X, model, {model parameters}, ...) 6 | % 7 | % By default, creates Gaussian SDP-correlated knockoffs. 8 | % 9 | % Inputs: 10 | % X - n x p (scaled) data matrix 11 | % Requires (n >= 2*p) if model='fixed' 12 | % model - 'gaussian' for Model-X Gaussian variables or 13 | % 'fixed' for Fixed-X variables 14 | % Default: 'gaussian' 15 | % 16 | % Required model-specific inputs: 17 | % Mu - p x 1 mean vector. Required if model='gaussian' 18 | % Sigma - p x p covariance matrix. Required if model='gaussian' 19 | % 20 | % Optional Inputs: 21 | % 'Method' - Method to use for creating knockoffs. 22 | % 23 | % Optional model-specific inputs: 24 | % 'Randomize' - whether to use randomization in the construction of 25 | % the knockoff variables (if model='fixed') 26 | % 27 | % Outputs: 28 | % X_k - n x p knockoff variable matrix 29 | % 30 | % See also KNOCKOFFS.CREATE.GAUSSIAN, KNOCKOFFS.CREATE.FIXED, KNOCKOFFS.SELECT.
31 | 32 | if ~exist('model', 'var') || isempty(model), model = 'gaussian'; end; 33 | 34 | parser = inputParser; 35 | parser.CaseSensitive = false; 36 | if (~verLessThan('matlab', '8.2')) % R2013b or later 37 | parser.PartialMatching = false; 38 | end 39 | istable_safe = @(x) ~verLessThan('matlab', '8.2') && istable(x); 40 | 41 | parser.addRequired('X', @(x) isnumeric(x) || istable_safe(x)); 42 | parser.addRequired('model', @isstr); 43 | 44 | model = lower(model); 45 | switch model 46 | case 'gaussian' 47 | parser.addRequired('Mu', @isnumeric); 48 | parser.addRequired('Sigma', @(x) isnumeric(x) && all(eig(x) > 0)); 49 | parser.addOptional('Method', 'sdp'); 50 | parser.parse(X, model, varargin{:}); 51 | sampleK = @knockoffs.create.gaussian; 52 | X_k = sampleK(X,parser.Results.Mu, parser.Results.Sigma, parser.Results.Method); 53 | case 'fixed' 54 | parser.addOptional('Method', 'sdp'); 55 | parser.addOptional('Randomize', []); 56 | parser.parse(X, model, varargin{:}); 57 | sampleK = @knockoffs.create.fixed; 58 | X_k = sampleK(X, parser.Results.Method, parser.Results.Randomize); 59 | otherwise 60 | error('Invalid variables model %s', model) 61 | end 62 | 63 | end 64 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/select.m: -------------------------------------------------------------------------------- 1 | function S = select(W, q, method) 2 | % KNOCKOFFS.SELECT Select variables based on the knockoff statistics 3 | % 4 | % S = KNOCKOFFS.SELECT(W, q) select using the default 'knockoff+' method 5 | % S = KNOCKOFFS.SELECT(W, q, method) select with given method 6 | % 7 | % Inputs: 8 | % W - statistics W_j for testing null hypothesis beta_j = 0. 9 | % q - target FDR 10 | % method - either 'knockoff' or 'knockoff+' 11 | % Default: 'knockoff+' 12 | % 13 | % Outputs: 14 | % S - array of selected variable indices 15 | % 16 | % See also KNOCKOFFS.THRESHOLD.
17 | 18 | if ~exist('method', 'var') 19 | method = []; 20 | end 21 | 22 | W = reshape(W, 1, []); 23 | T = knockoffs.threshold(W, q, method); 24 | S = find(W >= T); 25 | 26 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/+knockoffs/threshold.m: -------------------------------------------------------------------------------- 1 | function T = threshold(W, q, method) 2 | % KNOCKOFFS.THRESHOLD Compute the threshold for variable selection 3 | % T = KNOCKOFFS.THRESHOLD(W, q) threshold using the default 'knockoff+' method 4 | % T = KNOCKOFFS.THRESHOLD(W, q, method) threshold with given method 5 | % 6 | % Inputs: 7 | % W - statistics W_j for testing null hypothesis beta_j = 0. 8 | % q - target FDR 9 | % method - either 'knockoff' or 'knockoff+' 10 | % Default: 'knockoff+' 11 | % 12 | % Outputs: 13 | % T - threshold for variable selection 14 | % 15 | % See also KNOCKOFFS.SELECT. 16 | 17 | if ~exist('method', 'var') || isempty(method) 18 | method = 'knockoff+'; 19 | end 20 | 21 | switch lower(method) 22 | case 'knockoff' 23 | offset = 0; 24 | case 'knockoff+' 25 | offset = 1; 26 | otherwise 27 | error('Invalid threshold method %s', method) 28 | end 29 | 30 | W = reshape(W, 1, []); 31 | t = sort([0 abs(W(W~=0))]); 32 | ratio = zeros(1, length(t)); 33 | for i = 1:length(t) 34 | ratio(i) = (offset + sum(W <= -t(i))) / max(1, sum(W >= t(i))); 35 | end 36 | 37 | index = find(ratio <= q, 1, 'first'); 38 | if isempty(index) 39 | T = Inf; 40 | else 41 | T = t(index); 42 | end 43 | 44 | end -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/README.txt: -------------------------------------------------------------------------------- 1 | The Knockoff Filter for MATLAB 2 | ==================================== 3 | 4 | Version: 0.9.1 5 | Date: 10/14/2017 6 | Authors: Matteo Sesia, Lucas Janson, Emmanuel Candès, Yingying Fan, 7 | Jinchi Lv.
8 | License: GPL-3 9 | Contact: The code is updated and maintained by Matteo Sesia, 10 | msesia(AT)stanford.edu (1 July 2017) 11 | 12 | This package is an implementation of the knockoff filter in MATLAB. 13 | https://web.stanford.edu/group/candes/knockoffs/index.html 14 | -------------------------------------------------------------------------------- /MATLAB/knockoffs_matlab/examples/examples_advanced.m: -------------------------------------------------------------------------------- 1 | % This demo illustrates some advanced usage of the Knockoffs package 2 | % on a synthetic data set. 3 | 4 | %% Synthetic problem parameters 5 | 6 | n = 1000; % Number of data points 7 | p = 1000; % Number of variables 8 | k = 60; % Number of variables with nonzero coefficients 9 | amplitude = 15; % Magnitude of nonzero coefficients 10 | sigma = 1; % Noise level 11 | q = 0.10; % Target false discovery rate (FDR) 12 | 13 | rng(123); % Random seed 14 | 15 | %% Synthetic problem construction 16 | % We generate the data by sampling the rows of X from a multivariate normal 17 | % distribution with mean zero and identity covariance matrix. 18 | % Conditional on X, the response y is drawn from a logistic regression model 19 | % with k non-zero coefficients 20 | 21 | Sigma = eye(p); 22 | mu = zeros(1,p); 23 | 24 | S0 = randsample(p,k); 25 | beta = zeros(p,1); 26 | beta(S0) = amplitude/sqrt(n); 27 | invlogit = @(x) exp(x)./(1+exp(x)); 28 | sampleY = @(X) binornd(1,invlogit(X*beta)); 29 | 30 | trueDiscoveries = @(S) sum(beta(S) > 0); 31 | power = @(S) trueDiscoveries(S)*100/k; 32 | FDP = @(S) sum(beta(S) == 0) / max(1, length(S)); 33 | FDPp = @(S) sum(beta(S) == 0) / (1/q + length(S)); 34 | printSummary = @(S) fprintf(... 35 | ['%d true discoveries (Power = %2.2f%%)\n' ... 36 | 'FDP = %2.2f%% (target FDR = %2.f%%)\n'], ... 
37 | trueDiscoveries(S), power(S), 100*FDP(S), 100*q); 38 | 39 | %% Running the knockoff filter steps manually 40 | % The main function 'knockoffs.filter' is a wrapper around simpler functions 41 | % that create knockoffs, compute test statistics, and perform variable 42 | % selection. When more control is necessary, these functions may be 43 | % called directly. We demonstrate this below in reproducing the plot of 44 | % Figure 1. 45 | 46 | X = mvnrnd(mu, Sigma, n); 47 | y = sampleY(X); 48 | Xmodel = {'gaussian', mu, Sigma}; 49 | X_k = knockoffs.create(X, Xmodel{:}); 50 | [W,Z] = knockoffs.stats.lassoLambdaSignedMax_bin(X, X_k, y); 51 | t = knockoffs.threshold(W, q); 52 | 53 | fig = figure(); 54 | hold on 55 | set(fig, 'DefaultTextInterpreter', 'latex'); 56 | gscatter(Z(1:p), Z(p+1:2*p), ismember(1:p, S0), 'kr'); 57 | plot([t t 0], [0 t t], 'k'); 58 | hold off 59 | 60 | xlabel('Value of $\lambda$ when $X_j$ enters model'); 61 | ylabel('Value of $\lambda$ when $\tilde X_j$ enters model'); 62 | limits = [0 ceil(max(Z))]; 63 | xlim(limits); ylim(limits); 64 | title('Knockoff Filter with Lasso Statistics'); 65 | legend('Null feature', 'Non-null feature'); 66 | line = refline(1,0); 67 | set(line, 'LineStyle', ':', 'Color', 'black'); 68 | 69 | 70 | %% Running the knockoff filter manually multiple times 71 | % The function 'knockoffs' is also a wrapper around simpler 72 | % functions that create knockoffs for a specific model. 73 | % When knockoffs for multiple identically distributed realizations of the 74 | % data matrix X are needed, one can save computation time by precomputing 75 | % the diagonal entries of 'diag_s' (e.g. by solving the SDP, for SDP 76 | % knockoffs). We demonstrate this below. 
77 | 78 | diag_s = sparse(diag(knockoffs.create.solveSDP(Sigma))); 79 | m=10; 80 | [fdp, fdpp, pwr] = deal(nan(m,1)); 81 | 82 | for i = 1:m 83 | X = mvnrnd(mu, Sigma, n); 84 | y = sampleY(X); 85 | X_k = knockoffs.create.gaussian_sample(X, mu, Sigma, diag_s); 86 | W = knockoffs.stats.lassoCoefDiff_bin(X, X_k, y); 87 | S = knockoffs.select(W, q, 'knockoff'); 88 | fdp(i) = FDP(S); fdpp(i) = FDPp(S); pwr(i) = trueDiscoveries(S)/k; 89 | end 90 | fprintf('Mean FDP: %2.2f, Mean FDPp: %2.2f, Mean Power: %2.2f%%\n', mean([fdp fdpp pwr*100])) 91 | boxplot([fdp, fdpp, pwr], 'Labels',{'Fdp', 'Fdpp', 'Power'}) 92 | -------------------------------------------------------------------------------- /R/README.md: -------------------------------------------------------------------------------- 1 | The Knockoff Filter for R 2 | ========================== 3 | 4 | This package provides a versatile R interface to the knockoff methodology. 5 | 6 | # Installation 7 | 8 | ## Stable version 9 | 10 | The stable version of this package is hosted on [CRAN](https://cran.r-project.org/package=knockoff). 11 | 12 | To install this package from CRAN, run the following command in your R console: 13 | ```r 14 | install.packages("knockoff") 15 | ``` 16 | 17 | ## Development version 18 | 19 | You can install the latest development version by cloning this repository and building the package from source. Alternatively, you can install it directly from your R console using the [devtools](https://CRAN.R-project.org/package=devtools) package. 20 | 21 | To install this package with devtools, run the following command in your R console: 22 | 23 | ```r 24 | devtools::install_github("msesia/knockoff-filter/R/knockoff") 25 | ``` 26 | 27 | If you also want to install the vignettes along with the package, type instead: 28 | 29 | ```r 30 | devtools::install_github("msesia/knockoff-filter/R/knockoff", build_vignettes = TRUE) 31 | ``` 32 | 33 | Note that building the vignettes may require additional R packages.
34 | 35 | ## Resources 36 | For more information, visit: https://web.stanford.edu/group/candes/knockoffs 37 | 38 | ## News 39 | 40 | To read about the latest changes, visit the [NEWS page](knockoff/NEWS). 41 | 42 | ## Credits 43 | 44 | An earlier version of this package was developed by Evan Patterson: https://bitbucket.org/epatters/knockoff-filter. 45 | 46 | ## License 47 | 48 | This software is distributed under the [GPLv3 license](https://www.gnu.org/licenses/gpl-3.0.en.html) and it comes with ABSOLUTELY NO WARRANTY. -------------------------------------------------------------------------------- /R/knockoff/.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^knockoff\.Rcheck$ 4 | ^knockoff.*\.tar\.gz$ 5 | ^knockoff.*\.tgz$ 6 | -------------------------------------------------------------------------------- /R/knockoff/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: knockoff 2 | Type: Package 3 | Title: The Knockoff Filter for Controlled Variable Selection 4 | Version: 0.3.5 5 | Date: 2022-01-31 6 | Authors@R: c(person("Rina", "Foygel Barber", role = c("ctb"), 7 | comment="Development of the original Fixed-X Knockoffs"), 8 | person("Emmanuel", "Candes", role = c("ctb"), 9 | comment="Development of Model-X Knockoffs and original Fixed-X Knockoffs"), 10 | person("Lucas", "Janson", role = c("ctb"), 11 | comment="Development of Model-X Knockoffs"), 12 | person("Evan", "Patterson", role = c("aut"), 13 | comment="Earlier R package for the original Fixed-X Knockoffs"), 14 | person("Matteo", "Sesia", role = c("aut","cre"), 15 | comment="R package for Model-X Knockoffs", email="sesia@marshall.usc.edu")) 16 | Description: The knockoff filter is a general procedure for controlling the false discovery rate (FDR) 17 | when performing variable selection. 
18 | For more information, see the website below and the accompanying paper: Candes et al., 19 | "Panning for gold: model-X knockoffs for high-dimensional controlled variable selection", 20 | J. R. Statist. Soc. B (2018) 80, 3, pp. 551-577. 21 | License: GPL-3 22 | URL: https://web.stanford.edu/group/candes/knockoffs/index.html 23 | Depends: methods, stats 24 | Imports: Rdsdp, Matrix, corpcor, glmnet, RSpectra, gtools, utils 25 | Suggests: knitr, testthat, rmarkdown, lars, ranger, stabs, RPtests, doParallel, parallel 26 | RoxygenNote: 7.1.2 27 | VignetteBuilder: knitr 28 | -------------------------------------------------------------------------------- /R/knockoff/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,knockoff.result) 4 | export(create.fixed) 5 | export(create.gaussian) 6 | export(create.second_order) 7 | export(create.solve_asdp) 8 | export(create.solve_equi) 9 | export(create.solve_sdp) 10 | export(knockoff.filter) 11 | export(knockoff.threshold) 12 | export(stat.forward_selection) 13 | export(stat.glmnet_coefdiff) 14 | export(stat.glmnet_lambdadiff) 15 | export(stat.glmnet_lambdasmax) 16 | export(stat.lasso_coefdiff) 17 | export(stat.lasso_coefdiff_bin) 18 | export(stat.lasso_lambdadiff) 19 | export(stat.lasso_lambdadiff_bin) 20 | export(stat.lasso_lambdasmax) 21 | export(stat.lasso_lambdasmax_bin) 22 | export(stat.random_forest) 23 | export(stat.sqrt_lasso) 24 | export(stat.stability_selection) 25 | import(methods) 26 | import(stats) 27 | -------------------------------------------------------------------------------- /R/knockoff/NEWS: -------------------------------------------------------------------------------- 1 | knockoff 0.3.5 (01/31/2022) 2 | --------------------------------------- 3 | Minor changes: 4 | * Fixed bug in stat.glmnet_coefdiff: "cox" family has no intercept 5 | * Fixed call to sample.int in 
tests/testthat/test_stats.R. 6 | 7 | knockoff 0.3.4 (03/08/2021) 8 | --------------------------------------- 9 | Minor changes: 10 | * Fixed bug in stat.glmnet_coefdiff and stat.glmnet_lambdadiff, 11 | which did not previously work with "multinomial" family 12 | 13 | knockoff 0.3.3 (06/01/2020) 14 | --------------------------------------- 15 | Minor changes: 16 | * Removed dependency on flare, which is no longer on CRAN. 17 | 18 | knockoff 0.3.2 (08/03/2018) 19 | --------------------------------------- 20 | Fixes: 21 | * Fixed bug that caused incorrect knockoff statistics in the presence 22 | of knockoff copies identical to their own original variable. 23 | 24 | Minor changes: 25 | * Changed scaling of variables for computation of importance measures. 26 | 27 | knockoff 0.3.1.1 (06/28/2018) 28 | --------------------------------------- 29 | Minor changes: 30 | * Improved algorithm for solving SDP 31 | * Improved algorithm for solving ASDP 32 | * Returning X instead of throwing error in Gaussian knockoffs, 33 | if covariance matrix is not positive-definite 34 | 35 | Documentation: 36 | * Minor improvements to package description file 37 | 38 | knockoff 0.3.0 (10/17/2017) 39 | --------------------------------------- 40 | Features: 41 | * Added support for Model-X knockoffs 42 | * Added importance statistics 43 | * Native support for SDP knockoffs (no need to call Python) 44 | 45 | Major changes: 46 | * Model-X knockoffs are used by default 47 | * Cross-validated lasso statistics are used by default 48 | * SDP knockoffs are used by default 49 | * Offset 1 is used by default 50 | 51 | Documentation: 52 | * Updated and expanded vignettes 53 | 54 | knockoff 0.2.1 55 | --------------------------------------- 56 | Documentation: 57 | * Add vignette showing how to analyze a real data set (on HIV drug resistance), 58 | including all the preprocessing steps. 
59 | 60 | knockoff 0.2 (02/04/2015) 61 | --------------------------------------- 62 | Changes: 63 | * The knockoff procedure is now fully deterministic by default. Randomization 64 | can be enabled if desired. 65 | 66 | Fixes: 67 | * Fix numerical precision bug in equicorrelated knockoff creation 68 | 69 | knockoff 0.1.1 (12/19/2014) 70 | --------------------------------------- 71 | Features: 72 | * Expose the optional 'nlambda' parameter for lasso statistics 73 | 74 | Fixes: 75 | * Better documentation for SDP knockoffs 76 | * Minor bug fixes 77 | 78 | knockoff 0.1 (12/05/2014) 79 | --------------------------------------- 80 | Initial release! -------------------------------------------------------------------------------- /R/knockoff/R/create_gaussian.R: -------------------------------------------------------------------------------- 1 | #' Model-X Gaussian knockoffs 2 | #' 3 | #' Samples multivariate Gaussian model-X knockoff variables. 4 | #' 5 | #' @param X n-by-p matrix of original variables. 6 | #' @param mu vector of length p, indicating the mean parameter of the Gaussian model for \eqn{X}. 7 | #' @param Sigma p-by-p covariance matrix for the Gaussian model of \eqn{X}. 8 | #' @param method either "equi", "sdp" or "asdp" (default: "asdp"). 9 | #' This determines the method that will be used to minimize the correlation between the original variables and the knockoffs. 10 | #' @param diag_s vector of length p, containing the pre-computed covariances between the original 11 | #' variables and the knockoffs. This will be computed according to \code{method}, if not supplied. 12 | #' @return A n-by-p matrix of knockoff variables. 13 | #' 14 | #' @family create 15 | #' 16 | #' @references 17 | #' Candes et al., Panning for Gold: Model-free Knockoffs for High-dimensional Controlled Variable Selection, 18 | #' arXiv:1610.02351 (2016). 
19 | #' \href{https://web.stanford.edu/group/candes/knockoffs/index.html}{https://web.stanford.edu/group/candes/knockoffs/index.html} 20 | #' 21 | #' @examples 22 | #' set.seed(2022) 23 | #' p=200; n=100; k=15 24 | #' rho = 0.4 25 | #' mu = rep(0,p); Sigma = toeplitz(rho^(0:(p-1))) 26 | #' X = matrix(rnorm(n*p),n) %*% chol(Sigma) 27 | #' nonzero = sample(p, k) 28 | #' beta = 3.5 * (1:p %in% nonzero) 29 | #' y = X %*% beta + rnorm(n) 30 | #' 31 | #' # Basic usage with default arguments 32 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 33 | #' result = knockoff.filter(X, y, knockoffs=knockoffs) 34 | #' print(result$selected) 35 | #' 36 | #' # Advanced usage with custom arguments 37 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma, method='equi') 38 | #' result = knockoff.filter(X, y, knockoffs=knockoffs) 39 | #' print(result$selected) 40 | #' 41 | #' @export 42 | create.gaussian <- function(X, mu, Sigma, method=c("asdp","sdp","equi"), diag_s=NULL) { 43 | method = match.arg(method) 44 | 45 | # Do not use ASDP unless p>500 46 | if ((nrow(Sigma)<=500) && method=="asdp") { 47 | method="sdp" 48 | } 49 | 50 | if (is.null(diag_s)) { 51 | diag_s = diag(switch(match.arg(method), 52 | 'equi' = create.solve_equi(Sigma), 53 | 'sdp' = create.solve_sdp(Sigma), 54 | 'asdp' = create.solve_asdp(Sigma))) 55 | } 56 | if (is.null(dim(diag_s))) { 57 | diag_s = diag(diag_s,length(diag_s)) 58 | } 59 | 60 | # If diag_s is zero, we can only generate trivial knockoffs. 61 | if(all(diag_s==0)) { 62 | warning("The conditional knockoff covariance matrix is not positive definite. 
Knockoffs will have no power.") 63 | return(X) 64 | } 65 | 66 | SigmaInv_s = solve(Sigma,diag_s) 67 | mu_k = X - sweep(X,2,mu,"-") %*% SigmaInv_s 68 | Sigma_k = 2*diag_s - diag_s %*% SigmaInv_s 69 | X_k = mu_k + matrix(rnorm(ncol(X)*nrow(X)),nrow(X)) %*% chol(Sigma_k) 70 | } -------------------------------------------------------------------------------- /R/knockoff/R/create_second_order.R: -------------------------------------------------------------------------------- 1 | #' Second-order Gaussian knockoffs 2 | #' 3 | #' This function samples second-order multivariate Gaussian knockoff variables. 4 | #' First, a multivariate Gaussian distribution is fitted to the observations of X. 5 | #' Then, Gaussian knockoffs are generated according to the estimated model. 6 | #' 7 | #' @param X n-by-p matrix of original variables. 8 | #' @param method either "equi", "sdp" or "asdp" (default: "asdp"). 9 | #' This determines the method that will be used to minimize the correlation between the original variables and the knockoffs. 10 | #' @param shrink whether to shrink the estimated covariance matrix (default: F). 11 | #' @return A n-by-p matrix of knockoff variables. 12 | #' 13 | #' @family create 14 | #' 15 | #' @details 16 | #' If the argument \code{shrink} is set to T, a James-Stein-type shrinkage estimator for 17 | #' the covariance matrix is used instead of the traditional maximum-likelihood estimate. This option 18 | #' requires the package \code{corpcor}. See \code{\link[corpcor]{cov.shrink}} for more details. 19 | #' 20 | #' Even if the argument \code{shrink} is set to F, in the case that the estimated covariance 21 | #' matrix is not positive-definite, this function will apply some shrinkage. 22 | #' 23 | #' @references 24 | #' Candes et al., Panning for Gold: Model-free Knockoffs for High-dimensional Controlled Variable Selection, 25 | #' arXiv:1610.02351 (2016). 
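The sampling step at the heart of `create.gaussian` has a closed form: conditional on X, the knockoffs are drawn from N(mu_k, Sigma_k) with mu_k = X - (X - mu) Sigma^{-1} D and Sigma_k = 2D - D Sigma^{-1} D, where D = diag(diag_s). A minimal Python sketch of the same algebra follows; it is illustrative only, not the package code, and `sample_gaussian_knockoffs` is a hypothetical name.

```python
import numpy as np

def sample_gaussian_knockoffs(X, mu, Sigma, diag_s, rng=None):
    # Draw X_k ~ N(mu_k, Sigma_k) given X, with D = diag(diag_s):
    #   mu_k    = X - (X - mu) Sigma^{-1} D
    #   Sigma_k = 2 D - D Sigma^{-1} D
    rng = np.random.default_rng() if rng is None else rng
    n, p = X.shape
    D = np.diag(diag_s)
    SigmaInv_D = np.linalg.solve(Sigma, D)      # Sigma^{-1} D
    mu_k = X - (X - mu) @ SigmaInv_D
    Sigma_k = 2 * D - D @ SigmaInv_D
    L = np.linalg.cholesky(Sigma_k)             # assumes Sigma_k is positive-definite
    return mu_k + rng.standard_normal((n, p)) @ L.T
```

With Sigma = I and diag_s = 1, the formulas collapse to mu_k = mu and Sigma_k = I, i.e. the knockoffs are drawn independently of X, as expected for independent features.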
26 | #' \href{https://web.stanford.edu/group/candes/knockoffs/index.html}{https://web.stanford.edu/group/candes/knockoffs/index.html} 27 | #' 28 | #' @examples 29 | #' set.seed(2022) 30 | #' p=200; n=100; k=15 31 | #' rho = 0.4 32 | #' Sigma = toeplitz(rho^(0:(p-1))) 33 | #' X = matrix(rnorm(n*p),n) %*% chol(Sigma) 34 | #' nonzero = sample(p, k) 35 | #' beta = 3.5 * (1:p %in% nonzero) 36 | #' y = X %*% beta + rnorm(n) 37 | #' 38 | #' # Basic usage with default arguments 39 | #' result = knockoff.filter(X, y, knockoffs=create.second_order) 40 | #' print(result$selected) 41 | #' 42 | #' # Advanced usage with custom arguments 43 | #' knockoffs = function(X) create.second_order(X, method='equi') 44 | #' result = knockoff.filter(X, y, knockoffs=knockoffs) 45 | #' print(result$selected) 46 | #' 47 | #' @export 48 | create.second_order <- function(X, method=c("asdp","equi","sdp"), shrink=F) { 49 | method = match.arg(method) 50 | 51 | # Estimate the mean vector 52 | mu = colMeans(X) 53 | 54 | # Estimate the covariance matrix 55 | if(!shrink) { 56 | Sigma = cov(X) 57 | # Verify that the covariance matrix is positive-definite 58 | if(!is_posdef(Sigma)) { 59 | shrink=TRUE 60 | } 61 | } 62 | if(shrink) { 63 | if (!requireNamespace('corpcor', quietly=T)) 64 | stop('corpcor is not installed', call.=F) 65 | Sigma = tryCatch({suppressWarnings(matrix(as.numeric(corpcor::cov.shrink(X,verbose=F)), nrow=ncol(X)))}, 66 | warning = function(w){}, error = function(e) { 67 | stop("SVD failed in the shrinkage estimation of the covariance matrix.
Try upgrading R to version >= 3.3.0") 68 | }, finally = {}) 69 | } 70 | 71 | # Sample the Gaussian knockoffs 72 | create.gaussian(X, mu, Sigma, method=method) 73 | } -------------------------------------------------------------------------------- /R/knockoff/R/knockoff.R: -------------------------------------------------------------------------------- 1 | #' knockoff: A package for controlled variable selection 2 | #' 3 | #' This package implements the Knockoff Filter, which is a powerful and versatile tool for 4 | #' controlled variable selection. 5 | #' 6 | #' @section Outline: 7 | #' The procedure is based on the construction of artificial 'knockoff copies' of the variables 8 | #' present in the given statistical model. Then, it selects those variables that are clearly better 9 | #' than their corresponding knockoffs, based on some measure of variable importance. 10 | #' A wide range of statistics and machine learning tools can be exploited to estimate the 11 | #' importance of each variable, while guaranteeing finite-sample control of the false 12 | #' discovery rate (FDR). 13 | #' 14 | #' The Knockoff Filter controls the FDR in either of two statistical scenarios: 15 | #' \itemize{ 16 | #' \item{The "model-X" scenario: }{the response \eqn{Y} can depend on the variables \eqn{X=(X_1,\ldots,X_p)} 17 | #' in an arbitrary and unknown fashion, but the distribution of \eqn{X} must be known. In this case 18 | #' there are no constraints on the dimensions \eqn{n} and \eqn{p} of the problem.} 19 | #' \item{The "fixed-X" scenario: }{the response \eqn{Y} depends upon \eqn{X} through a 20 | #' homoscedastic Gaussian linear model and the problem is low-dimensional (\eqn{n \geq p}). 21 | #' In this case, no modeling assumptions on \eqn{X} are required. } 22 | #' } 23 | #' 24 | #' For more information, see the website below and the accompanying paper.
25 | #' 26 | #' \url{https://web.stanford.edu/group/candes/knockoffs/index.html} 27 | #' 28 | #' @docType package 29 | #' @name knockoff 30 | NULL 31 | -------------------------------------------------------------------------------- /R/knockoff/R/solve_equi.R: -------------------------------------------------------------------------------- 1 | #' Optimization for equi-correlated fixed-X and Gaussian knockoffs 2 | #' 3 | #' This function solves a very simple optimization problem needed to create equi-correlated fixed-X and 4 | #' Gaussian knockoffs on the full covariance matrix. This may be significantly 5 | #' less powerful than \code{\link{create.solve_sdp}}. 6 | #' 7 | #' @param Sigma positive-definite p-by-p covariance matrix. 8 | #' @return The solution \eqn{s} to the optimization problem defined above. 9 | #' 10 | #' @details Computes the closed-form solution to the semidefinite programming problem: 11 | #' \deqn{ \mathrm{maximize} \; s \quad 12 | #' \mathrm{subject} \; \mathrm{to:} \; 0 \leq s \leq 1, \; 13 | #' 2\Sigma - sI \geq 0 } 14 | #' used to generate equi-correlated knockoffs. 15 | #' 16 | #' The closed-form solution to this problem is \eqn{s = 2\lambda_{\mathrm{min}}(\Sigma) \land 1}.
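The closed-form equi-correlated solution is simple enough to verify numerically. Below is an illustrative Python sketch (not the package code; `solve_equi` here is a hypothetical name) that mirrors the main path of `create.solve_equi`: convert to a correlation matrix, take s = min(2*lambda_min, 1), shrink slightly for numerical feasibility, and scale back to the original diagonal.

```python
import numpy as np

def solve_equi(Sigma, s_eps=1e-8):
    # Closed-form equi-correlated solution s = min(2*lambda_min(G), 1),
    # computed on the correlation matrix G and scaled back to diag(Sigma).
    Sigma = np.asarray(Sigma, dtype=float)
    d = np.sqrt(np.diag(Sigma))
    G = Sigma / np.outer(d, d)                  # cov2cor
    lambda_min = np.linalg.eigvalsh(G)[0]       # eigenvalues in ascending order
    if lambda_min < 0:
        raise ValueError("covariance matrix is not positive-definite")
    # Shrink slightly so that 2G - sI stays positive-definite despite rounding.
    s = min(2.0 * lambda_min, 1.0) * (1.0 - s_eps)
    return s * np.diag(Sigma)
```

For a 3-by-3 equi-correlated matrix with rho = 0.6, the smallest eigenvalue is 1 - rho = 0.4, so every entry of s is (up to the feasibility shrinkage) 0.8.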
17 | #' 18 | #' @family optimization 19 | #' 20 | #' @export 21 | create.solve_equi <- function(Sigma) { 22 | # Check that covariance matrix is symmetric 23 | stopifnot(isSymmetric(Sigma)) 24 | p = nrow(Sigma) 25 | tol = 1e-10 26 | # Convert the covariance matrix to a correlation matrix 27 | G = cov2cor(Sigma) 28 | 29 | # Check that the input matrix is positive-definite 30 | if (!is_posdef(G)) { 31 | stop('The covariance matrix is not positive-definite: cannot solve SDP',immediate.=T) 32 | } 33 | 34 | if (p>2) { 35 | converged=FALSE 36 | maxitr=10000 37 | while (!converged) { 38 | lambda_min = RSpectra::eigs(G, 1, which="SR", opts=list(retvec = FALSE, maxitr=100000, tol=1e-8))$values 39 | if (length(lambda_min)==1) { 40 | converged = TRUE 41 | } else { 42 | if (maxitr>1e8) { 43 | warning('In creation of equi-correlated knockoffs, while computing the smallest eigenvalue of the 44 | covariance matrix. RSpectra::eigs did not converge. Giving up and computing full SVD with built-in R function.',immediate.=T) 45 | lambda_min = eigen(G, symmetric=T, only.values = T)$values[p] 46 | converged=TRUE 47 | } else { 48 | warning('In creation of equi-correlated knockoffs, while computing the smallest eigenvalue of the 49 | covariance matrix. RSpectra::eigs did not converge. Trying again with increased number of iterations.',immediate.=T) 50 | maxitr = maxitr*10 51 | } 52 | } 53 | } 54 | } else { 55 | lambda_min = eigen(G, symmetric=T, only.values = T)$values[p] 56 | } 57 | 58 | if (lambda_min<0) { 59 | stop('In creation of equi-correlated knockoffs, while computing the smallest eigenvalue of the 60 | covariance matrix. 
The covariance matrix is not positive-definite.') 61 | } 62 | 63 | s = rep(1, nrow(Sigma)) * min(2*lambda_min, 1) 64 | 65 | # Compensate for numerical errors (feasibility) 66 | psd = 0; 67 | s_eps = 1e-8; 68 | while (psd==0) { 69 | psd = is_posdef(2*G-diag(s*(1-s_eps),length(s))) 70 | if (!psd) { 71 | s_eps = s_eps*10 72 | } 73 | } 74 | s = s*(1-s_eps) 75 | 76 | # Scale back the results for a covariance matrix 77 | return(s*diag(Sigma)) 78 | } -------------------------------------------------------------------------------- /R/knockoff/R/solve_sdp.R: -------------------------------------------------------------------------------- 1 | #' Optimization for fixed-X and Gaussian knockoffs 2 | #' 3 | #' This function solves the optimization problem needed to create fixed-X and Gaussian SDP knockoffs 4 | #' on the full covariance matrix. This will be more powerful than \code{\link{create.solve_asdp}}, 5 | #' but more computationally expensive. 6 | #' 7 | #' @param Sigma positive-definite p-by-p covariance matrix. 8 | #' @param maxit maximum number of iterations for the solver (default: 1000). 9 | #' @param gaptol tolerance for duality gap as a fraction of the value of the objective functions (default: 1e-6). 10 | #' @param verbose whether to display progress (default: FALSE). 11 | #' @return The solution \eqn{s} to the semidefinite programming problem defined above. 12 | #' 13 | #' @details 14 | #' Solves the semidefinite programming problem: 15 | #' 16 | #' \deqn{ \mathrm{maximize} \; \mathrm{sum}(s) \quad 17 | #' \mathrm{subject} \; \mathrm{to:} \; 0 \leq s \leq 1, \; 18 | #' 2\Sigma - \mathrm{diag}(s) \geq 0} 19 | #' 20 | #' This problem is solved using the interior-point method implemented in \code{\link[Rdsdp]{dsdp}}. 21 | #' 22 | #' If the matrix Sigma supplied by the user is a non-scaled covariance matrix 23 | #' (i.e. its diagonal entries are not all equal to 1), then the appropriate scaling is applied before 24 | #' solving the SDP defined above.
The result is then scaled back before being returned, so as to match 25 | #' the original scaling of the covariance matrix supplied by the user. 26 | #' 27 | #' @family optimization 28 | #' 29 | #' @export 30 | create.solve_sdp <- function(Sigma, gaptol=1e-6, maxit=1000, verbose=FALSE) { 31 | # Check that covariance matrix is symmetric 32 | stopifnot(isSymmetric(Sigma)) 33 | # Convert the covariance matrix to a correlation matrix 34 | G = cov2cor(Sigma) 35 | p = dim(G)[1] 36 | 37 | # Check that the input matrix is positive-definite 38 | if (!is_posdef(G)) { 39 | warning('The covariance matrix is not positive-definite: knockoffs may not have power.', immediate.=T) 40 | } 41 | 42 | # Assemble the problem in standard conic form (solved below with Rdsdp) 43 | 44 | # Linear constraints 45 | Cl1 = rep(0,p) 46 | Al1 = -Matrix::Diagonal(p) 47 | Cl2 = rep(1,p) 48 | Al2 = Matrix::Diagonal(p) 49 | 50 | # Positive-definite cone 51 | d_As = c(diag(p)) 52 | As = Matrix::Diagonal(length(d_As), x=d_As) 53 | As = As[which(Matrix::rowSums(As) > 0),] 54 | Cs = c(2*G) 55 | 56 | # Assemble constraints and cones 57 | A = cbind(Al1,Al2,As) 58 | C = matrix(c(Cl1,Cl2,Cs),1) 59 | K=NULL 60 | K$s=p 61 | K$l=2*p 62 | 63 | # Objective 64 | b = rep(1,p) 65 | 66 | # Solve SDP with Rdsdp 67 | OPTIONS=NULL 68 | OPTIONS$gaptol=gaptol 69 | OPTIONS$maxit=maxit 70 | OPTIONS$logsummary=0 71 | OPTIONS$outputstats=0 72 | OPTIONS$print=0 73 | if(verbose) cat("Solving SDP ... ") 74 | sol = Rdsdp::dsdp(A,b,C,K,OPTIONS) 75 | if(verbose) cat("done. \n") 76 | 77 | # Check whether the solution is feasible 78 | if( ! identical(sol$STATS$stype,"PDFeasible")) { 79 | warning('The SDP solver returned a non-feasible solution. Knockoffs may lose power.') 80 | } 81 | 82 | # Clip solution to correct numerical errors (domain) 83 | s = sol$y 84 | s[s<0]=0 85 | s[s>1]=1 86 | 87 | # Compensate for numerical errors (feasibility) 88 | if(verbose) cat("Verifying that the solution is correct ... 
") 89 | psd = 0 90 | s_eps = 1e-8 91 | while ((psd==0) & (s_eps<=0.1)) { 92 | if (is_posdef(2*G-diag(s*(1-s_eps),length(s)),tol=1e-9)) { 93 | psd = 1 94 | } 95 | else { 96 | s_eps = s_eps*10 97 | } 98 | } 99 | s = s*(1-s_eps) 100 | s[s<0]=0 101 | if(verbose) cat("done. \n") 102 | 103 | # Verify that the solution is correct 104 | if (all(s==0)) { 105 | warning('In creation of SDP knockoffs, procedure failed. Knockoffs will have no power.',immediate.=T) 106 | } 107 | 108 | # Scale back the results for a covariance matrix 109 | return(s*diag(Sigma)) 110 | } 111 | 112 | #' Vectorize a matrix into the SCS format 113 | #' 114 | #' @rdname vectorize_matrix 115 | #' @keywords internal 116 | create.vectorize_matrix = function(M) { 117 | # Scale the off-diagonal entries by sqrt(2) 118 | vectorized_matrix = M 119 | vectorized_matrix[lower.tri(M,diag=FALSE)] = M[lower.tri(M,diag=FALSE)] * sqrt(2) 120 | # Stack the lower triangular elements column-wise 121 | vectorized_matrix = vectorized_matrix[lower.tri(vectorized_matrix,diag=TRUE)] 122 | } -------------------------------------------------------------------------------- /R/knockoff/R/stats_forward_selection.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on forward selection 2 | #' 3 | #' Computes the statistic 4 | #' \deqn{W_j = \max(Z_j, Z_{j+p}) \cdot \mathrm{sgn}(Z_j - Z_{j+p}),} 5 | #' where \eqn{Z_1,\dots,Z_{2p}} give the reverse order in which the 2p 6 | #' variables (the originals and the knockoffs) enter the forward selection 7 | #' model. 8 | #' See the Details for information about forward selection. 9 | #' 10 | #' In \emph{forward selection}, the variables are chosen iteratively to maximize 11 | #' the inner product with the residual from the previous step. The initial 12 | #' residual is always \code{y}. 
In standard forward selection 13 | #' (\code{stat.forward_selection}), the next residual is the remainder after 14 | #' regressing on the selected variable; when orthogonal matching pursuit 15 | #' is used, the next residual is the remainder 16 | #' after regressing on \emph{all} the previously selected variables. 17 | #' 18 | #' @param X n-by-p matrix of original variables. 19 | #' @param X_k n-by-p matrix of knockoff variables. 20 | #' @param y numeric vector of length n, containing the response variables. 21 | #' @param omp whether to use orthogonal matching pursuit (default: F). 22 | #' @return A vector of statistics \eqn{W} of length p. 23 | #' 24 | #' @family statistics 25 | #' 26 | #' @examples 27 | #' set.seed(2022) 28 | #' p=100; n=100; k=15 29 | #' mu = rep(0,p); Sigma = diag(p) 30 | #' X = matrix(rnorm(n*p),n) 31 | #' nonzero = sample(p, k) 32 | #' beta = 3.5 * (1:p %in% nonzero) 33 | #' y = X %*% beta + rnorm(n) 34 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 35 | #' 36 | #' # Basic usage with default arguments 37 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, 38 | #' statistic=stat.forward_selection) 39 | #' print(result$selected) 40 | #' 41 | #' # Advanced usage with custom arguments 42 | #' foo = stat.forward_selection 43 | #' k_stat = function(X, X_k, y) foo(X, X_k, y, omp=TRUE) 44 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 45 | #' print(result$selected) 46 | #' 47 | #' @rdname stat.forward_selection 48 | #' @export 49 | stat.forward_selection <- function(X, X_k, y, omp=F) { 50 | if( is.numeric(y) ){ 51 | y = as.vector(y) 52 | } else { 53 | stop('Knockoff statistic stat.forward_selection requires the input y to be a numeric vector') 54 | } 55 | p = ncol(X) 56 | X = scale(X) 57 | X_k = scale(X_k) 58 | 59 | # Randomly swap columns of X and Xk 60 | swap = rbinom(ncol(X),1,0.5) 61 | swap.M = matrix(swap,nrow=nrow(X),ncol=length(swap),byrow=TRUE) 62 | X.swap = X * (1-swap.M) + X_k * swap.M 63 | 
Xk.swap = X * swap.M + X_k * (1-swap.M) 64 | 65 | # Compute statistics 66 | path = fs(cbind(X.swap, Xk.swap), y, omp) 67 | Z = 2*p + 1 - order(path) # order(path) is the inverse permutation: the step at which each variable entered 68 | orig = 1:p 69 | W = pmax(Z[orig], Z[orig+p]) * sign(Z[orig] - Z[orig+p]) 70 | # Correct for swapping of columns of X and Xk 71 | W = W * (1-2*swap) 72 | } 73 | 74 | #' Forward selection 75 | #' 76 | #' Perform forward variable selection with or without OMP 77 | #' 78 | #' @param X matrix of predictors 79 | #' @param y response vector 80 | #' @param omp whether to use orthogonal matching pursuit (OMP) 81 | #' @return vector with jth component the variable added at step j 82 | #' 83 | #' @keywords internal 84 | fs <- function(X, y, omp=FALSE) { 85 | n = nrow(X); p = ncol(X) 86 | stopifnot(n == length(y)) 87 | path = rep.int(0, p) 88 | in_model = rep(FALSE, p) 89 | residual = y 90 | if (omp) Q = matrix(0, n, p) 91 | 92 | for (step in 1:p) { 93 | # Find the best variable to add among the remaining variables. 94 | available_vars = which(!in_model) 95 | products = apply(X[,!in_model,drop=F], 2, 96 | function(x) abs(sum(x * residual))) 97 | best_var = available_vars[which.max(products)][1] 98 | path[step] = best_var 99 | in_model[best_var] = TRUE 100 | 101 | # Update the residual. 102 | x = X[,best_var] 103 | if (step == p) break 104 | if (omp) { 105 | for (j in seq(1, length.out=step-1)) 106 | x = x - Q[,j]%*%x * Q[,j] 107 | q = x / sqrt(sum(x^2)) 108 | Q[,step] = q 109 | residual = residual - (q%*%y)[1] * q 110 | } 111 | else { 112 | residual = residual - (x %*% residual)[1] * x 113 | } 114 | } 115 | return(path) 116 | } 117 | -------------------------------------------------------------------------------- /R/knockoff/R/stats_lasso_cv.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on the lasso with cross-validation 2 | #' 3 | #' Fits a linear regression model via penalized maximum likelihood and cross-validation.
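The greedy loop in `fs` translates directly to other languages. Below is an illustrative Python sketch (not the package code; the function name is hypothetical). Unlike the R version, it normalizes columns to unit norm, so that the plain residual update is an exact orthogonal projection onto the chosen column.

```python
import numpy as np

def forward_selection_path(X, y, omp=False):
    # Greedy forward selection: at each step pick the column with the largest
    # |inner product| with the current residual; path[step] = variable added.
    n, p = X.shape
    Xc = X - X.mean(axis=0)
    Xn = Xc / np.linalg.norm(Xc, axis=0)       # center and scale to unit norm
    path = []
    in_model = np.zeros(p, dtype=bool)
    residual = np.asarray(y, dtype=float).copy()
    Q = np.zeros((n, p))                       # orthonormal basis (OMP only)
    for step in range(p):
        scores = np.abs(Xn.T @ residual)
        scores[in_model] = -np.inf             # exclude variables already in the model
        j = int(np.argmax(scores))
        path.append(j)
        in_model[j] = True
        if step == p - 1:
            break
        x = Xn[:, j].copy()
        if omp:
            # Orthogonalize against all previously selected directions.
            for k in range(step):
                x -= (Q[:, k] @ x) * Q[:, k]
            x /= np.linalg.norm(x)
            Q[:, step] = x
        residual = residual - (x @ residual) * x
    return path
```

With mutually orthogonal unit-norm columns, both variants reduce to ranking the columns by |<x_j, y>| and peeling them off one at a time, which gives a simple deterministic check.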
4 | #' Then, computes the difference statistic 5 | #' \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 6 | #' where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the coefficient estimates for the 7 | #' jth variable and its knockoff, respectively. The value of the regularization 8 | #' parameter \eqn{\lambda} is selected by cross-validation and computed with \code{glmnet}. 9 | #' 10 | #' @param X n-by-p matrix of original variables. 11 | #' @param X_k n-by-p matrix of knockoff variables. 12 | #' @param y vector of length n, containing the response variables. It should be numeric. 13 | #' @param cores Number of cores used to compute the statistics by running cv.glmnet. 14 | #' If not specified, the number of cores is set to approximately half of the number of cores 15 | #' detected by the parallel package. 16 | #' @param ... additional arguments specific to \code{glmnet} (see Details). 17 | #' @return A vector of statistics \eqn{W} of length p. 18 | #' 19 | #' @details This function uses the \code{glmnet} package to fit the lasso path and 20 | #' is a wrapper around the more general \link{stat.glmnet_coefdiff}. 21 | #' 22 | #' The statistics \eqn{W_j} are constructed by taking the difference 23 | #' between the coefficient of the j-th variable and its knockoff. 24 | #' 25 | #' By default, the value of the regularization parameter is chosen by 10-fold cross-validation. 26 | #' 27 | #' The optional \code{nlambda} parameter can be used to control the granularity of the 28 | #' grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 29 | #' 30 | #' Unless a lambda sequence is provided by the user, this function generates it on a 31 | #' log-linear scale before calling 'glmnet' (default 'nlambda': 500). 32 | #' 33 | #' For a complete list of the available additional arguments, see \code{\link[glmnet]{cv.glmnet}} 34 | #' and \code{\link[glmnet]{glmnet}}.
36 | #' 37 | #' @family statistics 38 | #' 39 | #' @examples 40 | #' set.seed(2022) 41 | #' p=200; n=100; k=15 42 | #' mu = rep(0,p); Sigma = diag(p) 43 | #' X = matrix(rnorm(n*p),n) 44 | #' nonzero = sample(p, k) 45 | #' beta = 3.5 * (1:p %in% nonzero) 46 | #' y = X %*% beta + rnorm(n) 47 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 48 | #' 49 | #' # Basic usage with default arguments 50 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, 51 | #' statistic=stat.lasso_coefdiff) 52 | #' print(result$selected) 53 | #' 54 | #' # Advanced usage with custom arguments 55 | #' foo = stat.lasso_coefdiff 56 | #' k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 57 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 58 | #' print(result$selected) 59 | #' 60 | #' @rdname stat.lasso_coefdiff 61 | #' @export 62 | stat.lasso_coefdiff <- function(X, X_k, y, cores=2, ...) { 63 | if( is.numeric(y) ){ 64 | y = as.vector(y) 65 | } else { 66 | stop('Knockoff statistic stat.lasso_coefdiff requires the input y to be a numeric vector') 67 | } 68 | 69 | stat.glmnet_coefdiff(X, X_k, y, family='gaussian', cores=cores, ...) 70 | } 71 | -------------------------------------------------------------------------------- /R/knockoff/R/stats_lasso_cv_bin.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on regularized logistic regression with cross-validation 2 | #' 3 | #' Fits a logistic regression model via penalized maximum likelihood and cross-validation. 4 | #' Then, computes the difference statistic 5 | #' \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 6 | #' where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the coefficient estimates for the 7 | #' jth variable and its knockoff, respectively. The value of the regularization 8 | #' parameter \eqn{\lambda} is selected by cross-validation and computed with \code{glmnet}. 9 | #' 10 | #' @param X n-by-p matrix of original variables.
11 | #' @param X_k n-by-p matrix of knockoff variables. 12 | #' @param y vector of length n, containing the response variables. It should be either a factor with two levels, 13 | #' or a two-column matrix of counts or proportions 14 | #' (the second column is treated as the target class; for a factor, the last level 15 | #' in alphabetical order is the target class). If y is presented as a vector, 16 | #' it will be coerced into a factor. 17 | #' @param cores Number of cores used to compute the statistics by running cv.glmnet. 18 | #' If not specified, the number of cores is set to approximately half of the number of cores 19 | #' detected by the parallel package. 20 | #' @param ... additional arguments specific to \code{glmnet} (see Details). 21 | #' @return A vector of statistics \eqn{W} of length p. 22 | #' 23 | #' @details This function uses the \code{glmnet} package to fit the penalized logistic regression path 24 | #' and is a wrapper around the more general \code{\link{stat.glmnet_coefdiff}}. 25 | #' 26 | #' The statistics \eqn{W_j} are constructed by taking the difference 27 | #' between the coefficient of the j-th variable and its knockoff. 28 | #' 29 | #' By default, the value of the regularization parameter is chosen by 10-fold cross-validation. 30 | #' 31 | #' The optional \code{nlambda} parameter can be used to control the granularity of the 32 | #' grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 33 | #' 34 | #' 35 | #' For a complete list of the available additional arguments, see \code{\link[glmnet]{cv.glmnet}} 36 | #' and \code{\link[glmnet]{glmnet}}.
37 | #' 38 | #' @family statistics 39 | #' 40 | #' @examples 41 | #' set.seed(2022) 42 | #' p=200; n=100; k=15 43 | #' mu = rep(0,p); Sigma = diag(p) 44 | #' X = matrix(rnorm(n*p),n) 45 | #' nonzero = sample(p, k) 46 | #' beta = 3.5 * (1:p %in% nonzero) 47 | #' pr = 1/(1+exp(-X %*% beta)) 48 | #' y = rbinom(n,1,pr) 49 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 50 | #' 51 | #' # Basic usage with default arguments 52 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, 53 | #' statistic=stat.lasso_coefdiff_bin) 54 | #' print(result$selected) 55 | #' 56 | #' # Advanced usage with custom arguments 57 | #' foo = stat.lasso_coefdiff_bin 58 | #' k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 59 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 60 | #' print(result$selected) 61 | #' 62 | #' @rdname stat.lasso_coefdiff_bin 63 | #' @export 64 | stat.lasso_coefdiff_bin <- function(X, X_k, y, cores=2, ...) { 65 | if (!is.factor(y) && !is.numeric(y)) { 66 | stop('Input y must be either of numeric or factor type') 67 | } 68 | stat.glmnet_coefdiff(X, X_k, y, family='binomial', cores=cores, ...) 69 | } 70 | -------------------------------------------------------------------------------- /R/knockoff/R/stats_random_forest.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on random forests 2 | #' 3 | #' Computes the difference statistic 4 | #' \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 5 | #' where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the random forest feature importances 6 | #' of the jth variable and its knockoff, respectively. 7 | #' 8 | #' @param X n-by-p matrix of original variables. 9 | #' @param X_k n-by-p matrix of knockoff variables. 10 | #' @param y vector of length n, containing the response variables. If a factor, classification is assumed, 11 | #' otherwise regression is assumed. 12 | #' @param ... 
additional arguments specific to \code{ranger} (see Details). 13 | #' @return A vector of statistics \eqn{W} of length p. 14 | #' 15 | #' @details This function uses the \code{ranger} package to compute variable 16 | #' importance measures. The importance of a variable is measured as the total decrease 17 | #' in node impurities from splitting on that variable, averaged over all trees. 18 | #' For regression, the node impurity is measured by residual sum of squares. 19 | #' For classification, it is measured by the Gini index. 20 | #' 21 | #' For a complete list of the available additional arguments, see \code{\link[ranger]{ranger}}. 22 | #' 23 | #' @family statistics 24 | #' 25 | #' @examples 26 | #' set.seed(2022) 27 | #' p=200; n=100; k=15 28 | #' mu = rep(0,p); Sigma = diag(p) 29 | #' X = matrix(rnorm(n*p),n) 30 | #' nonzero = sample(p, k) 31 | #' beta = 3.5 * (1:p %in% nonzero) 32 | #' y = X %*% beta + rnorm(n) 33 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 34 | #' 35 | #' # Basic usage with default arguments 36 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, 37 | #' statistic=stat.random_forest) 38 | #' print(result$selected) 39 | #' 40 | #' # Advanced usage with custom arguments 41 | #' foo = stat.random_forest 42 | #' k_stat = function(X, X_k, y) foo(X, X_k, y, num.trees=200) 43 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 44 | #' print(result$selected) 45 | #' 46 | #' @rdname stat.random_forest 47 | #' @export 48 | stat.random_forest <- function(X, X_k, y, ...)
{ 49 | if (!requireNamespace('ranger', quietly=T)) 50 | stop('ranger is not installed', call.=F) 51 | 52 | # Randomly swap columns of X and Xk 53 | swap = rbinom(ncol(X),1,0.5) 54 | swap.M = matrix(swap,nrow=nrow(X),ncol=length(swap),byrow=TRUE) 55 | X.swap = X * (1-swap.M) + X_k * swap.M 56 | Xk.swap = X * swap.M + X_k * (1-swap.M) 57 | 58 | # Compute statistics 59 | Z = random_forest_importance(cbind(X.swap, Xk.swap), y, ...) 60 | p = ncol(X) 61 | orig = 1:p 62 | W = abs(Z[orig]) - abs(Z[orig+p]) 63 | 64 | # Correct for swapping of columns of X and Xk 65 | W = W * (1-2*swap) 66 | } 67 | 68 | #' @keywords internal 69 | random_forest_importance <- function(X, y, ...) { 70 | df = data.frame(y=y, X=X) 71 | rfFit = ranger::ranger(y~., data=df, importance="impurity", write.forest=F, ...) 72 | as.vector(rfFit$variable.importance) 73 | } -------------------------------------------------------------------------------- /R/knockoff/R/stats_sqrt_lasso.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on the square-root lasso 2 | #' 3 | #' Computes the signed maximum statistic 4 | #' \deqn{W_j = \max(Z_j, \tilde{Z}_j) \cdot \mathrm{sgn}(Z_j - \tilde{Z}_j),} 5 | #' where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of 6 | #' \eqn{\lambda} at which the jth variable and its knockoff, respectively, 7 | #' enter the SQRT lasso model. 8 | #' 9 | #' @param X n-by-p matrix of original variables. 10 | #' @param X_k n-by-p matrix of knockoff variables. 11 | #' @param y vector of length n, containing the response variables of numeric type. 12 | #' @param ... additional arguments specific to \code{sqrt_lasso} (see Details). 13 | #' @return A vector of statistics \eqn{W} of length p. 14 | #' 15 | #' @details With default parameters, this function uses the package \code{RPtests} 16 | #' to run the SQRT lasso.
Additional optional arguments are passed 17 | #' through to \code{\link[RPtests]{sqrt_lasso}} and, from there, to the underlying 18 | #' \code{glmnet} solver. 19 | #' 20 | #' For a complete list of the available additional arguments, see \code{\link[RPtests]{sqrt_lasso}}. 21 | #' 22 | #' @family statistics 23 | #' 24 | #' @examples 25 | #' set.seed(2022) 26 | #' p=50; n=50; k=10 27 | #' mu = rep(0,p); Sigma = diag(p) 28 | #' X = matrix(rnorm(n*p),n) 29 | #' nonzero = sample(p, k) 30 | #' beta = 3.5 * (1:p %in% nonzero) 31 | #' y = X %*% beta + rnorm(n) 32 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 33 | #' 34 | #' # Basic usage with default arguments 35 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=stat.sqrt_lasso) 36 | #' print(result$selected) 37 | #' 38 | #' # Advanced usage with custom arguments 39 | #' foo = stat.sqrt_lasso 40 | #' k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 41 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 42 | #' print(result$selected) 43 | #' 44 | #' @rdname stat.sqrt_lasso 45 | #' @export 46 | stat.sqrt_lasso <- function(X, X_k, y, ...) { 47 | if (!requireNamespace('RPtests', quietly=T)) 48 | stop('RPtests is not installed', call.=F) 49 | if (!(is.vector(y) && is.numeric(y))) { 50 | stop('Knockoff statistic stat.sqrt_lasso requires the input y to be a numeric vector') 51 | } 52 | p = ncol(X) 53 | 54 | # Randomly swap columns of X and Xk 55 | swap = rbinom(ncol(X),1,0.5) 56 | swap.M = matrix(swap,nrow=nrow(X),ncol=length(swap),byrow=TRUE) 57 | X.swap = X * (1-swap.M) + X_k * swap.M 58 | Xk.swap = X * swap.M + X_k * (1-swap.M) 59 | 60 | # Compute statistics 61 | Z = RPtests::sqrt_lasso(cbind(X.swap, Xk.swap), as.numeric(y), ...)
62 | p = ncol(X) 63 | orig = 1:p 64 | W = pmax(Z[orig], Z[orig+p]) * sign(Z[orig] - Z[orig+p]) 65 | 66 | # Correct for swapping of columns of X and Xk 67 | W = W * (1-2*swap) 68 | } -------------------------------------------------------------------------------- /R/knockoff/R/stats_stability_selection.R: -------------------------------------------------------------------------------- 1 | #' Importance statistics based on stability selection 2 | #' 3 | #' Computes the difference statistic 4 | #' \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 5 | #' where \eqn{Z_j} and \eqn{\tilde{Z}_j} measure the importance 6 | #' of the jth variable and its knockoff, respectively, based on the 7 | #' stability of their selection upon subsampling of the data. 8 | #' 9 | #' @param X n-by-p matrix of original variables. 10 | #' @param X_k n-by-p matrix of knockoff variables. 11 | #' @param y response vector (length n) 12 | #' @param fitfun a function that takes the arguments x, y as above, 13 | #' and additionally the number of variables to include in each model q. 14 | #' The function then needs to fit the model and to return a logical vector 15 | #' that indicates which variable was selected (among the q selected variables). 16 | #' The name of the function should be prefixed by 'stabs::'. 17 | #' @param ... additional arguments specific to 'stabs' (see Details). 18 | #' @return A vector of statistics \eqn{W} of length p. 19 | #' 20 | #' @details This function uses the \code{stabs} package to compute 21 | #' variable selection stability. The selection stability of the j-th 22 | #' variable is defined as its probability of being selected upon random 23 | #' subsampling of the data. The default method for selecting variables 24 | #' in each subsampled dataset is \code{\link[stabs]{lars.lasso}}. 25 | #' 26 | #' For a complete list of the available additional arguments, see \code{\link[stabs]{stabsel}}.
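Each statistic in this family randomly swaps original and knockoff columns before computing importances, then flips the sign of W_j for the swapped coordinates. A small self-contained sketch of why the correction works (illustrative only; the column-sum "importance" is an arbitrary stand-in for any column-wise measure):

```r
# Illustrative sketch of the swap trick used throughout these statistics.
# Swapping column j with its knockoff and multiplying W_j by (1 - 2*swap_j)
# recovers exactly the statistic computed without any swapping.
set.seed(1)
n <- 50; p <- 10
X   <- matrix(rnorm(n * p), n)
X_k <- matrix(rnorm(n * p), n)
swap   <- rbinom(p, 1, 0.5)                      # 1 = swap column j
swap.M <- matrix(swap, nrow = n, ncol = p, byrow = TRUE)
X.swap  <- X * (1 - swap.M) + X_k * swap.M
Xk.swap <- X * swap.M + X_k * (1 - swap.M)
importance <- function(M) colSums(M^2)           # stand-in for any Z
Z <- importance(cbind(X.swap, Xk.swap))
W <- (abs(Z[1:p]) - abs(Z[(p + 1):(2 * p)])) * (1 - 2 * swap)
W0 <- abs(importance(X)) - abs(importance(X_k))  # no-swap reference
stopifnot(all.equal(W, W0))
```

The randomization matters for importance measures (such as those from tree ensembles) that are not exactly invariant to column order; the sign correction guarantees the antisymmetry that the knockoff filter requires of W.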
27 | #' 28 | #' @family statistics 29 | #' 30 | #' @examples 31 | #' set.seed(2022) 32 | #' p=50; n=50; k=15 33 | #' mu = rep(0,p); Sigma = diag(p) 34 | #' X = matrix(rnorm(n*p),n) 35 | #' nonzero = sample(p, k) 36 | #' beta = 3.5 * (1:p %in% nonzero) 37 | #' y = X %*% beta + rnorm(n) 38 | #' knockoffs = function(X) create.gaussian(X, mu, Sigma) 39 | #' 40 | #' # Basic usage with default arguments 41 | #' result = knockoff.filter(X, y, knockoffs=knockoffs, 42 | #' statistic=stat.stability_selection) 43 | #' print(result$selected) 44 | #' 45 | #' 46 | #' @rdname stat.stability_selection 47 | #' @export 48 | stat.stability_selection <- function(X, X_k, y, fitfun = stabs::lars.lasso, ...) { 49 | if (!requireNamespace('stabs', quietly=T)) 50 | stop('stabs is not installed', call.=F) 51 | if (!is.vector(y)) { 52 | stop('Knockoff statistic stat.stability_selection requires the input y to be a vector') 53 | } 54 | 55 | # Randomly swap columns of X and Xk 56 | swap = rbinom(ncol(X),1,0.5) 57 | swap.M = matrix(swap,nrow=nrow(X),ncol=length(swap),byrow=TRUE) 58 | X.swap = X * (1-swap.M) + X_k * swap.M 59 | Xk.swap = X * swap.M + X_k * (1-swap.M) 60 | 61 | # Compute statistics 62 | Z = stability_selection_importance(cbind(X.swap, Xk.swap), y, fitfun=fitfun, ...) 63 | p = ncol(X) 64 | orig = 1:p 65 | W = abs(Z[orig]) - abs(Z[orig+p]) 66 | 67 | # Correct for swapping of columns of X and Xk 68 | W = W * (1-2*swap) 69 | } 70 | 71 | #' Stability selection 72 | #' 73 | #' Perform variable selection with stability selection 74 | #' 75 | #' @param X matrix of predictors 76 | #' @param y response vector 77 | #' @return vector with jth component the selection probability of variable j 78 | #' 79 | #' @keywords internal 80 | stability_selection_importance <- function(X, y, ...) 
{ 81 | X = scale(X) 82 | 83 | if (!methods::hasArg(cutoff) ) { 84 | cutoff = 0.75 85 | } 86 | if (!methods::hasArg(PFER) ) { 87 | PFER = 1 88 | } 89 | 90 | stabFit = stabs::stabsel(X, y, cutoff=cutoff, PFER=PFER, ...) 91 | rowMeans(unname(stabFit$phat)) 92 | } 93 | -------------------------------------------------------------------------------- /R/knockoff/R/util.R: -------------------------------------------------------------------------------- 1 | # Fast versions of diag(d) %*% X and X %*% diag(d). 2 | `%diag*%` <- function(d, X) d * X 3 | `%*diag%` <- function(X, d) t(t(X) * d) 4 | 5 | # Efficient test for matrix positive-definiteness 6 | # 7 | # Computes the smallest eigenvalue of a matrix A to verify whether 8 | # A is positive-definite 9 | #' @keywords internal 10 | is_posdef = function(A, tol=1e-9) { 11 | p = nrow(A) 12 | 13 | if (p<500) { 14 | lambda_min = min(eigen(A)$values) 15 | } 16 | else { 17 | oldw <- getOption("warn") 18 | options(warn = -1) 19 | lambda_min = RSpectra::eigs(A, 1, which="SM", opts=list(retvec = FALSE, maxitr=100, tol=tol))$values 20 | options(warn = oldw) 21 | if( length(lambda_min)==0 ) { 22 | # RSpectra::eigs did not converge. Using eigen instead. 23 | lambda_min = min(eigen(A)$values) 24 | } 25 | } 26 | return (lambda_min>tol*10) 27 | } 28 | 29 | # Reduced SVD with canonical sign choice. 30 | # 31 | # Our convention is that the sign of each vector in U is chosen such that the 32 | # coefficient with the largest absolute value is positive. 33 | #' @keywords internal 34 | canonical_svd = function(X) { 35 | X.svd = tryCatch({ 36 | svd(X) 37 | }, warning = function(w){}, error = function(e) { 38 | stop("SVD failed in the creation of fixed-design knockoffs.
Try upgrading R to version >= 3.3.0") 39 | }, finally = {}) 40 | 41 | for (j in 1:min(dim(X))) { 42 | i = which.max(abs(X.svd$u[,j])) 43 | if (X.svd$u[i,j] < 0) { 44 | X.svd$u[,j] = -X.svd$u[,j] 45 | X.svd$v[,j] = -X.svd$v[,j] 46 | } 47 | } 48 | return(X.svd) 49 | } 50 | 51 | # Scale the columns of a matrix to have unit norm. 52 | #' @keywords internal 53 | normc = function(X,center=T) { 54 | X.centered = scale(X, center=center, scale=F) 55 | X.scaled = scale(X.centered, center=F, scale=sqrt(colSums(X.centered^2))) 56 | X.scaled[,] # No attributes 57 | } 58 | 59 | # Generate a random matrix with i.i.d. normal entries. 60 | #' @keywords internal 61 | rnorm_matrix = function(n, p, mean=0, sd=1) { 62 | matrix(rnorm(n*p, mean, sd), nrow=n, ncol=p) 63 | } 64 | 65 | # Generate a random, sparse regression problem. 66 | #' @keywords internal 67 | random_problem = function(n, p, k=NULL, amplitude=3) { 68 | if (is.null(k)) k = max(1, as.integer(p/5)) 69 | X = normc(rnorm_matrix(n, p)) 70 | nonzero = sample(p, k) 71 | beta = amplitude * (1:p %in% nonzero) 72 | y.sample <- function() X %*% beta + rnorm(n) 73 | list(X = X, beta = beta, y = y.sample(), y.sample = y.sample) 74 | } 75 | 76 | # Evaluate an expression with the given random seed, then restore the old seed. 
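The seed-saving pattern described in the comment above can be exercised with a self-contained, slightly simplified version (the name `with_seed_demo` is invented for this sketch; unlike the package helper defined next, it does not remove a `.Random.seed` that did not previously exist):

```r
# Simplified sketch of the seed save/restore pattern (not package code):
# run expr under a fixed seed, then put the previous RNG state back.
with_seed_demo <- function(seed, expr) {
  seed.old <- if (exists(".Random.seed", envir = .GlobalEnv))
    get(".Random.seed", envir = .GlobalEnv) else NULL
  set.seed(seed)
  on.exit({
    if (!is.null(seed.old))
      assign(".Random.seed", seed.old, envir = .GlobalEnv)
  })
  expr   # lazily evaluated here, after set.seed()
}
x1 <- with_seed_demo(42, rnorm(3))
x2 <- with_seed_demo(42, rnorm(3))
stopifnot(identical(x1, x2))  # same seed, same draws
```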
77 | #' @keywords internal 78 | with_seed = function(seed, expr) { 79 | seed.old = if (exists('.Random.seed')) .Random.seed else NULL 80 | set.seed(seed) 81 | on.exit({ 82 | if (is.null(seed.old)) { 83 | if (exists('.Random.seed')) 84 | rm(.Random.seed, envir=.GlobalEnv) 85 | } else { 86 | .Random.seed <<- seed.old 87 | } 88 | }) 89 | expr 90 | } -------------------------------------------------------------------------------- /R/knockoff/knockoff.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --with-keep.source 18 | PackageCheckArgs: --as-cran 19 | PackageRoxygenize: rd,collate,namespace 20 | -------------------------------------------------------------------------------- /R/knockoff/man/create.fixed.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_fixed.R 3 | \name{create.fixed} 4 | \alias{create.fixed} 5 | \title{Fixed-X knockoffs} 6 | \usage{ 7 | create.fixed( 8 | X, 9 | method = c("sdp", "equi"), 10 | sigma = NULL, 11 | y = NULL, 12 | randomize = F 13 | ) 14 | } 15 | \arguments{ 16 | \item{X}{normalized n-by-p matrix of original variables (\eqn{n \geq p}).} 17 | 18 | \item{method}{either "equi" or "sdp" (default: "sdp"). 19 | This determines the method that will be used to minimize the correlation between the original variables and the knockoffs.} 20 | 21 | \item{sigma}{the noise level, used to augment the data with extra rows if necessary (default: NULL).} 22 | 23 | \item{y}{vector of length n, containing the observed responses.
24 | This is needed to estimate the noise level if the parameter \code{sigma} is not provided, 25 | in case \eqn{p \leq n < 2p} (default: NULL).} 26 | 27 | \item{randomize}{whether the knockoffs are constructed deterministically or randomized (default: F).} 28 | } 29 | \value{ 30 | An object of class "knockoff.variables". This is a list 31 | containing at least the following components: 32 | \item{X}{n-by-p matrix of original variables (possibly augmented or transformed).} 33 | \item{Xk}{n-by-p matrix of knockoff variables.} 34 | \item{y}{vector of observed responses (possibly augmented).} 35 | } 36 | \description{ 37 | Creates fixed-X knockoff variables. 38 | } 39 | \details{ 40 | Fixed-X knockoffs assume a homoscedastic linear regression model for \eqn{Y|X}. Moreover, they only guarantee 41 | FDR control when used in combination with statistics satisfying the "sufficiency" property. 42 | In particular, the default statistic based on the cross-validated lasso does not satisfy this 43 | property and should not be used with fixed-X knockoffs. 44 | } 45 | \examples{ 46 | set.seed(2022) 47 | p=100; n=200; k=15 48 | X = matrix(rnorm(n*p),n) 49 | nonzero = sample(p, k) 50 | beta = 5.5 * (1:p \%in\% nonzero) 51 | y = X \%*\% beta + rnorm(n) 52 | 53 | # Basic usage with default arguments 54 | result = knockoff.filter(X, y, knockoffs=create.fixed) 55 | print(result$selected) 56 | 57 | # Advanced usage with custom arguments 58 | knockoffs = function(X) create.fixed(X, method='equi') 59 | result = knockoff.filter(X, y, knockoffs=knockoffs) 60 | print(result$selected) 61 | 62 | } 63 | \references{ 64 | Barber and Candes, 65 | Controlling the false discovery rate via knockoffs. 66 | Ann. Statist. 43 (2015), no. 5, 2055--2085.
67 | \doi{10.1214/15-AOS1337} 68 | } 69 | \seealso{ 70 | Other create: 71 | \code{\link{create.gaussian}()}, 72 | \code{\link{create.second_order}()} 73 | } 74 | \concept{create} 75 | -------------------------------------------------------------------------------- /R/knockoff/man/create.gaussian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_gaussian.R 3 | \name{create.gaussian} 4 | \alias{create.gaussian} 5 | \title{Model-X Gaussian knockoffs} 6 | \usage{ 7 | create.gaussian(X, mu, Sigma, method = c("asdp", "sdp", "equi"), diag_s = NULL) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{mu}{vector of length p, indicating the mean parameter of the Gaussian model for \eqn{X}.} 13 | 14 | \item{Sigma}{p-by-p covariance matrix for the Gaussian model of \eqn{X}.} 15 | 16 | \item{method}{either "equi", "sdp" or "asdp" (default: "asdp"). 17 | This determines the method that will be used to minimize the correlation between the original variables and the knockoffs.} 18 | 19 | \item{diag_s}{vector of length p, containing the pre-computed covariances between the original 20 | variables and the knockoffs. This will be computed according to \code{method}, if not supplied.} 21 | } 22 | \value{ 23 | A n-by-p matrix of knockoff variables. 24 | } 25 | \description{ 26 | Samples multivariate Gaussian model-X knockoff variables. 
27 | } 28 | \examples{ 29 | set.seed(2022) 30 | p=200; n=100; k=15 31 | rho = 0.4 32 | mu = rep(0,p); Sigma = toeplitz(rho^(0:(p-1))) 33 | X = matrix(rnorm(n*p),n) \%*\% chol(Sigma) 34 | nonzero = sample(p, k) 35 | beta = 3.5 * (1:p \%in\% nonzero) 36 | y = X \%*\% beta + rnorm(n) 37 | 38 | # Basic usage with default arguments 39 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 40 | result = knockoff.filter(X, y, knockoffs=knockoffs) 41 | print(result$selected) 42 | 43 | # Advanced usage with custom arguments 44 | knockoffs = function(X) create.gaussian(X, mu, Sigma, method='equi') 45 | result = knockoff.filter(X, y, knockoffs=knockoffs) 46 | print(result$selected) 47 | 48 | } 49 | \references{ 50 | Candes et al., Panning for Gold: Model-free Knockoffs for High-dimensional Controlled Variable Selection, 51 | arXiv:1610.02351 (2016). 52 | \href{https://web.stanford.edu/group/candes/knockoffs/index.html}{https://web.stanford.edu/group/candes/knockoffs/index.html} 53 | } 54 | \seealso{ 55 | Other create: 56 | \code{\link{create.fixed}()}, 57 | \code{\link{create.second_order}()} 58 | } 59 | \concept{create} 60 | -------------------------------------------------------------------------------- /R/knockoff/man/create.second_order.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_second_order.R 3 | \name{create.second_order} 4 | \alias{create.second_order} 5 | \title{Second-order Gaussian knockoffs} 6 | \usage{ 7 | create.second_order(X, method = c("asdp", "equi", "sdp"), shrink = F) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{method}{either "equi", "sdp" or "asdp" (default: "asdp"). 
13 | This determines the method that will be used to minimize the correlation between the original variables and the knockoffs.} 14 | 15 | \item{shrink}{whether to shrink the estimated covariance matrix (default: F).} 16 | } 17 | \value{ 18 | A n-by-p matrix of knockoff variables. 19 | } 20 | \description{ 21 | This function samples second-order multivariate Gaussian knockoff variables. 22 | First, a multivariate Gaussian distribution is fitted to the observations of X. 23 | Then, Gaussian knockoffs are generated according to the estimated model. 24 | } 25 | \details{ 26 | If the argument \code{shrink} is set to T, a James-Stein-type shrinkage estimator for 27 | the covariance matrix is used instead of the traditional maximum-likelihood estimate. This option 28 | requires the package \code{corpcor}. See \code{\link[corpcor]{cov.shrink}} for more details. 29 | 30 | Even if the argument \code{shrink} is set to F, in the case that the estimated covariance 31 | matrix is not positive-definite, this function will apply some shrinkage. 32 | } 33 | \examples{ 34 | set.seed(2022) 35 | p=200; n=100; k=15 36 | rho = 0.4 37 | Sigma = toeplitz(rho^(0:(p-1))) 38 | X = matrix(rnorm(n*p),n) \%*\% chol(Sigma) 39 | nonzero = sample(p, k) 40 | beta = 3.5 * (1:p \%in\% nonzero) 41 | y = X \%*\% beta + rnorm(n) 42 | 43 | # Basic usage with default arguments 44 | result = knockoff.filter(X, y, knockoffs=create.second_order) 45 | print(result$selected) 46 | 47 | # Advanced usage with custom arguments 48 | knockoffs = function(X) create.second_order(X, method='equi') 49 | result = knockoff.filter(X, y, knockoffs=knockoffs) 50 | print(result$selected) 51 | 52 | } 53 | \references{ 54 | Candes et al., Panning for Gold: Model-free Knockoffs for High-dimensional Controlled Variable Selection, 55 | arXiv:1610.02351 (2016). 
56 | \href{https://web.stanford.edu/group/candes/knockoffs/index.html}{https://web.stanford.edu/group/candes/knockoffs/index.html} 57 | } 58 | \seealso{ 59 | Other create: 60 | \code{\link{create.fixed}()}, 61 | \code{\link{create.gaussian}()} 62 | } 63 | \concept{create} 64 | -------------------------------------------------------------------------------- /R/knockoff/man/create.solve_asdp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_asdp.R 3 | \name{create.solve_asdp} 4 | \alias{create.solve_asdp} 5 | \title{Relaxed optimization for fixed-X and Gaussian knockoffs} 6 | \usage{ 7 | create.solve_asdp( 8 | Sigma, 9 | max.size = 500, 10 | gaptol = 1e-06, 11 | maxit = 1000, 12 | verbose = FALSE 13 | ) 14 | } 15 | \arguments{ 16 | \item{Sigma}{positive-definite p-by-p covariance matrix.} 17 | 18 | \item{max.size}{size of the largest block in the block-diagonal approximation of Sigma (default: 500). See Details.} 19 | 20 | \item{gaptol}{tolerance for duality gap as a fraction of the value of the objective functions (default: 1e-6).} 21 | 22 | \item{maxit}{the maximum number of iterations for the solver (default: 1000).} 23 | 24 | \item{verbose}{whether to display progress (default: FALSE).} 25 | } 26 | \value{ 27 | The solution \eqn{s} to the semidefinite program defined above. 28 | } 29 | \description{ 30 | This function solves the optimization problem needed to create fixed-X and Gaussian SDP knockoffs 31 | on a block-diagonal approximation of the covariance matrix. This will be less 32 | powerful than \code{\link{create.solve_sdp}}, but more computationally efficient. 
33 | } 34 | \details{ 35 | Solves the following two-step semidefinite program: 36 | 37 | (step 1) \deqn{ \mathrm{maximize} \; \mathrm{sum}(s) \quad 38 | \mathrm{subject} \; \mathrm{to:} \; 0 \leq s \leq 1, \; 39 | 2 \Sigma_{\mathrm{approx}} - \mathrm{diag}(s) \geq 0} 40 | 41 | (step 2) \deqn{ \mathrm{maximize} \; \gamma \quad 42 | \mathrm{subject} \; \mathrm{to:} \; \mathrm{diag}(\gamma s) \leq 2 \Sigma} 43 | 44 | Each smaller SDP is solved using the interior-point method implemented in \code{\link[Rdsdp]{dsdp}}. 45 | 46 | The parameter max.size controls the size of the largest semidefinite program that needs to be solved. 47 | A larger value of max.size will increase the computation cost, while yielding a solution closer to 48 | that of the original semidefinite program. 49 | 50 | If the matrix Sigma supplied by the user is a non-scaled covariance matrix 51 | (i.e. its diagonal entries are not all equal to 1), then the appropriate scaling is applied before 52 | solving the SDP defined above. The result is then scaled back before being returned, so as to match 53 | the original scaling of the covariance matrix supplied by the user. 54 | } 55 | \seealso{ 56 | Other optimization: 57 | \code{\link{create.solve_equi}()}, 58 | \code{\link{create.solve_sdp}()} 59 | } 60 | \concept{optimization} 61 | -------------------------------------------------------------------------------- /R/knockoff/man/create.solve_equi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_equi.R 3 | \name{create.solve_equi} 4 | \alias{create.solve_equi} 5 | \title{Optimization for equi-correlated fixed-X and Gaussian knockoffs} 6 | \usage{ 7 | create.solve_equi(Sigma) 8 | } 9 | \arguments{ 10 | \item{Sigma}{positive-definite p-by-p covariance matrix.} 11 | } 12 | \value{ 13 | The solution \eqn{s} to the optimization problem defined above.
14 | } 15 | \description{ 16 | This function solves a very simple optimization problem needed to create fixed-X and 17 | Gaussian equi-correlated knockoffs on the full covariance matrix. This may be significantly 18 | less powerful than \code{\link{create.solve_sdp}}. 19 | } 20 | \details{ 21 | Computes the closed-form solution to the semidefinite programming problem: 22 | \deqn{ \mathrm{maximize} \; s \quad 23 | \mathrm{subject} \; \mathrm{to:} \; 0 \leq s \leq 1, \; 24 | 2\Sigma - sI \geq 0 } 25 | used to generate equi-correlated knockoffs. 26 | 27 | The closed-form solution to this problem is \eqn{s = 2\lambda_{\mathrm{min}}(\Sigma) \land 1}. 28 | } 29 | \seealso{ 30 | Other optimization: 31 | \code{\link{create.solve_asdp}()}, 32 | \code{\link{create.solve_sdp}()} 33 | } 34 | \concept{optimization} 35 | -------------------------------------------------------------------------------- /R/knockoff/man/create.solve_sdp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_sdp.R 3 | \name{create.solve_sdp} 4 | \alias{create.solve_sdp} 5 | \title{Optimization for fixed-X and Gaussian knockoffs} 6 | \usage{ 7 | create.solve_sdp(Sigma, gaptol = 1e-06, maxit = 1000, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{Sigma}{positive-definite p-by-p covariance matrix.} 11 | 12 | \item{gaptol}{tolerance for duality gap as a fraction of the value of the objective functions (default: 1e-6).} 13 | 14 | \item{maxit}{maximum number of iterations for the solver (default: 1000).} 15 | 16 | \item{verbose}{whether to display progress (default: FALSE).} 17 | } 18 | \value{ 19 | The solution \eqn{s} to the semidefinite programming problem defined above. 20 | } 21 | \description{ 22 | This function solves the optimization problem needed to create fixed-X and Gaussian SDP knockoffs 23 | on the full covariance matrix.
 This will be more powerful than \code{\link{create.solve_asdp}}, 24 | but more computationally expensive. 25 | } 26 | \details{ 27 | Solves the semidefinite programming problem: 28 | 29 | \deqn{ \mathrm{maximize} \; \mathrm{sum}(s) \quad 30 | \mathrm{subject} \; \mathrm{to:} \; 0 \leq s \leq 1, \; 31 | 2\Sigma - \mathrm{diag}(s) \geq 0} 32 | 33 | This problem is solved using the interior-point method implemented in \code{\link[Rdsdp]{dsdp}}. 34 | 35 | If the matrix Sigma supplied by the user is a non-scaled covariance matrix 36 | (i.e. its diagonal entries are not all equal to 1), then the appropriate scaling is applied before 37 | solving the SDP defined above. The result is then scaled back before being returned, so as to match 38 | the original scaling of the covariance matrix supplied by the user. 39 | } 40 | \seealso{ 41 | Other optimization: 42 | \code{\link{create.solve_asdp}()}, 43 | \code{\link{create.solve_equi}()} 44 | } 45 | \concept{optimization} 46 | -------------------------------------------------------------------------------- /R/knockoff/man/create_equicorrelated.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_fixed.R 3 | \name{create_equicorrelated} 4 | \alias{create_equicorrelated} 5 | \title{Create equicorrelated fixed-X knockoffs.} 6 | \usage{ 7 | create_equicorrelated(X, randomize) 8 | } 9 | \description{ 10 | Create equicorrelated fixed-X knockoffs.
11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/knockoff/man/create_sdp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_fixed.R 3 | \name{create_sdp} 4 | \alias{create_sdp} 5 | \title{Create SDP fixed-X knockoffs.} 6 | \usage{ 7 | create_sdp(X, randomize) 8 | } 9 | \description{ 10 | Create SDP fixed-X knockoffs. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/knockoff/man/decompose.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_fixed.R 3 | \name{decompose} 4 | \alias{decompose} 5 | \title{Compute the SVD of X and construct an orthogonal matrix U_perp such that U_perp * U = 0.} 6 | \usage{ 7 | decompose(X, randomize) 8 | } 9 | \description{ 10 | Compute the SVD of X and construct an orthogonal matrix U_perp such that U_perp * U = 0. 
11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/knockoff/man/divide.sdp.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_asdp.R 3 | \name{divide.sdp} 4 | \alias{divide.sdp} 5 | \title{Approximate a covariance matrix by a block diagonal matrix with blocks 6 | of approximately equal size using Ward's method for hierarchical clustering} 7 | \usage{ 8 | divide.sdp(Sigma, max.size) 9 | } 10 | \description{ 11 | Approximate a covariance matrix by a block diagonal matrix with blocks 12 | of approximately equal size using Ward's method for hierarchical clustering 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /R/knockoff/man/fs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_forward_selection.R 3 | \name{fs} 4 | \alias{fs} 5 | \title{Forward selection} 6 | \usage{ 7 | fs(X, y, omp = FALSE) 8 | } 9 | \arguments{ 10 | \item{X}{matrix of predictors} 11 | 12 | \item{y}{response vector} 13 | 14 | \item{omp}{whether to use orthogonal matching pursuit (OMP)} 15 | } 16 | \value{ 17 | vector with jth component the variable added at step j 18 | } 19 | \description{ 20 | Perform forward variable selection with or without OMP 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /R/knockoff/man/knockoff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knockoff.R, R/knockoff_filter.R 3 | \docType{package} 4 | \name{knockoff} 5 | \alias{knockoff} 6 | \title{knockoff: A package for controlled variable selection} 7 
| \description{ 8 | This package implements the Knockoff Filter, which is a powerful and versatile tool for 9 | controlled variable selection. 10 | } 11 | \section{Outline}{ 12 | 13 | The procedure is based on the construction of artificial 'knockoff copies' of the variables 14 | present in the given statistical model. Then, it selects those variables that are clearly better 15 | than their corresponding knockoffs, based on some measure of variable importance. 16 | A wide range of statistics and machine learning tools can be exploited to estimate the 17 | importance of each variable, while guaranteeing finite-sample control of the false 18 | discovery rate (FDR). 19 | 20 | The Knockoff Filter controls the FDR in either of two statistical scenarios: 21 | \itemize{ 22 | \item{The "model-X" scenario: }{the response \eqn{Y} can depend on the variables \eqn{X=(X_1,\ldots,X_p)} 23 | in an arbitrary and unknown fashion, but the distribution of \eqn{X} must be known. In this case 24 | there are no constraints on the dimensions \eqn{n} and \eqn{p} of the problem.} 25 | \item{The "fixed-X" scenario: }{the response \eqn{Y} depends upon \eqn{X} through a 26 | homoscedastic Gaussian linear model and the problem is low-dimensional (\eqn{n \geq p}). 27 | In this case, no modeling assumptions on \eqn{X} are required. } 28 | } 29 | 30 | For more information, see the website below and the accompanying paper.
31 | 32 | \url{https://web.stanford.edu/group/candes/knockoffs/index.html} 33 | } 34 | 35 | -------------------------------------------------------------------------------- /R/knockoff/man/knockoff.threshold.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knockoff_filter.R 3 | \name{knockoff.threshold} 4 | \alias{knockoff.threshold} 5 | \title{Threshold for the knockoff filter} 6 | \usage{ 7 | knockoff.threshold(W, fdr = 0.1, offset = 1) 8 | } 9 | \arguments{ 10 | \item{W}{the test statistics} 11 | 12 | \item{fdr}{target false discovery rate (default: 0.1)} 13 | 14 | \item{offset}{either 0 or 1 (default: 1). The offset used to compute the rejection threshold on the 15 | statistics. The value 1 yields a slightly more conservative procedure ("knockoffs+") that 16 | controls the FDR according to the usual definition, while an offset of 0 controls a modified FDR.} 17 | } 18 | \value{ 19 | The threshold for variable selection. 20 | } 21 | \description{ 22 | Computes the threshold for the knockoff filter. 23 | } 24 | -------------------------------------------------------------------------------- /R/knockoff/man/lasso_max_lambda.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_glmnet.R 3 | \name{lasso_max_lambda} 4 | \alias{lasso_max_lambda} 5 | \title{Maximum lambda in lasso model} 6 | \usage{ 7 | lasso_max_lambda(X, y, method = c("glmnet", "lars"), ...) 8 | } 9 | \arguments{ 10 | \item{X}{matrix of predictors} 11 | 12 | \item{y}{response vector} 13 | 14 | \item{method}{either 'glmnet' or 'lars'} 15 | } 16 | \value{ 17 | vector of maximum lambdas 18 | } 19 | \description{ 20 | Computes the earliest (largest) lambdas for which predictors enter the 21 | lasso model.
22 | } 23 | \keyword{internal} 24 | -------------------------------------------------------------------------------- /R/knockoff/man/merge.clusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_asdp.R 3 | \name{merge.clusters} 4 | \alias{merge.clusters} 5 | \title{Merge consecutive clusters of correlated variables while ensuring 6 | that no cluster has size larger than max.size} 7 | \usage{ 8 | \method{merge}{clusters}(clusters, max.size) 9 | } 10 | \description{ 11 | Merge consecutive clusters of correlated variables while ensuring 12 | that no cluster has size larger than max.size 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /R/knockoff/man/print.knockoff.result.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knockoff_filter.R 3 | \name{print.knockoff.result} 4 | \alias{print.knockoff.result} 5 | \title{Print results for the knockoff filter} 6 | \usage{ 7 | \method{print}{knockoff.result}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{the output of a call to knockoff.filter} 11 | 12 | \item{...}{unused} 13 | } 14 | \description{ 15 | Prints the list of variables selected by the knockoff filter and the corresponding function call. 16 | } 17 | -------------------------------------------------------------------------------- /R/knockoff/man/stability_selection_importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_stability_selection.R 3 | \name{stability_selection_importance} 4 | \alias{stability_selection_importance} 5 | \title{Stability selection} 6 | \usage{ 7 | stability_selection_importance(X, y, ...) 
8 | } 9 | \arguments{ 10 | \item{X}{matrix of predictors} 11 | 12 | \item{y}{response vector} 13 | } 14 | \value{ 15 | vector with jth component the selection probability of variable j 16 | } 17 | \description{ 18 | Perform variable selection with stability selection 19 | } 20 | \keyword{internal} 21 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.forward_selection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_forward_selection.R 3 | \name{stat.forward_selection} 4 | \alias{stat.forward_selection} 5 | \title{Importance statistics based on forward selection} 6 | \usage{ 7 | stat.forward_selection(X, X_k, y, omp = F) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{numeric vector of length n, containing the response variables.} 15 | 16 | \item{omp}{whether to use orthogonal matching pursuit (default: F).} 17 | } 18 | \value{ 19 | A vector of statistics \eqn{W} of length p. 20 | } 21 | \description{ 22 | Computes the statistic 23 | \deqn{W_j = \max(Z_j, Z_{j+p}) \cdot \mathrm{sgn}(Z_j - Z_{j+p}),} 24 | where \eqn{Z_1,\dots,Z_{2p}} give the reverse order in which the 2p 25 | variables (the originals and the knockoffs) enter the forward selection 26 | model. 27 | See the Details for information about forward selection. 28 | } 29 | \details{ 30 | In \emph{forward selection}, the variables are chosen iteratively to maximize 31 | the inner product with the residual from the previous step. The initial 32 | residual is always \code{y}. 
In standard forward selection 33 | (\code{stat.forward_selection}), the next residual is the remainder after 34 | regressing on the selected variable; when orthogonal matching pursuit 35 | is used, the next residual is the remainder 36 | after regressing on \emph{all} the previously selected variables. 37 | } 38 | \examples{ 39 | set.seed(2022) 40 | p=100; n=100; k=15 41 | mu = rep(0,p); Sigma = diag(p) 42 | X = matrix(rnorm(n*p),n) 43 | nonzero = sample(p, k) 44 | beta = 3.5 * (1:p \%in\% nonzero) 45 | y = X \%*\% beta + rnorm(n) 46 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 47 | 48 | # Basic usage with default arguments 49 | result = knockoff.filter(X, y, knockoffs=knockoffs, 50 | statistic=stat.forward_selection) 51 | print(result$selected) 52 | 53 | # Advanced usage with custom arguments 54 | foo = stat.forward_selection 55 | k_stat = function(X, X_k, y) foo(X, X_k, y, omp=TRUE) 56 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 57 | print(result$selected) 58 | 59 | } 60 | \seealso{ 61 | Other statistics: 62 | \code{\link{stat.glmnet_coefdiff}()}, 63 | \code{\link{stat.glmnet_lambdadiff}()}, 64 | \code{\link{stat.lasso_coefdiff_bin}()}, 65 | \code{\link{stat.lasso_coefdiff}()}, 66 | \code{\link{stat.lasso_lambdadiff_bin}()}, 67 | \code{\link{stat.lasso_lambdadiff}()}, 68 | \code{\link{stat.random_forest}()}, 69 | \code{\link{stat.sqrt_lasso}()}, 70 | \code{\link{stat.stability_selection}()} 71 | } 72 | \concept{statistics} 73 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.glmnet_coefdiff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_glmnet_cv.R 3 | \name{stat.glmnet_coefdiff} 4 | \alias{stat.glmnet_coefdiff} 5 | \title{Importance statistics based on a GLM with cross-validation} 6 | \usage{ 7 | stat.glmnet_coefdiff(X, X_k, y, family = 
"gaussian", cores = 2, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. Quantitative for family="gaussian", 15 | or family="poisson" (non-negative counts). For family="binomial" 16 | should be either a factor with two levels, or a two-column matrix of counts 17 | or proportions (the second column is treated as the target class; for a factor, 18 | the last level in alphabetical order is the target class). For family="multinomial", 19 | can be a nc>=2 level factor, or a matrix with nc columns of counts or proportions. 20 | For either "binomial" or "multinomial", if y is presented as a vector, it will 21 | be coerced into a factor. For family="cox", y should be a two-column matrix with 22 | columns named 'time' and 'status'. The latter is a binary variable, with '1' 23 | indicating death, and '0' indicating right censored. The function Surv() in 24 | package survival produces such a matrix. For family="mgaussian", y is a matrix 25 | of quantitative responses.} 26 | 27 | \item{family}{response type (see above).} 28 | 29 | \item{cores}{Number of cores used to compute the statistics by running cv.glmnet. 30 | Unless otherwise specified, the number of cores is set equal to two (if available).} 31 | 32 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 33 | } 34 | \value{ 35 | A vector of statistics \eqn{W} of length p. 36 | } 37 | \description{ 38 | Fits a generalized linear model via penalized maximum likelihood and cross-validation. 39 | Then, compute the difference statistic 40 | \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 41 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the coefficient estimates for the 42 | jth variable and its knockoff, respectively. The value of the regularization 43 | parameter \eqn{\lambda} is selected by cross-validation and computed with \code{glmnet}. 
44 | } 45 | \details{ 46 | This function uses the \code{glmnet} package to fit a generalized linear model 47 | via penalized maximum likelihood. 48 | 49 | The statistics \eqn{W_j} are constructed by taking the difference 50 | between the coefficient of the j-th variable and its knockoff. 51 | 52 | By default, the value of the regularization parameter is chosen by 10-fold cross-validation. 53 | 54 | The default response family is 'gaussian', for a linear regression model. 55 | Different response families (e.g. 'binomial') can be specified by passing an 56 | optional parameter 'family'. 57 | 58 | The optional \code{nlambda} parameter can be used to control the granularity of the 59 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 60 | 61 | 62 | If the family is 'binomial' and a lambda sequence is not provided by the user, 63 | this function generates it on a log-linear scale before calling 'glmnet'. 64 | 65 | For a complete list of the available additional arguments, see \code{\link[glmnet]{cv.glmnet}} 66 | and \code{\link[glmnet]{glmnet}}.
67 | } 68 | \examples{ 69 | set.seed(2022) 70 | p=200; n=100; k=15 71 | mu = rep(0,p); Sigma = diag(p) 72 | X = matrix(rnorm(n*p),n) 73 | nonzero = sample(p, k) 74 | beta = 3.5 * (1:p \%in\% nonzero) 75 | y = X \%*\% beta + rnorm(n) 76 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 77 | 78 | # Basic usage with default arguments 79 | result = knockoff.filter(X, y, knockoffs=knockoffs, 80 | statistic=stat.glmnet_coefdiff) 81 | print(result$selected) 82 | 83 | # Advanced usage with custom arguments 84 | foo = stat.glmnet_coefdiff 85 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 86 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 87 | print(result$selected) 88 | 89 | } 90 | \seealso{ 91 | Other statistics: 92 | \code{\link{stat.forward_selection}()}, 93 | \code{\link{stat.glmnet_lambdadiff}()}, 94 | \code{\link{stat.lasso_coefdiff_bin}()}, 95 | \code{\link{stat.lasso_coefdiff}()}, 96 | \code{\link{stat.lasso_lambdadiff_bin}()}, 97 | \code{\link{stat.lasso_lambdadiff}()}, 98 | \code{\link{stat.random_forest}()}, 99 | \code{\link{stat.sqrt_lasso}()}, 100 | \code{\link{stat.stability_selection}()} 101 | } 102 | \concept{statistics} 103 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.glmnet_lambdadiff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_glmnet.R 3 | \name{stat.glmnet_lambdadiff} 4 | \alias{stat.glmnet_lambdadiff} 5 | \title{Importance statistics based on a GLM} 6 | \usage{ 7 | stat.glmnet_lambdadiff(X, X_k, y, family = "gaussian", ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. Quantitative for family="gaussian", 15 | or family="poisson" (non-negative counts). 
For family="binomial" 16 | should be either a factor with two levels, or a two-column matrix of counts 17 | or proportions (the second column is treated as the target class; for a factor, 18 | the last level in alphabetical order is the target class). For family="multinomial", 19 | can be a nc>=2 level factor, or a matrix with nc columns of counts or proportions. 20 | For either "binomial" or "multinomial", if y is presented as a vector, it will 21 | be coerced into a factor. For family="cox", y should be a two-column matrix with 22 | columns named 'time' and 'status'. The latter is a binary variable, with '1' 23 | indicating death, and '0' indicating right censored. The function Surv() in 24 | package survival produces such a matrix. For family="mgaussian", y is a matrix 25 | of quantitative responses.} 26 | 27 | \item{family}{response type (see above).} 28 | 29 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 30 | } 31 | \value{ 32 | A vector of statistics \eqn{W} of length p. 33 | } 34 | \description{ 35 | Fits a generalized linear model via penalized maximum likelihood and 36 | computes the difference statistic 37 | \deqn{W_j = Z_j - \tilde{Z}_j} 38 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of the 39 | regularization parameter \eqn{\lambda} at which the jth variable 40 | and its knockoff enter the model, respectively. 41 | } 42 | \details{ 43 | This function uses \code{glmnet} to compute the regularization path 44 | on a fine grid of \eqn{\lambda}'s. 45 | 46 | The \code{nlambda} parameter can be used to control the granularity of the 47 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 48 | 49 | If the family is 'binomial' and a lambda sequence is not provided by the user, 50 | this function generates it on a log-linear scale before calling 'glmnet'. 51 | 52 | The default response family is 'gaussian', for a linear regression model. 53 | Different response families (e.g. 
'binomial') can be specified by passing an 54 | optional parameter 'family'. 55 | 56 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}}. 57 | } 58 | \examples{ 59 | set.seed(2022) 60 | p=200; n=100; k=15 61 | mu = rep(0,p); Sigma = diag(p) 62 | X = matrix(rnorm(n*p),n) 63 | nonzero = sample(p, k) 64 | beta = 3.5 * (1:p \%in\% nonzero) 65 | y = X \%*\% beta + rnorm(n) 66 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 67 | 68 | # Basic usage with default arguments 69 | result = knockoff.filter(X, y, knockoffs=knockoffs, 70 | statistic=stat.glmnet_lambdadiff) 71 | print(result$selected) 72 | 73 | # Advanced usage with custom arguments 74 | foo = stat.glmnet_lambdadiff 75 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 76 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 77 | print(result$selected) 78 | 79 | } 80 | \seealso{ 81 | Other statistics: 82 | \code{\link{stat.forward_selection}()}, 83 | \code{\link{stat.glmnet_coefdiff}()}, 84 | \code{\link{stat.lasso_coefdiff_bin}()}, 85 | \code{\link{stat.lasso_coefdiff}()}, 86 | \code{\link{stat.lasso_lambdadiff_bin}()}, 87 | \code{\link{stat.lasso_lambdadiff}()}, 88 | \code{\link{stat.random_forest}()}, 89 | \code{\link{stat.sqrt_lasso}()}, 90 | \code{\link{stat.stability_selection}()} 91 | } 92 | \concept{statistics} 93 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.glmnet_lambdasmax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_glmnet.R 3 | \name{stat.glmnet_lambdasmax} 4 | \alias{stat.glmnet_lambdasmax} 5 | \title{GLM statistics for knockoff} 6 | \usage{ 7 | stat.glmnet_lambdasmax(X, X_k, y, family = "gaussian", ...) 
8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. Quantitative for family="gaussian", 15 | or family="poisson" (non-negative counts). For family="binomial" 16 | should be either a factor with two levels, or a two-column matrix of counts 17 | or proportions (the second column is treated as the target class; for a factor, 18 | the last level in alphabetical order is the target class). For family="multinomial", 19 | can be a nc>=2 level factor, or a matrix with nc columns of counts or proportions. 20 | For either "binomial" or "multinomial", if y is presented as a vector, it will 21 | be coerced into a factor. For family="cox", y should be a two-column matrix with 22 | columns named 'time' and 'status'. The latter is a binary variable, with '1' 23 | indicating death, and '0' indicating right censored. The function Surv() in 24 | package survival produces such a matrix. For family="mgaussian", y is a matrix 25 | of quantitative responses.} 26 | 27 | \item{family}{response type (see above).} 28 | 29 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 30 | } 31 | \value{ 32 | A vector of statistics \eqn{W} of length p. 33 | } 34 | \description{ 35 | Computes the signed maximum statistic 36 | \deqn{W_j = \max(Z_j, \tilde{Z}_j) \cdot \mathrm{sgn}(Z_j - \tilde{Z}_j),} 37 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of 38 | \eqn{\lambda} at which the jth variable and its knockoff, respectively, 39 | enter the generalized linear model. 40 | } 41 | \details{ 42 | This function uses \code{glmnet} to compute the regularization path 43 | on a fine grid of \eqn{\lambda}'s. 44 | 45 | The additional \code{nlambda} 46 | parameter can be used to control the granularity of the grid of \eqn{\lambda} values. 47 | The default value of \code{nlambda} is \code{500}. 
48 | 49 | If the family is 'binomial' and a lambda sequence is not provided by the user, 50 | this function generates it on a log-linear scale before calling 'glmnet'. 51 | 52 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}}. 53 | } 54 | \examples{ 55 | p=200; n=100; k=15 56 | mu = rep(0,p); Sigma = diag(p) 57 | X = matrix(rnorm(n*p),n) 58 | nonzero = sample(p, k) 59 | beta = 3.5 * (1:p \%in\% nonzero) 60 | y = X \%*\% beta + rnorm(n) 61 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 62 | 63 | # Basic usage with default arguments 64 | result = knockoff.filter(X, y, knockoffs=knockoffs, 65 | statistic=stat.glmnet_lambdasmax) 66 | print(result$selected) 67 | 68 | # Advanced usage with custom arguments 69 | foo = stat.glmnet_lambdasmax 70 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 71 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 72 | print(result$selected) 73 | 74 | } 75 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_coefdiff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso_cv.R 3 | \name{stat.lasso_coefdiff} 4 | \alias{stat.lasso_coefdiff} 5 | \title{Importance statistics based on the lasso with cross-validation} 6 | \usage{ 7 | stat.lasso_coefdiff(X, X_k, y, cores = 2, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. It should be numeric.} 15 | 16 | \item{cores}{Number of cores used to compute the statistics by running cv.glmnet.
17 | If not specified, the number of cores is set to approximately half of the number of cores 18 | detected by the parallel package.} 19 | 20 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 21 | } 22 | \value{ 23 | A vector of statistics \eqn{W} of length p. 24 | } 25 | \description{ 26 | Fits a linear regression model via penalized maximum likelihood and cross-validation. 27 | Then, computes the difference statistic 28 | \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 29 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the coefficient estimates for the 30 | jth variable and its knockoff, respectively. The value of the regularization 31 | parameter \eqn{\lambda} is selected by cross-validation and computed with \code{glmnet}. 32 | } 33 | \details{ 34 | This function uses the \code{glmnet} package to fit the lasso path and 35 | is a wrapper around the more general \link{stat.glmnet_coefdiff}. 36 | 37 | The statistics \eqn{W_j} are constructed by taking the difference 38 | between the coefficient of the j-th variable and its knockoff. 39 | 40 | By default, the value of the regularization parameter is chosen by 10-fold cross-validation. 41 | 42 | The optional \code{nlambda} parameter can be used to control the granularity of the 43 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 44 | 45 | 46 | Unless a lambda sequence is provided by the user, this function generates it on a 47 | log-linear scale before calling 'glmnet' (default 'nlambda': 500). 48 | 49 | For a complete list of the available additional arguments, see \code{\link[glmnet]{cv.glmnet}} 50 | and \code{\link[glmnet]{glmnet}}.
51 | } 52 | \examples{ 53 | set.seed(2022) 54 | p=200; n=100; k=15 55 | mu = rep(0,p); Sigma = diag(p) 56 | X = matrix(rnorm(n*p),n) 57 | nonzero = sample(p, k) 58 | beta = 3.5 * (1:p \%in\% nonzero) 59 | y = X \%*\% beta + rnorm(n) 60 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 61 | 62 | # Basic usage with default arguments 63 | result = knockoff.filter(X, y, knockoffs=knockoffs, 64 | statistic=stat.lasso_coefdiff) 65 | print(result$selected) 66 | 67 | # Advanced usage with custom arguments 68 | foo = stat.lasso_coefdiff 69 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 70 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 71 | print(result$selected) 72 | 73 | } 74 | \seealso{ 75 | Other statistics: 76 | \code{\link{stat.forward_selection}()}, 77 | \code{\link{stat.glmnet_coefdiff}()}, 78 | \code{\link{stat.glmnet_lambdadiff}()}, 79 | \code{\link{stat.lasso_coefdiff_bin}()}, 80 | \code{\link{stat.lasso_lambdadiff_bin}()}, 81 | \code{\link{stat.lasso_lambdadiff}()}, 82 | \code{\link{stat.random_forest}()}, 83 | \code{\link{stat.sqrt_lasso}()}, 84 | \code{\link{stat.stability_selection}()} 85 | } 86 | \concept{statistics} 87 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_coefdiff_bin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso_cv_bin.R 3 | \name{stat.lasso_coefdiff_bin} 4 | \alias{stat.lasso_coefdiff_bin} 5 | \title{Importance statistics based on regularized logistic regression with cross-validation} 6 | \usage{ 7 | stat.lasso_coefdiff_bin(X, X_k, y, cores = 2, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables.
It should be either a factor with two levels, 15 | or a two-column matrix of counts or proportions 16 | (the second column is treated as the target class; for a factor, the last level 17 | in alphabetical order is the target class). If y is presented as a vector, 18 | it will be coerced into a factor.} 19 | 20 | \item{cores}{Number of cores used to compute the statistics by running cv.glmnet. 21 | If not specified, the number of cores is set to approximately half of the number of cores 22 | detected by the parallel package.} 23 | 24 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 25 | } 26 | \value{ 27 | A vector of statistics \eqn{W} of length p. 28 | } 29 | \description{ 30 | Fits a logistic regression model via penalized maximum likelihood and cross-validation. 31 | Then, computes the difference statistic 32 | \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 33 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the coefficient estimates for the 34 | jth variable and its knockoff, respectively. The value of the regularization 35 | parameter \eqn{\lambda} is selected by cross-validation and computed with \code{glmnet}. 36 | } 37 | \details{ 38 | This function uses the \code{glmnet} package to fit the penalized logistic regression path 39 | and is a wrapper around the more general \code{\link{stat.glmnet_coefdiff}}. 40 | 41 | The statistics \eqn{W_j} are constructed by taking the difference 42 | between the coefficient of the j-th variable and its knockoff. 43 | 44 | By default, the value of the regularization parameter is chosen by 10-fold cross-validation. 45 | 46 | The optional \code{nlambda} parameter can be used to control the granularity of the 47 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 48 | 49 | 50 | For a complete list of the available additional arguments, see \code{\link[glmnet]{cv.glmnet}} 51 | and \code{\link[glmnet]{glmnet}}.
52 | } 53 | \examples{ 54 | set.seed(2022) 55 | p=200; n=100; k=15 56 | mu = rep(0,p); Sigma = diag(p) 57 | X = matrix(rnorm(n*p),n) 58 | nonzero = sample(p, k) 59 | beta = 3.5 * (1:p \%in\% nonzero) 60 | pr = 1/(1+exp(-X \%*\% beta)) 61 | y = rbinom(n,1,pr) 62 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 63 | 64 | # Basic usage with default arguments 65 | result = knockoff.filter(X, y, knockoffs=knockoffs, 66 | statistic=stat.lasso_coefdiff_bin) 67 | print(result$selected) 68 | 69 | # Advanced usage with custom arguments 70 | foo = stat.lasso_coefdiff_bin 71 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 72 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 73 | print(result$selected) 74 | 75 | } 76 | \seealso{ 77 | Other statistics: 78 | \code{\link{stat.forward_selection}()}, 79 | \code{\link{stat.glmnet_coefdiff}()}, 80 | \code{\link{stat.glmnet_lambdadiff}()}, 81 | \code{\link{stat.lasso_coefdiff}()}, 82 | \code{\link{stat.lasso_lambdadiff_bin}()}, 83 | \code{\link{stat.lasso_lambdadiff}()}, 84 | \code{\link{stat.random_forest}()}, 85 | \code{\link{stat.sqrt_lasso}()}, 86 | \code{\link{stat.stability_selection}()} 87 | } 88 | \concept{statistics} 89 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_lambdadiff.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso.R 3 | \name{stat.lasso_lambdadiff} 4 | \alias{stat.lasso_lambdadiff} 5 | \title{Importance statistics based on the lasso} 6 | \usage{ 7 | stat.lasso_lambdadiff(X, X_k, y, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. 
It should be numeric.} 15 | 16 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 17 | } 18 | \value{ 19 | A vector of statistics \eqn{W} of length p. 20 | } 21 | \description{ 22 | Fits the lasso path and computes the difference statistic 23 | \deqn{W_j = Z_j - \tilde{Z}_j} 24 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of the 25 | regularization parameter \eqn{\lambda} at which the jth variable 26 | and its knockoff enter the penalized linear regression model, respectively. 27 | } 28 | \details{ 29 | This function uses \code{glmnet} to compute the lasso path 30 | on a fine grid of \eqn{\lambda}'s and is a wrapper around the more general 31 | \link{stat.glmnet_lambdadiff}. 32 | 33 | The \code{nlambda} parameter can be used to control the granularity of the 34 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 35 | 36 | Unless a lambda sequence is provided by the user, this function generates it on a 37 | log-linear scale before calling \code{glmnet}. 38 | 39 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}} 40 | or \code{\link[lars]{lars}}.
41 | } 42 | \examples{ 43 | set.seed(2022) 44 | p=200; n=100; k=15 45 | mu = rep(0,p); Sigma = diag(p) 46 | X = matrix(rnorm(n*p),n) 47 | nonzero = sample(p, k) 48 | beta = 3.5 * (1:p \%in\% nonzero) 49 | y = X \%*\% beta + rnorm(n) 50 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 51 | 52 | # Basic usage with default arguments 53 | result = knockoff.filter(X, y, knockoffs=knockoffs, 54 | statistic=stat.lasso_lambdadiff) 55 | print(result$selected) 56 | 57 | # Advanced usage with custom arguments 58 | foo = stat.lasso_lambdadiff 59 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 60 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 61 | print(result$selected) 62 | 63 | } 64 | \seealso{ 65 | Other statistics: 66 | \code{\link{stat.forward_selection}()}, 67 | \code{\link{stat.glmnet_coefdiff}()}, 68 | \code{\link{stat.glmnet_lambdadiff}()}, 69 | \code{\link{stat.lasso_coefdiff_bin}()}, 70 | \code{\link{stat.lasso_coefdiff}()}, 71 | \code{\link{stat.lasso_lambdadiff_bin}()}, 72 | \code{\link{stat.random_forest}()}, 73 | \code{\link{stat.sqrt_lasso}()}, 74 | \code{\link{stat.stability_selection}()} 75 | } 76 | \concept{statistics} 77 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_lambdadiff_bin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso_bin.R 3 | \name{stat.lasso_lambdadiff_bin} 4 | \alias{stat.lasso_lambdadiff_bin} 5 | \title{Importance statistics based on regularized logistic regression} 6 | \usage{ 7 | stat.lasso_lambdadiff_bin(X, X_k, y, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. 
It should be either a factor with two levels, 15 | or a two-column matrix of counts or proportions 16 | (the second column is treated as the target class; for a factor, the last level 17 | in alphabetical order is the target class). If y is presented as a vector, 18 | it will be coerced into a factor.} 19 | 20 | \item{...}{additional arguments specific to \code{glmnet} (see Details).} 21 | } 22 | \value{ 23 | A vector of statistics \eqn{W} of length p. 24 | } 25 | \description{ 26 | Fits the lasso path and computes the difference statistic 27 | \deqn{W_j = Z_j - \tilde{Z}_j} 28 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of the 29 | regularization parameter \eqn{\lambda} at which the jth variable 30 | and its knockoff enter the penalized logistic regression model, respectively. 31 | } 32 | \details{ 33 | This function uses \code{glmnet} to compute the lasso path 34 | on a fine grid of \eqn{\lambda}'s. 35 | 36 | The \code{nlambda} parameter can be used to control the granularity of the 37 | grid of \eqn{\lambda}'s. The default value of \code{nlambda} is \code{500}. 38 | 39 | This function is a wrapper around the more general \code{\link{stat.glmnet_lambdadiff}}. 40 | 41 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}} 42 | or \code{\link[lars]{lars}}.
43 | } 44 | \examples{ 45 | set.seed(2022) 46 | p=200; n=100; k=15 47 | mu = rep(0,p); Sigma = diag(p) 48 | X = matrix(rnorm(n*p),n) 49 | nonzero = sample(p, k) 50 | beta = 3.5 * (1:p \%in\% nonzero) 51 | pr = 1/(1+exp(-X \%*\% beta)) 52 | y = rbinom(n,1,pr) 53 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 54 | 55 | # Basic usage with default arguments 56 | result = knockoff.filter(X, y, knockoffs=knockoffs, 57 | statistic=stat.lasso_lambdadiff_bin) 58 | print(result$selected) 59 | 60 | # Advanced usage with custom arguments 61 | foo = stat.lasso_lambdadiff_bin 62 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 63 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 64 | print(result$selected) 65 | 66 | } 67 | \seealso{ 68 | Other statistics: 69 | \code{\link{stat.forward_selection}()}, 70 | \code{\link{stat.glmnet_coefdiff}()}, 71 | \code{\link{stat.glmnet_lambdadiff}()}, 72 | \code{\link{stat.lasso_coefdiff_bin}()}, 73 | \code{\link{stat.lasso_coefdiff}()}, 74 | \code{\link{stat.lasso_lambdadiff}()}, 75 | \code{\link{stat.random_forest}()}, 76 | \code{\link{stat.sqrt_lasso}()}, 77 | \code{\link{stat.stability_selection}()} 78 | } 79 | \concept{statistics} 80 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_lambdasmax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso.R 3 | \name{stat.lasso_lambdasmax} 4 | \alias{stat.lasso_lambdasmax} 5 | \title{Penalized linear regression statistics for knockoff} 6 | \usage{ 7 | stat.lasso_lambdasmax(X, X_k, y, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. 
It should be numeric.} 15 | 16 | \item{...}{additional arguments specific to \code{glmnet} or \code{lars} (see Details).} 17 | } 18 | \value{ 19 | A vector of statistics \eqn{W} of length p. 20 | } 21 | \description{ 22 | Computes the signed maximum statistic 23 | \deqn{W_j = \max(Z_j, \tilde{Z}_j) \cdot \mathrm{sgn}(Z_j - \tilde{Z}_j),} 24 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of 25 | \eqn{\lambda} at which the jth variable and its knockoff, respectively, 26 | enter the penalized linear regression model. 27 | } 28 | \details{ 29 | This function uses \code{glmnet} to compute the regularization path 30 | on a fine grid of \eqn{\lambda}'s. 31 | 32 | The additional \code{nlambda} 33 | parameter can be used to control the granularity of the grid of \eqn{\lambda} values. 34 | The default value of \code{nlambda} is \code{500}. 35 | 36 | Unless a lambda sequence is provided by the user, this function generates it on a 37 | log-linear scale before calling \code{glmnet}. 38 | 39 | This function is a wrapper around the more general 40 | \code{\link{stat.glmnet_lambdasmax}}. 41 | 42 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}}.
43 | } 44 | \examples{ 45 | p=200; n=100; k=15 46 | mu = rep(0,p); Sigma = diag(p) 47 | X = matrix(rnorm(n*p),n) 48 | nonzero = sample(p, k) 49 | beta = 3.5 * (1:p \%in\% nonzero) 50 | y = X \%*\% beta + rnorm(n) 51 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 52 | 53 | # Basic usage with default arguments 54 | result = knockoff.filter(X, y, knockoffs=knockoffs, 55 | statistic=stat.lasso_lambdasmax) 56 | print(result$selected) 57 | 58 | # Advanced usage with custom arguments 59 | foo = stat.lasso_lambdasmax 60 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 61 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 62 | print(result$selected) 63 | 64 | } 65 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.lasso_lambdasmax_bin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_lasso_bin.R 3 | \name{stat.lasso_lambdasmax_bin} 4 | \alias{stat.lasso_lambdasmax_bin} 5 | \title{Penalized logistic regression statistics for knockoff} 6 | \usage{ 7 | stat.lasso_lambdasmax_bin(X, X_k, y, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. It should be either a factor with two levels, 15 | or a two-column matrix of counts or proportions 16 | (the second column is treated as the target class; for a factor, the last level 17 | in alphabetical order is the target class). If y is presented as a vector, 18 | it will be coerced into a factor.} 19 | 20 | \item{...}{additional arguments specific to \code{glmnet} or \code{lars} (see Details).} 21 | } 22 | \value{ 23 | A vector of statistics \eqn{W} of length p.
24 | } 25 | \description{ 26 | Computes the signed maximum statistic 27 | \deqn{W_j = \max(Z_j, \tilde{Z}_j) \cdot \mathrm{sgn}(Z_j - \tilde{Z}_j),} 28 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of 29 | \eqn{\lambda} at which the jth variable and its knockoff, respectively, 30 | enter the penalized logistic regression model. 31 | } 32 | \details{ 33 | This function uses \code{glmnet} to compute the regularization path 34 | on a fine grid of \eqn{\lambda}'s. 35 | 36 | The additional \code{nlambda} 37 | parameter can be used to control the granularity of the grid of \eqn{\lambda} values. 38 | The default value of \code{nlambda} is \code{500}. 39 | 40 | This function is a wrapper around the more general 41 | \link{stat.glmnet_lambdasmax}. 42 | 43 | For a complete list of the available additional arguments, see \code{\link[glmnet]{glmnet}}. 44 | } 45 | \examples{ 46 | p=200; n=100; k=15 47 | mu = rep(0,p); Sigma = diag(p) 48 | X = matrix(rnorm(n*p),n) 49 | nonzero = sample(p, k) 50 | beta = 3.5 * (1:p \%in\% nonzero) 51 | pr = 1/(1+exp(-X \%*\% beta)) 52 | y = rbinom(n,1,pr) 53 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 54 | 55 | # Basic usage with default arguments 56 | result = knockoff.filter(X, y, knockoffs=knockoffs, 57 | statistic=stat.lasso_lambdasmax_bin) 58 | print(result$selected) 59 | 60 | # Advanced usage with custom arguments 61 | foo = stat.lasso_lambdasmax_bin 62 | k_stat = function(X, X_k, y) foo(X, X_k, y, nlambda=200) 63 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 64 | print(result$selected) 65 | 66 | } 67 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.random_forest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_random_forest.R 3 | \name{stat.random_forest} 4 | \alias{stat.random_forest} 5 | \title{Importance
statistics based on random forests} 6 | \usage{ 7 | stat.random_forest(X, X_k, y, ...) 8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables. If a factor, classification is assumed, 15 | otherwise regression is assumed.} 16 | 17 | \item{...}{additional arguments specific to \code{ranger} (see Details).} 18 | } 19 | \value{ 20 | A vector of statistics \eqn{W} of length p. 21 | } 22 | \description{ 23 | Computes the difference statistic 24 | \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 25 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the random forest feature importances 26 | of the jth variable and its knockoff, respectively. 27 | } 28 | \details{ 29 | This function uses the \code{ranger} package to compute variable 30 | importance measures. The importance of a variable is measured as the total decrease 31 | in node impurities from splitting on that variable, averaged over all trees. 32 | For regression, the node impurity is measured by residual sum of squares. 33 | For classification, it is measured by the Gini index. 34 | 35 | For a complete list of the available additional arguments, see \code{\link[ranger]{ranger}}. 
36 | } 37 | \examples{ 38 | set.seed(2022) 39 | p=200; n=100; k=15 40 | mu = rep(0,p); Sigma = diag(p) 41 | X = matrix(rnorm(n*p),n) 42 | nonzero = sample(p, k) 43 | beta = 3.5 * (1:p \%in\% nonzero) 44 | y = X \%*\% beta + rnorm(n) 45 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 46 | 47 | # Basic usage with default arguments 48 | result = knockoff.filter(X, y, knockoffs=knockoffs, 49 | statistic=stat.random_forest) 50 | print(result$selected) 51 | 52 | # Advanced usage with custom arguments 53 | foo = stat.random_forest 54 | k_stat = function(X, X_k, y) foo(X, X_k, y, min.node.size=5) 55 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 56 | print(result$selected) 57 | 58 | } 59 | \seealso{ 60 | Other statistics: 61 | \code{\link{stat.forward_selection}()}, 62 | \code{\link{stat.glmnet_coefdiff}()}, 63 | \code{\link{stat.glmnet_lambdadiff}()}, 64 | \code{\link{stat.lasso_coefdiff_bin}()}, 65 | \code{\link{stat.lasso_coefdiff}()}, 66 | \code{\link{stat.lasso_lambdadiff_bin}()}, 67 | \code{\link{stat.lasso_lambdadiff}()}, 68 | \code{\link{stat.sqrt_lasso}()}, 69 | \code{\link{stat.stability_selection}()} 70 | } 71 | \concept{statistics} 72 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.sqrt_lasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_sqrt_lasso.R 3 | \name{stat.sqrt_lasso} 4 | \alias{stat.sqrt_lasso} 5 | \title{Importance statistics based on the square-root lasso} 6 | \usage{ 7 | stat.sqrt_lasso(X, X_k, y, ...)
8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{vector of length n, containing the response variables of numeric type.} 15 | 16 | \item{...}{additional arguments specific to \code{sqrt_lasso} (see Details).} 17 | } 18 | \value{ 19 | A vector of statistics \eqn{W} of length p. 20 | } 21 | \description{ 22 | Computes the signed maximum statistic 23 | \deqn{W_j = \max(Z_j, \tilde{Z}_j) \cdot \mathrm{sgn}(Z_j - \tilde{Z}_j),} 24 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} are the maximum values of 25 | \eqn{\lambda} at which the jth variable and its knockoff, respectively, 26 | enter the square-root lasso model. 27 | } 28 | \details{ 29 | With default parameters, this function uses the package \code{RPtests} 30 | to run the square-root lasso. 31 | 32 | 33 | 34 | For a complete list of the available additional arguments, see \code{\link[RPtests]{sqrt_lasso}}.
35 | } 36 | \examples{ 37 | set.seed(2022) 38 | p=50; n=50; k=10 39 | mu = rep(0,p); Sigma = diag(p) 40 | X = matrix(rnorm(n*p),n) 41 | nonzero = sample(p, k) 42 | beta = 3.5 * (1:p \%in\% nonzero) 43 | y = X \%*\% beta + rnorm(n) 44 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 45 | 46 | # Basic usage with default arguments 47 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=stat.sqrt_lasso) 48 | print(result$selected) 49 | 50 | # Advanced usage with custom arguments 51 | foo = stat.sqrt_lasso 52 | k_stat = function(X, X_k, y) foo(X, X_k, y, q=0.5) 53 | result = knockoff.filter(X, y, knockoffs=knockoffs, statistic=k_stat) 54 | print(result$selected) 55 | 56 | } 57 | \seealso{ 58 | Other statistics: 59 | \code{\link{stat.forward_selection}()}, 60 | \code{\link{stat.glmnet_coefdiff}()}, 61 | \code{\link{stat.glmnet_lambdadiff}()}, 62 | \code{\link{stat.lasso_coefdiff_bin}()}, 63 | \code{\link{stat.lasso_coefdiff}()}, 64 | \code{\link{stat.lasso_lambdadiff_bin}()}, 65 | \code{\link{stat.lasso_lambdadiff}()}, 66 | \code{\link{stat.random_forest}()}, 67 | \code{\link{stat.stability_selection}()} 68 | } 69 | \concept{statistics} 70 | -------------------------------------------------------------------------------- /R/knockoff/man/stat.stability_selection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stats_stability_selection.R 3 | \name{stat.stability_selection} 4 | \alias{stat.stability_selection} 5 | \title{Importance statistics based on stability selection} 6 | \usage{ 7 | stat.stability_selection(X, X_k, y, fitfun = stabs::lars.lasso, ...) 
8 | } 9 | \arguments{ 10 | \item{X}{n-by-p matrix of original variables.} 11 | 12 | \item{X_k}{n-by-p matrix of knockoff variables.} 13 | 14 | \item{y}{response vector of length n.} 15 | 16 | \item{fitfun}{a function that takes the arguments x, y as above, 17 | and additionally the number of variables to include in each model q. 18 | The function then needs to fit the model and to return a logical vector 19 | that indicates which variable was selected (among the q selected variables). 20 | The name of the function should be prefixed by 'stabs::'.} 21 | 22 | \item{...}{additional arguments specific to 'stabs' (see Details).} 23 | } 24 | \value{ 25 | A vector of statistics \eqn{W} of length p. 26 | } 27 | \description{ 28 | Computes the difference statistic 29 | \deqn{W_j = |Z_j| - |\tilde{Z}_j|} 30 | where \eqn{Z_j} and \eqn{\tilde{Z}_j} measure the importance 31 | of the jth variable and its knockoff, respectively, based on the 32 | stability of their selection upon subsampling of the data. 33 | } 34 | \details{ 35 | This function uses the \code{stabs} package to compute 36 | variable selection stability. The selection stability of the j-th 37 | variable is defined as its probability of being selected upon random 38 | subsampling of the data. The default method for selecting variables 39 | in each subsampled dataset is \code{\link[stabs]{lars.lasso}}. 40 | 41 | For a complete list of the available additional arguments, see \code{\link[stabs]{stabsel}}.
42 | } 43 | \examples{ 44 | set.seed(2022) 45 | p=50; n=50; k=15 46 | mu = rep(0,p); Sigma = diag(p) 47 | X = matrix(rnorm(n*p),n) 48 | nonzero = sample(p, k) 49 | beta = 3.5 * (1:p \%in\% nonzero) 50 | y = X \%*\% beta + rnorm(n) 51 | knockoffs = function(X) create.gaussian(X, mu, Sigma) 52 | 53 | # Basic usage with default arguments 54 | result = knockoff.filter(X, y, knockoffs=knockoffs, 55 | statistic=stat.stability_selection) 56 | print(result$selected) 57 | 58 | 59 | } 60 | \seealso{ 61 | Other statistics: 62 | \code{\link{stat.forward_selection}()}, 63 | \code{\link{stat.glmnet_coefdiff}()}, 64 | \code{\link{stat.glmnet_lambdadiff}()}, 65 | \code{\link{stat.lasso_coefdiff_bin}()}, 66 | \code{\link{stat.lasso_coefdiff}()}, 67 | \code{\link{stat.lasso_lambdadiff_bin}()}, 68 | \code{\link{stat.lasso_lambdadiff}()}, 69 | \code{\link{stat.random_forest}()}, 70 | \code{\link{stat.sqrt_lasso}()} 71 | } 72 | \concept{statistics} 73 | -------------------------------------------------------------------------------- /R/knockoff/man/vectorize_matrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solve_sdp.R 3 | \name{create.vectorize_matrix} 4 | \alias{create.vectorize_matrix} 5 | \title{Vectorize a matrix into the SCS format} 6 | \usage{ 7 | create.vectorize_matrix(M) 8 | } 9 | \description{ 10 | Vectorize a matrix into the SCS format 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /R/knockoff/man/verify_stat_depends.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/knockoff_filter.R 3 | \name{verify_stat_depends} 4 | \alias{verify_stat_depends} 5 | \title{Verify dependencies for chosen statistics} 6 | \usage{ 7 | verify_stat_depends(statistic) 8 | } 9 | 
\arguments{ 10 | \item{statistic}{the statistic chosen by the user} 11 | } 12 | \description{ 13 | Verify dependencies for chosen statistics 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /R/knockoff/tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(knockoff) 3 | 4 | # Set a random seed for reproducibility when running R CMD CHECK. 5 | # Note that this file is not run by devtools::test, so the tests will be truly 6 | # random when run from RStudio. 7 | set.seed(1234) 8 | 9 | # Run the test suite. 10 | test_check('knockoff') 11 | 12 | # Reset the random seed. 13 | rm(.Random.seed) -------------------------------------------------------------------------------- /R/knockoff/tests/testthat/test_create.R: -------------------------------------------------------------------------------- 1 | test_that('Fixed-design equicorrelated knockoffs have the right correlation structure', { 2 | n = 20; p = 10 3 | X = knockoff:::normc(knockoff:::rnorm_matrix(n,p)) 4 | knock_variables_default = create.fixed(X, method='equi', randomize=F) 5 | knock_variables_randomized = create.fixed(X, method='equi', randomize=T) 6 | X = knock_variables_default$X 7 | Xko_default = knock_variables_default$Xk 8 | Xko_randomized = knock_variables_randomized$Xk 9 | 10 | G = t(X) %*% X 11 | s = min(2*min(eigen(G)$values), 1) 12 | for (Xko in list(Xko_default, Xko_randomized)) { 13 | expect_equal(t(Xko) %*% Xko, G) 14 | expect_equal(t(X) %*% Xko, G - diag(s,p,p)) 15 | } 16 | }) 17 | 18 | # Test case from Weijie Su. 
19 | test_that('equicorrelated knockoffs are created in numerically sensitive case', { 20 | n = 15; p = 5 21 | M = matrix(0, p, p) 22 | diag(M) = 1 23 | for (i in 1:p) { 24 | for (j in 1:p) { 25 | if ((i==j+1) || (j==i+1)) 26 | M[i,j] <- 0.6 27 | if ((i==j+2) || (j==i+2)) 28 | M[i,j] <- 0.1 29 | } 30 | } 31 | X = knockoff:::with_seed(2, matrix(rnorm(n*p),n) %*% chol(M) ) 32 | k = 4 33 | 34 | Z = knockoff:::normc(X[,-k]) 35 | Z_ko = create.fixed(Z, method='equi', randomize=F)$Xk 36 | expect_false(any(is.nan(Z_ko))) 37 | }) 38 | 39 | test_that('Fixed-design SDP knockoffs have the right correlation structure', { 40 | skip_on_cran() 41 | 42 | n = 20; p = 10 43 | X = knockoff:::normc(knockoff:::rnorm_matrix(n,p)) 44 | knock_variables_default = create.fixed(X, method='sdp', randomize=F) 45 | knock_variables_randomized = create.fixed(X, method='sdp', randomize=T) 46 | X = knock_variables_default$X 47 | Xko_default = knock_variables_default$Xk 48 | Xko_randomized = knock_variables_randomized$Xk 49 | 50 | offdiag <- function(A) A - diag(diag(A)) 51 | G = t(X) %*% X 52 | tol = 1e-4 53 | for (Xko in list(Xko_default, Xko_randomized)) { 54 | expect_equal(t(Xko) %*% Xko, G, tolerance=tol) 55 | expect_equal(offdiag(t(X) %*% Xko), offdiag(G), tolerance=tol) 56 | expect_true(all(diag(t(X) %*% Xko) < 1+tol)) 57 | } 58 | }) 59 | 60 | test_that('Gaussian equicorrelated knockoffs have the right correlation structure', { 61 | # Problem parameters 62 | n = 10000000 # number of observations 63 | p = 3 # number of variables 64 | 65 | # Generate the variables from a multivariate normal distribution 66 | mu = c(1,2,3); Sigma = matrix(c(1,0.55,0.2, 0.55,1,0.55, 0.2, 0.55, 1),3) 67 | 68 | X = matrix(rep(mu,each=n),n) + matrix(rnorm(n*p),n) %*% chol(Sigma) 69 | Xk = create.gaussian(X, mu, Sigma, method='equi') 70 | 71 | SigmaHat = cov(Xk) 72 | SigmaHatCross = cov(X, y=Xk) 73 | muHat = colMeans(Xk) 74 | 75 | lambda_min = eigen(Sigma, symmetric=T, only.values = T)$values[p] 76 | diag_s = 
diag(rep(1, nrow(Sigma)) * min(2*lambda_min, min(diag(Sigma)))) 77 | 78 | expect_equal(mu, muHat, tolerance=2e-3) 79 | expect_equal(Sigma, SigmaHat, tolerance=2e-3) 80 | expect_equal(Sigma-diag_s, SigmaHatCross, tolerance=2e-3) 81 | }) 82 | -------------------------------------------------------------------------------- /R/knockoff/tests/testthat/test_filter.R: -------------------------------------------------------------------------------- 1 | test_that('knockoff.filter verifies input dimensions', { 2 | expect_error(knockoff.filter(knockoff:::rnorm_matrix(10, 10), rnorm(10), knockoffs=create.fixed), 'dimensions') 3 | expect_error(knockoff.filter(knockoff:::rnorm_matrix(20, 10), rnorm(19), knockoffs=create.fixed)) 4 | expect_warning(knockoff.filter(knockoff:::rnorm_matrix(20, 15), rnorm(20), knockoffs=create.fixed), 'dimensions') 5 | }) 6 | 7 | # test_that('knockoff.filter for fixed design is invariant under permutations of the columns of the design matrix.', { 8 | # # Problem parameters 9 | # n = 250 # number of observations 10 | # p = 100 # number of variables 11 | # k = 30 # number of variables with nonzero coefficients 12 | # amplitude = 5 # signal amplitude (for noise level = 1) 13 | # 14 | # # Generate the variables from a multivariate normal distribution 15 | # X = matrix(rnorm(n*p),n) 16 | # 17 | # # Generate the response from a linear model 18 | # nonzero = sample(p, k) 19 | # beta = amplitude * (1:p %in% nonzero) / sqrt(n) 20 | # y.sample <- function(X) X %*% beta + rnorm(n) 21 | # y = y.sample(X) 22 | # 23 | # # Select variables 24 | # set.seed(123) 25 | # S = knockoff.filter(X, y, knockoffs=create.fixed)$selected 26 | # 27 | # # Permute columns 28 | # idx_perm = sample(p) 29 | # X_perm = X[,idx_perm] 30 | # set.seed(123) 31 | # S_perm = idx_perm[knockoff.filter(X_perm, y, knockoffs=create.fixed)$selected] 32 | # 33 | # # Verify consistency 34 | # expect_true(setequal(S,S_perm)) 35 | # }) 
-------------------------------------------------------------------------------- /R/knockoff/tests/testthat/test_stats.R: -------------------------------------------------------------------------------- 1 | test_that('Statistics obey antisymmetry property', { 2 | n = 10; p = 5; 3 | prob = random_problem(n, p) 4 | knock.variables = create.fixed(prob$X) 5 | X = knock.variables$X 6 | Xk = knock.variables$Xk 7 | G = cbind(X, Xk) 8 | y = prob$y 9 | 10 | i = sort(sample(p, sample(p,1))) # Indices to swap. 11 | G_swap = G 12 | G_swap[,c(i,i+p)] <- G[,c(i+p,i)] 13 | 14 | expect_antisymmetric <- function(stat) { 15 | orig = 1:p; ko = (p+1):(2*p); 16 | expect_equal(stat(G[,orig],G[,ko],y), 17 | stat(G_swap[,orig],G_swap[,ko],y) * ifelse(1:p %in% i, -1, 1), tolerance = 1e-3) 18 | } 19 | expect_antisymmetric(stat.forward_selection) 20 | stats_fs_omp = function(X,Xk,y) stat.forward_selection(X, Xk, y, omp=FALSE) 21 | expect_antisymmetric(stats_fs_omp) 22 | stats_lasso_diff = function(X,Xk,y) stat.lasso_lambdadiff(X, Xk, y, nlambda=100000) 23 | expect_antisymmetric(stats_lasso_diff) 24 | stats_lasso_signed_max = function(X,Xk,y) stat.lasso_lambdasmax(X, Xk, y, nlambda=100000) 25 | expect_antisymmetric(stats_lasso_signed_max) 26 | }) 27 | 28 | test_that('Finding the max lambda in lasso works for orthonormal design', { 29 | n = 30; p = 10; amplitude = 3.5; 30 | X = qr.Q(qr(rnorm_matrix(n,p))) 31 | beta = amplitude * rnorm(p) 32 | y = X %*% beta + rnorm(n) 33 | 34 | beta_ls = as.vector(t(X) %*% y) 35 | expect_equal(lasso_max_lambda_glmnet(X, y, nlambda = 1e4, intercept=F, standardize=F), abs(beta_ls), 36 | tolerance = 1e-3) 37 | }) 38 | -------------------------------------------------------------------------------- /R/knockoff/tests/testthat/test_util.R: -------------------------------------------------------------------------------- 1 | test_that('restoring the random seed works', { 2 | if (!exists('.Random.seed')) RNGkind() 3 | seed <- .Random.seed 4 | first <- with_seed(0, 
rnorm(10)) 5 | second <- with_seed(0, rnorm(10)) 6 | expect_equal(.Random.seed, seed) 7 | expect_equal(first, second) 8 | }) -------------------------------------------------------------------------------- /R/knockoff/vignettes/advanced.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Advanced Usage of the Knockoff Filter for R" 3 | date: "`r Sys.Date()`" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 | %\VignetteIndexEntry{Advanced Usage of the Knockoff Filter for R} 7 | %\VignetteEngine{knitr::rmarkdown} 8 | %\usepackage[utf8]{inputenc} 9 | --- 10 | 11 | The function `knockoff.filter` is a wrapper around several simpler functions that 12 | 13 | 1. Construct knockoff variables (various functions with prefix `create`) 14 | 2. Compute the test statistic $W$ (various functions with prefix `stat`) 15 | 3. Compute the threshold for variable selection (`knockoff.threshold`) 16 | 17 | These functions may be called directly if desired. The purpose of this vignette is to illustrate the flexibility of this package with some examples. 18 | 19 | ```{r, results='hide', message=FALSE, warning=FALSE} 20 | set.seed(1234) 21 | library(knockoff) 22 | ``` 23 | 24 | Creating an artificial problem 25 | ------------------------------ 26 | Let us begin by creating some synthetic data. For simplicity, we will construct it from a generalized linear model such that the response only depends on a small fraction of the variables.
27 | 28 | ```{r} 29 | # Problem parameters 30 | n = 1000 # number of observations 31 | p = 1000 # number of variables 32 | k = 60 # number of variables with nonzero coefficients 33 | amplitude = 7.5 # signal amplitude (for noise level = 1) 34 | 35 | # Generate the variables from a multivariate normal distribution 36 | mu = rep(0,p) 37 | rho = 0.10 38 | Sigma = toeplitz(rho^(0:(p-1))) 39 | X = matrix(rnorm(n*p),n) %*% chol(Sigma) 40 | 41 | # Generate the response from a logistic model and encode it as a factor. 42 | nonzero = sample(p, k) 43 | beta = amplitude * (1:p %in% nonzero) / sqrt(n) 44 | invlogit = function(x) exp(x) / (1+exp(x)) 45 | y.sample = function(x) rbinom(n, prob=invlogit(x %*% beta), size=1) 46 | y = factor(y.sample(X), levels=c(0,1), labels=c("A","B")) 47 | ``` 48 | 49 | Looking inside the knockoff filter 50 | ---------------------------------- 51 | Instead of using `knockoff.filter` directly, we can run the filter manually 52 | by calling its main components one by one. 53 | 54 | The first step is to generate the knockoff variables for the true Gaussian distribution of the variables. 
55 | ```{r} 56 | X_k = create.gaussian(X, mu, Sigma) 57 | ``` 58 | 59 | Then, we compute the knockoff statistics using the 10-fold cross-validated lasso: 60 | ```{r, results='hide', message=FALSE, warning=FALSE} 61 | W = stat.glmnet_coefdiff(X, X_k, y, nfolds=10, family="binomial") 62 | ``` 63 | 64 | Now we can compute the rejection threshold: 65 | ```{r} 66 | thres = knockoff.threshold(W, fdr=0.2, offset=1) 67 | ``` 68 | 69 | The final step is to select the variables: 70 | ```{r} 71 | selected = which(W >= thres) 72 | print(selected) 73 | ``` 74 | 75 | The false discovery proportion is: 76 | ```{r} 77 | fdp = function(selected) sum(beta[selected] == 0) / max(1, length(selected)) 78 | fdp(selected) 79 | ``` 80 | 81 | Performing numerical simulations 82 | -------------------------------- 83 | We show how to manually run the knockoff filter multiple times and compute 84 | average quantities. This is particularly useful to estimate the FDR 85 | (or the power) for a particular configuration of the knockoff filter 86 | on artificial problems. 87 | ```{r} 88 | # Optimize the parameters needed for generating Gaussian knockoffs, 89 | # by solving an SDP to minimize correlations with the original variables. 90 | # This calculation requires only the model parameters mu and Sigma, 91 | # not the observed variables X. Therefore, there is no reason to perform it 92 | # more than once for our simulation.
93 | 94 | diag_s = create.solve_asdp(Sigma) 95 | 96 | # Compute the fdp over 20 iterations 97 | nIterations = 20 98 | fdp_list = sapply(1:nIterations, function(it) { 99 |     # Run the knockoff filter manually, using the pre-computed value of diag_s 100 |     X_k = create.gaussian(X, mu, Sigma, diag_s=diag_s) 101 |     W = stat.glmnet_lambdasmax(X, X_k, y, family="binomial") 102 |     t = knockoff.threshold(W, fdr=0.2, offset=1) 103 |     selected = which(W >= t) 104 |     # Compute and store the fdp 105 |     fdp(selected) 106 | }) 107 | # Estimate the FDR 108 | mean(fdp_list) 109 | ``` 110 | 111 | See also 112 | -------- 113 | If you want to see some basic usage of the knockoff filter, see the [introductory vignette](knockoff.html). 114 | If you want to see how to use knockoffs for Fixed-X variables, see the [Fixed-X vignette](fixed.html). -------------------------------------------------------------------------------- /R/knockoff/vignettes/fixed.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Controlled Variable Selection with Fixed-X Knockoffs" 3 | date: "`r Sys.Date()`" 4 | output: rmarkdown::html_vignette 5 | vignette: > 6 |   %\VignetteIndexEntry{Controlled Variable Selection with Fixed-X Knockoffs} 7 |   %\VignetteEngine{knitr::rmarkdown} 8 |   %\usepackage[utf8]{inputenc} 9 | --- 10 | 11 | This vignette illustrates the basic usage of the `knockoff` package with Fixed-X knockoffs. In this scenario we make no assumptions on the distribution of the predictors (which can be considered fixed), but we assume a homoscedastic linear regression model for the response. 12 | In this setting, knockoffs control the FDR only if used in combination with statistics that satisfy the "sufficiency" property. In particular, the default statistics based on the cross-validated lasso are not valid. 13 | 14 | For simplicity, we will use synthetic data constructed from a linear model such that the response only depends on a small fraction of the variables.
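To make the fixed-X construction concrete, here is a minimal base-R sketch of the equicorrelated construction of Barber and Candès. This is an illustration only, not the package's `create.fixed` code, and it assumes a column-normalized `X` of full column rank with `n >= 2p`. The defining property is that the knockoffs preserve the Gram matrix, `t(Xk) %*% Xk == t(X) %*% X`, while decorrelating each variable from its own knockoff: `t(X) %*% Xk == t(X) %*% X - diag(s)`.

```r
set.seed(1)
n = 50; p = 5
X = matrix(rnorm(n * p), n)
X = sweep(X, 2, sqrt(colSums(X^2)), "/")   # normalize columns, so diag(Sigma) = 1
Sigma = crossprod(X)                       # Gram matrix t(X) %*% X

# Equicorrelated choice of s, shrunk slightly so the Cholesky step below succeeds
lambda_min = min(eigen(Sigma, symmetric = TRUE)$values)
s = rep(0.99 * min(1, 2 * lambda_min), p)
S = diag(s)

# Orthonormal basis Utilde orthogonal to the column span of X (requires n >= 2p)
Q = qr.Q(qr(cbind(X, matrix(rnorm(n * p), n))))
Utilde = Q[, (p + 1):(2 * p)]

# Xk = X (I - Sigma^{-1} S) + Utilde C, where t(C) %*% C = 2 S - S Sigma^{-1} S
C = chol(2 * S - S %*% solve(Sigma) %*% S)
Xk = X %*% (diag(p) - solve(Sigma) %*% S) + Utilde %*% C

# Verify the two defining Gram conditions (both differences should be ~ 0)
max(abs(crossprod(Xk) - Sigma))
max(abs(crossprod(X, Xk) - (Sigma - S)))
```

Choosing `s` as large as the positive-definiteness constraint allows makes each knockoff as uncorrelated as possible from its original variable, which is what gives the filter power to separate true signals from nulls.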
15 | 16 | 17 | ```{r, results='hide', message=FALSE, warning=FALSE} 18 | set.seed(1234) 19 | ``` 20 | 21 | ```{r} 22 | # Problem parameters 23 | n = 1000          # number of observations 24 | p = 300           # number of variables 25 | k = 30            # number of variables with nonzero coefficients 26 | amplitude = 4.5   # signal amplitude (for noise level = 1) 27 | 28 | # Generate the variables from a multivariate normal distribution 29 | mu = rep(0,p) 30 | rho = 0.25 31 | Sigma = toeplitz(rho^(0:(p-1))) 32 | X = matrix(rnorm(n*p),n) %*% chol(Sigma) 33 | 34 | # Generate the response from a linear model 35 | nonzero = sample(p, k) 36 | beta = amplitude * (1:p %in% nonzero) / sqrt(n) 37 | y.sample = function(X) X %*% beta + rnorm(n) 38 | y = y.sample(X) 39 | ``` 40 | 41 | First examples 42 | -------------- 43 | To create fixed-design knockoffs, we call `knockoff.filter` with the parameter `knockoffs` equal to `create.fixed`. Moreover, since not all statistics are valid with fixed-design knockoffs, we use `stat.glmnet_lambdasmax` instead of the default one (which is based on cross-validation). 44 | ```{r, results='hide', message=FALSE} 45 | library(knockoff) 46 | result = knockoff.filter(X, y, knockoffs = create.fixed, statistic = stat.glmnet_lambdasmax) 47 | ``` 48 | We can display the results with 49 | ```{r} 50 | print(result) 51 | ``` 52 | The default value for the target false discovery rate is 0.1. In this experiment the false discovery proportion is 53 | ```{r} 54 | fdp = function(selected) sum(beta[selected] == 0) / max(1, length(selected)) 55 | fdp(result$selected) 56 | ``` 57 | 58 | See also 59 | -------- 60 | If you want to see some basic usage of the knockoff filter, see the [introductory vignette](knockoff.html). 61 | If you want to look inside the knockoff filter, see the [advanced vignette](advanced.html).
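As a closing aside, the data-dependent cutoff computed by `knockoff.threshold(W, fdr, offset=1)` in both vignettes is the knockoff+ threshold: the smallest `t` among the values `|W_j|` such that `(offset + #{j: W_j <= -t}) / max(1, #{j: W_j >= t}) <= fdr`. Here is a standalone base-R sketch of this rule (a simplified re-implementation for illustration, not the package's exact code):

```r
# Knockoff+ threshold: the smallest t among {0, |W_1|, ..., |W_p|} such that
#   (offset + #{j : W_j <= -t}) / max(1, #{j : W_j >= t}) <= fdr.
knockoff_threshold_sketch = function(W, fdr = 0.10, offset = 1) {
  ts = sort(c(0, abs(W)))                       # candidate thresholds
  ratio = sapply(ts, function(t) {
    (offset + sum(W <= -t)) / max(1, sum(W >= t))
  })
  ok = which(ratio <= fdr)
  if (length(ok) > 0) ts[ok[1]] else Inf        # Inf means nothing is selected
}

# Toy statistics: a large positive W_j is evidence that variable j is a signal
W = c(6, 5, 4, 3, 2, -1, 0.5)
thr = knockoff_threshold_sketch(W, fdr = 0.2)   # 2
selected = which(W >= thr)                      # variables 1-5
```

The numerator counts knockoff-favoring statistics, which estimates the number of false discoveries above the cutoff; `offset = 1` yields exact FDR control, while `offset = 0` controls a modified FDR.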
-------------------------------------------------------------------------------- /R/knockoff/vignettes/references.bib: -------------------------------------------------------------------------------- 1 | @article{barber2014, 2 |   title={Controlling the false discovery rate via knockoffs}, 3 |   author={Barber, Rina Foygel and Candes, Emmanuel}, 4 |   journal={arXiv preprint arXiv:1404.5609}, 5 |   year={2014} 6 | } 7 | 8 | @article{rhee2006, 9 |   title={Genotypic predictors of human immunodeficiency virus type 1 drug resistance}, 10 |   author={Rhee, Soo-Yon and Taylor, Jonathan and Wadhera, Gauhar and Ben-Hur, Asa and Brutlag, Douglas L and Shafer, Robert W}, 11 |   journal={Proceedings of the National Academy of Sciences}, 12 |   volume={103}, 13 |   number={46}, 14 |   pages={17355--17360}, 15 |   year={2006}, 16 |   publisher={National Academy of Sciences} 17 | } 18 | 19 | @article{rhee2005, 20 |   title={HIV-1 Protease and reverse-transcriptase mutations: correlations with antiretroviral therapy in subtype B isolates and implications for drug-resistance surveillance}, 21 |   author={Rhee, Soo-Yon and Fessel, W Jeffrey and Zolopa, Andrew R and Hurley, Leo and Liu, Tommy and Taylor, Jonathan and Nguyen, Dong Phuong and Slome, Sally and Klein, Daniel and Horberg, Michael and others}, 22 |   journal={Journal of Infectious Diseases}, 23 |   volume={192}, 24 |   number={3}, 25 |   pages={456--465}, 26 |   year={2005}, 27 |   publisher={Oxford University Press} 28 | } 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The Knockoff Filter 2 | ========================== 3 | 4 | This repository provides versatile software interfaces to the knockoff methodology, 5 | a general framework for controlling the false discovery 6 | rate when performing variable selection. 7 | To learn more about knockoffs, visit https://web.stanford.edu/group/candes/knockoffs/.
8 | 9 | This repository currently includes the following software for the knockoff filter: 10 | 11 | - [R package](R/README.md) 12 | - [MATLAB package](MATLAB/README.md) 13 | 14 | For more detailed information about a specific package, refer to the README file in the corresponding subdirectory. --------------------------------------------------------------------------------