├── AFTERFAC.CPP ├── ARCING.CPP ├── ARCING_M.CPP ├── BILINEAR.CPP ├── BOOT_C_1.CPP ├── BOOT_C_2.CPP ├── BOOT_P_1.CPP ├── BOOT_P_2.CPP ├── BOOT_P_3.CPP ├── BOOT_P_4.CPP ├── BOOT_P_5.CPP ├── CODE_DESCRIPTION.TXT ├── CONFCONF.CPP ├── DEP_BOOT.CPP ├── ENTROPY.CPP ├── GRNN.CPP ├── GRNN.H ├── GRNNGATE.CPP ├── INFO.H ├── INTEGRAT.CPP ├── LICENSE.txt ├── LINREG.CPP ├── LINREG.H ├── LOGISTIC.CPP ├── LOGISTIC.H ├── MC_TRAIN.CPP ├── MEM.CPP ├── MINIMIZE.CPP ├── MINIMIZE.H ├── MI_BIN.CPP ├── MI_CONT.CPP ├── MI_DISC.CPP ├── MI_ONLY.CPP ├── MLFN.CPP ├── MLFN.H ├── MULTCLAS.CPP ├── MULTPRED.CPP ├── MUTINF_B.CPP ├── MUTINF_C.CPP ├── MUTINF_D.CPP ├── PART.CPP ├── PARZDENS.CPP ├── QSORTD.CPP ├── RAND32.CPP ├── READFILE.CPP ├── README.md ├── SPLINE.CPP ├── STATS.CPP ├── TEST_CON.CPP ├── TEST_DIS.CPP ├── TRANSFER.CPP ├── TRANS_ENT.CPP ├── contributing.md ├── spearman.cpp └── svdcmp.h /BILINEAR.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* BILINEAR - Bilinear class for two-dimensional interpolation */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "info.h" 13 | 14 | Bilinear::Bilinear ( // Uses input points (x,y,z) where z=f(x,y) 15 | int nxin , // Number of x points 16 | double *xin , // They are here, sorted ascending 17 | int nyin , // Number of y points 18 | double *yin , // They are here, sorted ascending 19 | double *zin , // Corresponding function values, y changing fastest 20 | int extra // If nonzero, use 3x3 block with quadratic interpolation 21 | ) 22 | { 23 | 24 | quadratic = extra ; 25 | nx = nxin ; 26 | ny = nyin ; 27 | MEMTEXT ( "Bilinear constructor" ) ; 28 | x = (double *) MALLOC ( nx * sizeof(double) ) ; 29 | y = (double *) MALLOC ( ny * sizeof(double) ) ; 30 | z = (double *) MALLOC ( nx * ny * sizeof(double) ) ; 31 | assert ( x != NULL ) ; 32 | assert ( y != NULL ) ; 33 | assert ( z != NULL ) ; 34 | 35 | memcpy ( x , xin , nx * sizeof(double) ) ; 36 | memcpy ( y , yin , ny * sizeof(double) ) ; 37 | memcpy ( z , zin , nx * ny * sizeof(double) ) ; 38 | } 39 | 40 | Bilinear::~Bilinear () 41 | { 42 | MEMTEXT ( "Bilinear destructor" ) ; 43 | FREE ( x ) ; 44 | FREE ( y ) ; 45 | FREE ( z ) ; 46 | } 47 | 48 | double Bilinear::evaluate ( double xpt , double ypt ) 49 | { 50 | int k, kxlo, kxmid, kxhi, kylo, kymid, kyhi ; 51 | double t, u, val, clo, cmid, chi, zlo, zmid, zhi ; 52 | double dlo, dmid, dhi, lo_mid, lo_hi, mid_hi ; 53 | 54 | /* 55 | Bound outlying inputs 56 | */ 57 | 58 | if (xpt < x[0]) 59 | xpt = x[0] ; 60 | if (xpt > x[nx-1]) 61 | xpt = x[nx-1] ; 62 | if (ypt < y[0]) 63 | ypt = y[0] ; 64 | if (ypt > y[ny-1]) 65 | ypt = y[ny-1] ; 66 | 67 | /* 68 | Find the pair of x coordinates that bound the input 69 | */ 70 | 71 | kxlo = 0 ; 72 | kxhi = nx - 1 ; 73 | while (kxhi > kxlo+1) { 74 | k = (kxhi + kxlo) / 2 ; 75 | if (xpt < x[k]) 76 | kxhi = k ; 77 | else 78 | kxlo = k ; 79 | } 80 | 81 | /* 82 | Find the pair of y coordinates that bound the input 83 | */ 84 | 85 | kylo = 0 ; 86 | kyhi = ny - 1 ; 87 | while (kyhi > kylo+1) { 88 | k = (kyhi + kylo) / 2 ; 89 | if (ypt < y[k]) 90 | kyhi = k ; 91 | else 92 | kylo = k ; 93 | } 94 | 95 | /* 96 | 3x3 with quadratic interpolation? 
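       For reference, the quadratic option below is ordinary three-point Lagrange
       interpolation, applied first across the chosen x triple within each of the
       three y rows (giving zlo, zmid, zhi) and then across y.  For abscissas
       x_lo, x_mid, x_hi the weights are

          c_lo  = (x - x_mid)(x - x_hi) / ((x_lo - x_mid)(x_lo - x_hi))
          c_mid = (x - x_lo )(x - x_hi) / ((x_mid - x_lo)(x_mid - x_hi))
          c_hi  = (x - x_lo )(x - x_mid) / ((x_hi - x_lo )(x_hi - x_mid))

       which is exactly what the clo/cmid/chi computations implement, with the
       sign changes folded into the difference terms.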
97 | */ 98 | 99 | if (quadratic) { 100 | // Choose which way to go for the third x point 101 | if (kxlo == 0) { 102 | kxmid = kxhi ; 103 | ++kxhi ; 104 | } 105 | else if (kxhi == nx-1) { 106 | kxmid = kxlo ; 107 | --kxlo ; 108 | } 109 | else if (xpt-x[kxlo] < x[kxhi]-xpt) { 110 | kxmid = kxlo ; 111 | --kxlo ; 112 | } 113 | else { 114 | kxmid = kxhi ; 115 | ++kxhi ; 116 | } 117 | 118 | // Choose which way to go for the third y point 119 | if (kylo == 0) { 120 | kymid = kyhi ; 121 | ++kyhi ; 122 | } 123 | else if (kyhi == ny-1) { 124 | kymid = kylo ; 125 | --kylo ; 126 | } 127 | else if (ypt-y[kylo] < y[kyhi]-ypt) { 128 | kymid = kylo ; 129 | --kylo ; 130 | } 131 | else { 132 | kymid = kyhi ; 133 | ++kyhi ; 134 | } 135 | 136 | dlo = xpt - x[kxlo] ; 137 | dmid = xpt - x[kxmid] ; 138 | dhi = xpt - x[kxhi] ; 139 | lo_mid = x[kxlo] - x[kxmid] ; 140 | lo_hi = x[kxlo] - x[kxhi] ; 141 | mid_hi = x[kxmid] - x[kxhi] ; 142 | clo = dmid * dhi / (lo_mid * lo_hi) ; 143 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 144 | chi = dlo * dmid / (lo_hi * mid_hi) ; 145 | 146 | zlo = clo * z[kxlo*ny+kylo] + cmid * z[kxmid*ny+kylo] + chi * z[kxhi*ny+kylo] ; 147 | zmid = clo * z[kxlo*ny+kymid] + cmid * z[kxmid*ny+kymid] + chi * z[kxhi*ny+kymid] ; 148 | zhi = clo * z[kxlo*ny+kyhi] + cmid * z[kxmid*ny+kyhi] + chi * z[kxhi*ny+kyhi] ; 149 | 150 | dlo = ypt - y[kylo] ; 151 | dmid = ypt - y[kymid] ; 152 | dhi = ypt - y[kyhi] ; 153 | lo_mid = y[kylo] - y[kymid] ; 154 | lo_hi = y[kylo] - y[kyhi] ; 155 | mid_hi = y[kymid] - y[kyhi] ; 156 | clo = dmid * dhi / (lo_mid * lo_hi) ; 157 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 158 | chi = dlo * dmid / (lo_hi * mid_hi) ; 159 | 160 | return clo * zlo + cmid * zmid + chi * zhi ; 161 | } // If quadratic 162 | 163 | /* 164 | Ordinary 2x2 bilinear 165 | */ 166 | 167 | else { 168 | t = (xpt - x[kxlo]) / (x[kxhi] - x[kxlo]) ; 169 | u = (ypt - y[kylo]) / (y[kyhi] - y[kylo]) ; 170 | 171 | val = (1.0 - t) * (1.0 - u) * z[kxlo*ny+kylo] ; 172 | val += t * (1.0 - u) * z[kxhi*ny+kylo] ; 173 | val += t * u * z[kxhi*ny+kyhi] ; 174 | val += (1.0 - t) * u * z[kxlo*ny+kyhi] ; 175 | return val ; 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /BOOT_P_1.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* BOOT_P_1 - Bootstrap estimate of bias and variance when s != t */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | void qsortd ( int istart , int istop , double *x ) ; 17 | 18 | 19 | /* 20 | -------------------------------------------------------------------------------- 21 | 22 | Compute the parameter 23 | 24 | -------------------------------------------------------------------------------- 25 | */ 26 | 27 | double param_mean ( int n , double *x ) 28 | { 29 | int i ; 30 | double mean ; 31 | 32 | mean = 0.0 ; 33 | for (i=0 ; i= n) // Should never happen, but be prepared 80 | k = n - 1 ; 81 | work[i] = data[k] ; // Put bootstrap sample in work 82 | } 83 | 84 | stat = user_s ( n , work ) ; // Evaluate estimator for this boot rep 85 | work2[rep] = stat ; // Enables more accurate variance 86 | mean += stat ; // Cumulate theta-hat star dot 87 | } 88 | 89 | mean /= nboot ; 90 | variance = 0.0 ; 91 | for (rep=0 ; rep 8 | #include 9 | 
#include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | void qsortd ( int istart , int istop , double *x ) ; 17 | void qsortds ( int istart , int istop , double *x , double *s ) ; 18 | 19 | 20 | /* 21 | -------------------------------------------------------------------------------- 22 | 23 | Compute the parameter 24 | 25 | -------------------------------------------------------------------------------- 26 | */ 27 | 28 | double param_mean ( int n , double *x , double *freq ) 29 | { 30 | int i ; 31 | double mean ; 32 | 33 | mean = 0.0 ; 34 | 35 | if (freq == NULL) { 36 | for (i=0 ; i= 0.5) 70 | return x[0] ; 71 | else if (freq[n-1] >= 0.5) 72 | return x[n-1] ; 73 | 74 | /* 75 | Usual case. Keep it primitive and simple: no interolation. 76 | */ 77 | 78 | sum = 0.0 ; 79 | for (i=0 ; i= 0.5) 82 | break ; 83 | } 84 | 85 | return 0.5 * (x[i] + x[i-1]) ; 86 | } 87 | 88 | /* 89 | -------------------------------------------------------------------------------- 90 | 91 | boot_bias_var - Compute bias and variance of parameter (plug-in case, s=t) 92 | 93 | -------------------------------------------------------------------------------- 94 | */ 95 | 96 | void boot_bias_var ( 97 | int n , // Number of cases in sample 98 | double *data , // The sample 99 | double (*user_s) (int , double * , double * ) , // Compute param 100 | int nboot , // Number of bootstrap replications 101 | double *rawstat , // Raw statistic of sample, theta-hat 102 | double *bias , // Output of bias estimate 103 | double *var , // Output of variance estimate 104 | double *work , // Work area n long 105 | double *work2 , // Work area nboot long 106 | double *freq // Work area n long 107 | ) 108 | { 109 | int i, rep, k ; 110 | double stat, mean, variance, diff ; 111 | 112 | mean = 0.0 ; 113 | 114 | for (i=0 ; i= n) // Should never happen, but be prepared 122 | k = n - 1 ; 123 | work[i] = data[k] ; // Put bootstrap sample in work 124 | ++freq[k] ; // Tally for mean frequency 125 | } 126 | 127 | stat = user_s ( n , work , NULL ) ; // Evaluate estimator for this rep 128 | work2[rep] = stat ; // Enables more accurate variance 129 | mean += stat ; // Cumulate theta-hat star dot 130 | } 131 | 132 | mean /= nboot ; 133 | variance = 0.0 ; 134 | for (rep=0 ; rep 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | 17 | 18 | /* 19 | -------------------------------------------------------------------------------- 20 | 21 | Compute the parameter 22 | 23 | -------------------------------------------------------------------------------- 24 | */ 25 | 26 | double param_beta ( int n , double *x , double *y ) 27 | { 28 | int i ; 29 | double xmean, ymean, xdif, ydif, xvar, covar ; 30 | 31 | xmean = ymean = 0.0 ; 32 | for (i=0 ; i= n) // Should never happen, but be prepared 85 | k = n - 1 ; 86 | xwork[i] = x[k] ; // Put bootstrap sample in work 87 | ywork[i] = y[k] ; 88 | } 89 | 90 | stat = user_t ( n , xwork , ywork ) ; // Evaluate estimator for this rep 91 | work2[rep] = stat ; // Enables more accurate variance 92 | mean += stat ; // Cumulate theta-hat star dot 93 | } 94 | 95 | mean /= nboot ; 96 | variance = 0.0 ; 97 | for (rep=0 ; rep 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | 17 | /* 18 | -------------------------------------------------------------------------------- 19 | 20 | Compute the parameter 21 | 22 | 
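     In this example the parameter is the profit factor: the sum of the positive
     returns divided by the absolute sum of the negative returns.  When a
     frequency vector is supplied, each return is weighted by freq[i] before
     summing.  The tiny 1.e-5 starting values merely guard against division by
     zero when a sample happens to contain no losses.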
-------------------------------------------------------------------------------- 23 | */ 24 | 25 | double param_pf ( int n , double *x , double *freq ) 26 | { 27 | int i ; 28 | double sum_win, sum_loss ; 29 | 30 | sum_win = sum_loss = 1.e-5 ; // Really zero, but prevent division by 0 31 | 32 | if (freq == NULL) { 33 | for (i=0 ; i 0.0) 35 | sum_win += x[i] ; 36 | else 37 | sum_loss -= x[i] ; 38 | } 39 | return sum_win / sum_loss ; 40 | } 41 | 42 | for (i=0 ; i 0.0) 44 | sum_win += freq[i] * x[i] ; 45 | else 46 | sum_loss -= freq[i] * x[i] ; 47 | } 48 | return sum_win / sum_loss ; 49 | } 50 | 51 | /* 52 | -------------------------------------------------------------------------------- 53 | 54 | boot_bias_var - Compute bias and variance of parameter (plug-in case, s=t) 55 | 56 | -------------------------------------------------------------------------------- 57 | */ 58 | 59 | void boot_bias_var ( 60 | int n , // Number of cases in sample 61 | double *data , // The sample 62 | double (*user_t) (int , double * , double * ) , // Compute param 63 | int nboot , // Number of bootstrap replications 64 | double *rawstat , // Raw statistic of sample, theta-hat 65 | double *bias , // Output of bias estimate 66 | double *var , // Output of variance estimate 67 | double *work , // Work area n long 68 | double *work2 , // Work area nboot long 69 | double *freq // Work area n long 70 | ) 71 | { 72 | int i, rep, k ; 73 | double stat, mean, variance, diff ; 74 | 75 | mean = 0.0 ; 76 | 77 | for (i=0 ; i= n) // Should never happen, but be prepared 85 | k = n - 1 ; 86 | work[i] = data[k] ; // Put bootstrap sample in work 87 | ++freq[k] ; // Tally for mean frequency 88 | } 89 | 90 | stat = user_t ( n , work , NULL ) ; // Evaluate estimator for this rep 91 | work2[rep] = stat ; // Enables more accurate variance 92 | mean += stat ; // Cumulate theta-hat star dot 93 | } 94 | 95 | mean /= nboot ; 96 | variance = 0.0 ; 97 | for (rep=0 ; rep 0.0) // Cumulate so we know the true value 230 | grand_wins += x[i] ; 231 | else 232 | grand_losses -= x[i] ; 233 | } 234 | 235 | boot_bias_var ( nsamps , x , param_pf , nboot , 236 | &computed_param_1[itry] , &computed_bias_1[itry] , 237 | &computed_var_1[itry] , work , work2 , freq ) ; 238 | 239 | jack_bias_var ( nsamps , x , param_pf , 240 | &computed_param_2[itry] , &computed_bias_2[itry] , 241 | &computed_var_2[itry] , work ) ; 242 | 243 | if (((itry % divisor) == 1) 244 | || (itry == ntries-1) ) { // Don't do this every try! Too slow. 
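         // The block below runs only every 'divisor' tries (and on the last try).
         // It summarizes all tries completed so far: the true profit factor
         // computed from every return generated so far, then the mean and
         // variance across tries of the bootstrap and jackknife estimates of
         // the parameter, its bias, and its variance.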
245 | ndone = itry + 1 ; // This many tries done (and in arrays) 246 | printf ( "\n\n\nTry %d True value = %lf", itry, 247 | grand_wins / grand_losses ) ; 248 | 249 | /* 250 | Process test 1 of 2 251 | */ 252 | 253 | mean_computed_param = 0.0 ; 254 | mean_computed_bias = 0.0 ; 255 | mean_computed_var = 0.0 ; 256 | var_computed_param = 0.0 ; 257 | var_computed_bias = 0.0 ; 258 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | void qsortd ( int istart , int istop , double *x ) ; 16 | double quantile_conf ( int n , int m , double conf ) ; 17 | double inverse_ks ( int n , double cdf ) ; 18 | double ks_test ( int n , double *x , double *D_plus , double *D_minus ) ; 19 | 20 | int main ( 21 | int argc , // Number of command line arguments (includes prog name) 22 | char *argv[] // Arguments (prog name is argv[0]) 23 | ) 24 | { 25 | int i, ncases, irep, nreps, m, n_lower, n_upper, n_ks2, n_ks_null, n_ks_alt ; 26 | double *x, pval, conf, pessimistic_lower, pessimistic_upper ; 27 | double ks_two, ks_one, D, Dp, Dm ; 28 | 29 | if (argc != 5) { 30 | printf ( "\nUsage: ConfConf ncases pval conf nreps" ) ; 31 | printf ( "\n ncases - Number of cases in the sample" ) ; 32 | printf ( "\n pval - Probability value (<0.5) for quantile test" ) ; 33 | printf ( "\n conf - Desired confidence value (<0.5) for both tests" ) ; 34 | printf ( "\n nreps - Number of replications" ) ; 35 | exit ( 1 ) ; 36 | } 37 | 38 | ncases = atoi ( argv[1] ) ; 39 | pval = atof ( argv[2] ) ; 40 | conf = atof ( argv[3] ) ; 41 | nreps = atoi ( argv[4] ) ; 42 | 43 | if (ncases < 10) { 44 | printf ( "\nERROR.. Must have at least 10 cases" ) ; 45 | exit ( 1 ) ; 46 | } 47 | 48 | if (pval * ncases < 1.0 || pval >= 0.5) { 49 | printf ( "\nERROR.. Pval too small or too large" ) ; 50 | exit ( 1 ) ; 51 | } 52 | 53 | if (conf <= 0.0 || conf >= 0.5) { 54 | printf ( "\nERROR.. Conf must be greater than 0 and less than 0.5" ) ; 55 | exit ( 1 ) ; 56 | } 57 | 58 | if (nreps < 1) { 59 | printf ( "\nERROR.. Must have at least 1 replication" ) ; 60 | exit ( 1 ) ; 61 | } 62 | 63 | 64 | /* 65 | Allocate memory and initialize 66 | */ 67 | 68 | x = (double *) malloc ( ncases * sizeof(double) ) ; 69 | 70 | m = (int) (pval * ncases) ; // Conservative order statistic for bound 71 | pessimistic_lower = quantile_conf ( ncases , m , conf ) ; 72 | pessimistic_upper = 1.0 - pessimistic_lower ; 73 | ks_two = inverse_ks ( ncases , 1.0 - conf ) ; // Two-tailed test 74 | ks_one = inverse_ks ( ncases , 1.0 - 2.0 * conf ) ; // One-tailed test 75 | 76 | printf ( "\nSuppose the model predicts values near 0 for the null hypothesis" ) ; 77 | printf ( "\nand values near 1 for the alternative hypothesis." 
) ; 78 | 79 | printf ( "\n\nIf the dataset represents the null hypothesis, the threshold" ) ; 80 | printf ( "\nfor rejecting the null at p=%.4lf is given by the %d'th order statistic.", 81 | pval, ncases - m + 1 ) ; 82 | printf ( "\nThis is a conservative estimate of the %.4lf quantile", 1.0-pval ) ; 83 | printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.", 84 | conf, pessimistic_upper ) ; 85 | 86 | printf ( "\n\nIf the dataset represents the alternative hypothesis, the threshold" ) ; 87 | printf ( "\nfor rejecting the alt at p=%.4lf is given by the %d'th order statistic.", 88 | pval, m ) ; 89 | printf ( "\nThis is a conservative estimate of the %.4lf quantile", pval ) ; 90 | printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.", 91 | conf, pessimistic_lower) ; 92 | 93 | printf ( "\n\nKS thresholds: two-tailed KS = %.4lf one-tailed KS = %.4lf", 94 | ks_two, ks_one ) ; 95 | 96 | /* 97 | Now generate nreps samples. Verify that our required confidence level 98 | is observed. Note that the fact that this test uses a uniform distribution 99 | does not in any way limit its applicability to uniform distributions. 100 | If one were to generate cases from any other reasonable distribtion, 101 | the pessimistic quantile bounds would have to be transformed similarly. 102 | The result is that the inequalities below would pass or fail identically. 103 | We count the number of times 'disaster' happens. 104 | Disaster is when the order statistic used for the threshold is toward the 105 | inside (center) of the distribution, meaning that if this order statistic 106 | had been used as a threshold, more of the distribution would be outside 107 | the threshold than the user expected. We expect disaster to happen with 108 | probability equal to the specified conf parameter. 109 | 110 | For the two-tailed Kolmogorov-Smirnov test, disaster is when the empirical 111 | CDF deviates (above or below) from the correct value by more than the 112 | conf-inspired value. For the one-tailed test in which the dataset is from 113 | the NULL distribution, disaster is when the empirical CDF exceeds the true 114 | CDF, a situation that would encourage false rejection of the null hypothesis. 115 | This is measured by D+. For the one-tailed test in which the dataset is from 116 | the ALT distribution, disaster is when the empirical CDF is less than the 117 | true CDF, a situation that would encourage false rejection of the alternative 118 | hypothesis. This is measured by D-. 
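   For reference, with F the true cumulative distribution (here the uniform, so
   F(x) = x) and Fn the empirical CDF of the sample, the quantities returned by
   ks_test() are

      D+ = max over the sample of ( Fn(x) - F(x) )
      D- = max over the sample of ( F(x) - Fn(x) )
      D  = max ( D+ , D- )

   The one-tailed threshold is taken from inverse_ks() at 1 - 2*conf because a
   single tail of the two-sided statistic carries roughly half of its rejection
   probability.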
119 | */ 120 | 121 | n_lower = n_upper = n_ks2 = n_ks_null = n_ks_alt = 0 ; 122 | 123 | for (irep=0 ; irep pessimistic_lower) 130 | ++n_lower ; 131 | 132 | if (x[ncases-m] < pessimistic_upper) 133 | ++n_upper ; 134 | 135 | D = ks_test ( ncases , x , &Dp , &Dm ) ; 136 | if (D > ks_two) 137 | ++n_ks2 ; 138 | if (Dp > ks_one) 139 | ++n_ks_null ; 140 | if (Dm > ks_one) 141 | ++n_ks_alt ; 142 | } 143 | 144 | printf ( "\nPoint failure (expected=%.4lf) Lower=%.4lf Upper=%.4lf", 145 | conf, (double) n_lower / nreps, (double) n_upper / nreps) ; 146 | printf ( "\nKS failure: two-tailed = %.4lf NULL = %.4lf ALT = %.4lf", 147 | (double) n_ks2 / nreps, (double) n_ks_null / nreps, 148 | (double) n_ks_alt / nreps) ; 149 | 150 | free ( x ) ; 151 | return ( 0 ) ; 152 | } 153 | -------------------------------------------------------------------------------- /ENTROPY.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* ENTROPY - Compute the entropy of each of a set of variables */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "..\info.h" 15 | 16 | /* 17 | These are defined in MEM.CPP 18 | */ 19 | 20 | extern int mem_keep_log ; // Keep a log file? 21 | extern char mem_file_name[] ; // Log file name 22 | extern int mem_max_used ; // Maximum memory ever in use 23 | 24 | int main ( 25 | int argc , // Number of command line arguments (includes prog name) 26 | char *argv[] // Arguments (prog name is argv[0]) 27 | ) 28 | 29 | { 30 | int i, k, nbins, itype, nvars, ncases, ivar, *counts, ilow, ihigh, nb ; 31 | int istart, istop, ibest, *sortwork, n_indep_vars ; 32 | double *data, *work, *entropies, *proportional, p, max_entropy, low, high ; 33 | double dist, best_dist, factor, entropy ; 34 | short int *bins ; 35 | char filename[256], **names ; 36 | FILE *fp ; 37 | 38 | /* 39 | Process command line parameters 40 | */ 41 | 42 | #if 1 43 | if (argc != 5) { 44 | printf ( "\nUsage: ENTROPY datafile nvars nbins type" ) ; 45 | printf ( "\n datafile - name of the text file containing the data" ) ; 46 | printf ( "\n The first line is variable names" ) ; 47 | printf ( "\n Subsequent lines are the data" ) ; 48 | printf ( "\n Delimiters can be space, comma, or tab" ) ; 49 | printf ( "\n nvars - This many variables, starting with the first, will be tested" ) ; 50 | printf ( "\n nbins - If the data is discrete, this must be at least the" ) ; 51 | printf ( "\n number of bins. It will be automatically reduced" ) ; 52 | printf ( "\n to the exact number of bins." ) ; 53 | printf ( "\n If the data is continuous, it specifies the number of bins" ) ; 54 | printf ( "\n computed by linearly scaling the interior/exterior range." ) ; 55 | printf ( "\n About 10 bins is generally good for continuous data," ) ; 56 | printf ( "\n although values as high as 15 or, rarely, even 20, may be" ) ; 57 | printf ( "\n appropriate if there are tens of thousands of cases." 
) ; 58 | printf ( "\n type - Type of data processing:" ) ; 59 | printf ( "\n 1 - The data is discrete" ) ; 60 | printf ( "\n 2 - The data is continuous, and the entire range is to be tested" ) ; 61 | printf ( "\n 3 - The data is continuous, and the extremes are to be truncated" ) ; 62 | exit ( 1 ) ; 63 | } 64 | 65 | strcpy ( filename , argv[1] ) ; 66 | n_indep_vars = atoi ( argv[2] ) ; 67 | nbins = atoi ( argv[3] ) ; 68 | itype = atoi ( argv[4] ) ; 69 | #else 70 | strcpy ( filename , "..\\VARS.TXT" ) ; 71 | n_indep_vars = 8 ; 72 | nbins = 10 ; 73 | itype = 2 ; 74 | #endif 75 | 76 | if (itype < 1 || itype > 3) { 77 | printf ( "\nERROR... type illegal" ) ; 78 | return EXIT_FAILURE ; 79 | } 80 | 81 | if (nbins < 2 || (itype > 1 && nbins < 3)) { 82 | printf ( "\nERROR... nbins illegal" ) ; 83 | return EXIT_FAILURE ; 84 | } 85 | 86 | /* 87 | These are used by MEM.CPP for runtime memory validation 88 | */ 89 | 90 | _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; 91 | fp = fopen ( mem_file_name , "wt" ) ; 92 | if (fp == NULL) { // Should never happen 93 | printf ( "\nCannot open MEM.LOG file for writing!" ) ; 94 | return EXIT_FAILURE ; 95 | } 96 | fclose ( fp ) ; 97 | mem_keep_log = 1 ; 98 | mem_max_used = 0 ; 99 | 100 | /* 101 | Open the text file to which results will be written 102 | */ 103 | 104 | fp = fopen ( "ENTROPY.LOG" , "wt" ) ; 105 | if (fp == NULL) { // Should never happen 106 | printf ( "\nCannot open ENTROPY.LOG file for writing!" ) ; 107 | return EXIT_FAILURE ; 108 | } 109 | 110 | /* 111 | Read the file 112 | */ 113 | 114 | if (readfile ( filename , &nvars , &names , &ncases , &data )) 115 | return EXIT_FAILURE ; 116 | 117 | /* 118 | Allocate scratch memory 119 | 120 | bins - Bin ids for all variables 121 | counts - Count of cases in each bin 122 | entropies - Entropy of each variable 123 | proportional - Proportional entropy of each variable 124 | work - Temporary use for extracting a variable from the dataset 125 | sortwork - Temporary use for printing variable's information sorted 126 | */ 127 | 128 | MEMTEXT ( "ENTROPY 6 allocs" ) ; 129 | bins = (short int *) MALLOC ( ncases * sizeof(short int) ) ; 130 | assert ( bins != NULL ) ; 131 | counts = (int *) MALLOC ( nbins * sizeof(int) ) ; 132 | assert ( counts != NULL ) ; 133 | entropies = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 134 | assert ( entropies != NULL ) ; 135 | proportional = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 136 | assert ( proportional != NULL ) ; 137 | work = (double *) MALLOC ( ncases * sizeof(double) ) ; 138 | assert ( work != NULL ) ; 139 | sortwork = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 140 | assert ( sortwork != NULL ) ; 141 | 142 | /* 143 | If splitting a continuous variable across interior range, 144 | compute things that will be needed. 145 | */ 146 | 147 | if (itype > 1) { 148 | nb = nbins ; // Always needed 149 | ilow = (ncases + 1) / nb - 1 ; // Needed only if itype==3 150 | if (ilow < 0) 151 | ilow = 0 ; 152 | ihigh = ncases - 1 - ilow ; 153 | } 154 | 155 | /* 156 | If splitting a discrete variable, warn the user if the variable is continuous 157 | */ 158 | 159 | else { 160 | for (ivar=0 ; ivar work[i-1]) 167 | ++k ; 168 | } 169 | if (k > nbins) 170 | fprintf ( fp, 171 | "\nWARNING... %s has %d distinct values, not %d. Results will be incorrect.", 172 | names[ivar], k, nbins ) ; 173 | } 174 | } 175 | 176 | /* 177 | Compute and save the entropy for each variable. 178 | Print the results, sort them, and print them again, this time sorted. 
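   Each entropy is the usual H = -SUM( p[i] * log(p[i]) ) over the variable's bins,
   where p[i] is the fraction of cases falling in bin i and empty bins are skipped.
   The 'Proportional' column divides H by the maximum attainable entropy (the log
   of the number of bins), so a value of 1.0 means the cases are spread uniformly
   across the bins.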
179 | */ 180 | 181 | fprintf ( fp , "\n Variable Entropy Proportional" ) ; 182 | 183 | for (ivar=0 ; ivar high) 201 | high = work[i] ; 202 | if (work[i] < low) 203 | low = work[i] ; 204 | } 205 | for (i=0 ; i 0) { 254 | p = (double) counts[i] / (double) ncases ; 255 | entropy -= p * log(p) ; 256 | } 257 | } 258 | 259 | sortwork[ivar] = ivar ; 260 | entropies[ivar] = entropy ; 261 | proportional[ivar] = entropy / max_entropy ; 262 | 263 | fprintf ( fp , "\n%31s %11.5lf %11.5lf", 264 | names[ivar], entropy, proportional[ivar] ) ; 265 | } 266 | 267 | fprintf ( fp , "\n" ) ; 268 | fprintf ( fp , "\n" ) ; 269 | fprintf ( fp , "\nEntropies, in decreasing order" ) ; 270 | fprintf ( fp , "\n" ) ; 271 | fprintf ( fp , "\n Variable Entropy Relative" ) ; 272 | 273 | qsortdsi ( 0 , n_indep_vars-1 , entropies , sortwork ) ; 274 | for (ivar=0 ; ivar 31 | #include 32 | #include 33 | #include 34 | #include "grnn.h" 35 | 36 | double normal () ; 37 | #define EPS1 1.e-180 38 | 39 | /* 40 | -------------------------------------------------------------------------------- 41 | 42 | Constructor, destructor, reset(), add_case() 43 | 44 | -------------------------------------------------------------------------------- 45 | */ 46 | 47 | GRNN::GRNN ( int ncase , int nin , int nout ) 48 | { 49 | ncases = ncase ; 50 | ninputs = nin ; 51 | noutputs = nout ; 52 | tset = (double *) malloc ( ncases * (ninputs + noutputs) * sizeof(double) ) ; 53 | sigma = (double *) malloc ( ninputs * sizeof(double) ) ; 54 | outwork = (double *) malloc ( noutputs * sizeof(double) ) ; 55 | reset () ; 56 | } 57 | 58 | 59 | GRNN::~GRNN () 60 | { 61 | if (tset != NULL) 62 | free ( tset ) ; 63 | if (sigma != NULL) 64 | free ( sigma ) ; 65 | if (outwork != NULL) 66 | free ( outwork ) ; 67 | } 68 | 69 | /* 70 | This discards any existing training data. 71 | It does not need to be called after construction, but it must 72 | be called if the user wants to reuse the GRNN object for a new dataset. 73 | */ 74 | 75 | void GRNN::reset () 76 | { 77 | nrows = 0 ; // No rows (via add_case()) yet present 78 | trained = 0 ; // Training not done yet 79 | } 80 | 81 | /* 82 | Build the training set one case at a time. 83 | The user must call this member EXACTLY ncases times after construction 84 | or a call to reset(), and before a call to train(). 85 | */ 86 | 87 | void GRNN::add_case ( double *newcase ) 88 | { 89 | if (nrows >= ncases) // Careful user never lets this happen 90 | return ; // But cheap insurance 91 | 92 | memcpy ( tset + nrows * (ninputs + noutputs) , newcase , 93 | (ninputs + noutputs) * sizeof(double) ) ; 94 | ++nrows ; 95 | } 96 | 97 | 98 | /* 99 | -------------------------------------------------------------------------------- 100 | 101 | predict() - Given an input vector, compute output using trained model 102 | 103 | -------------------------------------------------------------------------------- 104 | */ 105 | 106 | void GRNN::predict ( 107 | double *input , // Input vector 108 | double *output // Returned output 109 | ) 110 | { 111 | int icase, iout, ivar ; 112 | double *dptr, diff, dist, psum ; 113 | 114 | for (iout=0 ; iout 8 | #include "info.h" 9 | 10 | #define INTBUF 100 /* Incredibly conservative! (divisions 2^(-100) are tiny!) 
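   The adaptive scheme below pops an interval from this stack, compares the
   one-panel trapezoid estimate (lowres) with the two-panel estimate (hires), and
   accepts hires when the interval is narrow enough and the two estimates agree
   within tolerance; otherwise the two halves are pushed back for refinement.
   Because the error of hires is roughly (lowres - hires) / 3, fac is set to
   3 * tol.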
*/ 11 | 12 | double integrate ( 13 | double low , // Lower limit for definite integral 14 | double high , // Upper limit 15 | double min_width , // Demand subdivision this small or smaller 16 | double acc , // Relative interval width limit 17 | double tol , // Relative error tolerance 18 | double (*criter) (double) // Criterion function 19 | ) 20 | { 21 | int istack ; 22 | double sum, a, b, mid, fa, fb, fmid, lowres, hires, fac ; 23 | 24 | struct IntStack { 25 | double x0 ; 26 | double x1 ; 27 | double f0 ; 28 | double f1 ; 29 | } stack[INTBUF] ; 30 | 31 | fac = 3.0 * tol ; // Error is about (lowres-hires) / 3 32 | 33 | /* 34 | Start by initializing the stack to be the entire interval 35 | and the integral so far to be zero 36 | */ 37 | 38 | stack[0].x0 = low ; 39 | stack[0].f0 = criter ( low ) ; 40 | stack[0].x1 = high ; 41 | stack[0].f1 = criter ( high ) ; 42 | istack = 1 ; 43 | sum = 0.0 ; 44 | 45 | /* 46 | Main algorithm starts here. Pop interval off stack and test its quality. 47 | */ 48 | 49 | while (istack > 0) { // While there is still at least one interval on stack 50 | --istack ; // Pop this interval 51 | a = stack[istack].x0 ; 52 | b = stack[istack].x1 ; 53 | fa = stack[istack].f0 ; 54 | fb = stack[istack].f1 ; 55 | mid = 0.5 * (a + b) ; 56 | fmid = criter ( mid ) ; 57 | lowres = 0.5 * (b - a) * (fa + fb) ; // Trapezoidal rule 58 | hires = 0.25 * (b - a) * (fa + 2.0 * fmid + fb) ; // And refined value 59 | // If the interval is ridiculously narrow, no point in continuing 60 | // If it gets this far, chances are the integrand is discontinuous 61 | if (b - a <= acc * (1.0 + fabs(a) + fabs(b))) 62 | sum += hires ; // Quit trying to refine 63 | else if ((b - a) <= min_width && fabs(lowres-hires) < fac * (b - a)) 64 | sum += hires ; // Normal convergence flag 65 | else { 66 | stack[istack].x0 = a ; 67 | stack[istack].f0 = fa ; 68 | stack[istack].x1 = mid ; 69 | stack[istack].f1 = fmid ; 70 | ++istack ; 71 | if (istack < INTBUF) { // Insurance against catastrophe only 72 | stack[istack].x0 = mid ; // Should ALWAYS be true (easily!) 73 | stack[istack].f0 = fmid ; // If this if() fails, the answer will 74 | stack[istack].x1 = b ; // of course be wrong, but only due to 75 | stack[istack].f1 = fb ; // a horrendous underlying problem 76 | ++istack ; // like a singularity in the function 77 | } 78 | else { 79 | --istack ; // Error condition, so undo push 80 | sum += hires ; // And go with this best estimiate 81 | } 82 | } 83 | } 84 | return sum ; 85 | } 86 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/assessing-and-improving-prediction-and-classification/28736ace4e23f260aa4a19dbab092f668a96480a/LICENSE.txt -------------------------------------------------------------------------------- /LINREG.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* LINREG - Linear regression */ 4 | /* */ 5 | /* To use this class: */ 6 | /* 1) Construct a new instance of the class */ 7 | /* 2) Call add_case() exactly ncases times, each time providing the */ 8 | /* npred=ncols vector of predictors. 
*/ 9 | /* 3) Call solve() as many times as desired with various right hand sides */ 10 | /* 4) Optionally, call reset() and go to step 2 */ 11 | /* */ 12 | /* This does not include any checks for insufficient memory. */ 13 | /* It also assumes that the user calls add_case exactly ncases times */ 14 | /* and does not check for failure to do so. */ 15 | /* */ 16 | /******************************************************************************/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include "linreg.h" 22 | 23 | LinReg::LinReg ( int ncase , int ncol ) 24 | { 25 | ncases = ncase ; 26 | ncols = ncol ; 27 | svd = new SingularValueDecomp ( ncase , ncol ) ; 28 | reset () ; 29 | } 30 | 31 | 32 | LinReg::~LinReg () 33 | { 34 | if (svd != NULL) 35 | delete svd ; 36 | } 37 | 38 | /* 39 | This discards any existing design matrix. 40 | It does not need to be called after construction, but it must 41 | be called if the user wants to reuse the LinReg object for a new 42 | design matrix. 43 | */ 44 | 45 | void LinReg::reset () 46 | { 47 | nrows = 0 ; // No rows (via add_case()) yet present 48 | decomp = 0 ; // Decomposition not done yet 49 | } 50 | 51 | /* 52 | Build the design matrix one case at a time. 53 | The user must call this member EXACTLY ncases times after construction 54 | or a call to reset(), and before a call to solve(). 55 | */ 56 | 57 | void LinReg::add_case ( double *newcase ) 58 | { 59 | if (nrows >= ncases) // Careful user never lets this happen 60 | return ; // But cheap insurance 61 | 62 | memcpy ( svd->a + nrows * ncols , newcase , ncols * sizeof(double) ) ; 63 | ++nrows ; 64 | } 65 | 66 | /* 67 | After add_case has been called exactly ncases times, this may be called 68 | as many times as desired to solve a system. 69 | */ 70 | 71 | void LinReg::solve ( 72 | double eps , // Singularity limit, typically 1.e-8 or so 73 | double *rhs , // Right hand side, ncases long 74 | double *b // Output of solution, npred=ncols long 75 | ) 76 | { 77 | int i ; 78 | 79 | if (nrows != ncases) { // Careful user never lets this happen 80 | for (i=0 ; isvdcmp () ; // Do it now 87 | decomp = 1 ; // And flag that it has been done 88 | } 89 | 90 | memcpy ( svd->b , rhs , ncases * sizeof(double) ) ; 91 | svd->backsub ( eps , b ) ; 92 | } 93 | -------------------------------------------------------------------------------- /LINREG.H: -------------------------------------------------------------------------------- 1 | #ifndef SVD 2 | #define SVD 3 | class SingularValueDecomp { 4 | 5 | public: 6 | 7 | SingularValueDecomp ( int nrows , int ncols , int save_a=0 ) ; 8 | ~SingularValueDecomp () ; 9 | void svdcmp () ; 10 | void backsub ( double limit , double *soln ) ; 11 | 12 | int ok ; // Was everything legal and allocs successful? 13 | 14 | /* 15 | These are made public to allow access if desired. 16 | Normally, only 'a' (the design matrix) and 'b' (the right-hand-side) 17 | are written by the user. If 'save_a' is nonzero, 'a' is kept intact. 
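    A minimal usage sketch, using the members declared just below (error
    handling omitted; 'coefs' is a caller-supplied array ncols long):

       SingularValueDecomp svd ( nrows , ncols ) ;  // then check svd.ok
       // ... fill svd.a with the nrows by ncols design matrix, row by row ...
       svd.svdcmp () ;                    // decompose once
       // ... fill svd.b with an nrows right-hand side ...
       svd.backsub ( 1.e-8 , coefs ) ;    // coefs receives the ncols solution
       // backsub() may be called again with new b vectors without redoing svdcmp()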
18 | */ 19 | 20 | double *a ; // nrows by ncols input of design, output of U 21 | double *u ; // unless save_a nonzero, in which case U output in 'u' 22 | double *w ; // Unsorted ncols vector of singular values 23 | double *v ; // Ncols by ncols output of 'v' 24 | double *b ; // Nrows right-hand-side for backsub 25 | 26 | 27 | private: 28 | 29 | void bidiag ( double *matrix ) ; 30 | double bid1 ( int col , double *matrix , double scale ) ; 31 | double bid2 ( int col , double *matrix , double scale ) ; 32 | void right ( double *matrix ) ; 33 | void left ( double *matrix ) ; 34 | void cancel ( int low , int high , double *matrix ) ; 35 | void qr ( int low , int high , double *matrix ) ; 36 | void qr_mrot ( int col , double sine , double cosine , double *matrix ) ; 37 | void qr_vrot ( int col , double sine , double cosine ) ; 38 | 39 | int rows ; // Nrows preserved here 40 | int cols ; // And ncols 41 | double *work ; // Scratch vector ncols long 42 | double norm ; // Norm of 'a' matrix 43 | } ; 44 | #endif 45 | 46 | class LinReg { 47 | 48 | public: 49 | 50 | LinReg ( int ncase , int ncol ) ; 51 | ~LinReg () ; 52 | void reset () ; 53 | void add_case ( double *newcase ) ; 54 | void solve ( double eps , double *rhs , double *b ) ; 55 | 56 | 57 | private: 58 | SingularValueDecomp *svd ; 59 | int ncases ; // Number of cases 60 | int ncols ; // Number of columns 61 | int nrows ; // How many times has add_case() been called? 62 | int decomp ; // Has the decomposition been done yet? 63 | } ; 64 | -------------------------------------------------------------------------------- /LOGISTIC.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* LOGISTIC - Logistic regression */ 4 | /* */ 5 | /* Unlike most implementations (which use iterative gradient ascent) */ 6 | /* this uses simulated annealing. This is considerably slower than */ 7 | /* the traditional method, but more likely to find the global optimum. */ 8 | /* It is also more numerically stable. */ 9 | /* */ 10 | /* To use this class: */ 11 | /* 1) Construct a new instance of the class */ 12 | /* 2) Call add_case() exactly ncases times, each time providing the */ 13 | /* nin+1 vector of inputs and output. */ 14 | /* 3) Call train() */ 15 | /* 4) Call predict() as many times as desired */ 16 | /* 5) Optionally, call reset() and go to step 2 */ 17 | /* */ 18 | /* This does not include any checks for insufficient memory. */ 19 | /* It also assumes that the user calls add_case exactly ncases times */ 20 | /* and does not check for failure to do so. 
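   The fitted linear combination b0 + b'x is the log odds, log(p/(1-p)), so the
   probability itself is p = 1 / (1 + exp(-(b0 + b'x))).  Note that predict()
   returns the log odds, not p.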
*/ 21 | /* */ 22 | /******************************************************************************/ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "logistic.h" 29 | #include "minimize.h" 30 | 31 | double normal () ; 32 | 33 | static double max_exp = log ( 1.e190 ) ; 34 | inline double safe_exp ( double x ) 35 | { 36 | if (x <= max_exp) 37 | return exp ( x ) ; 38 | 39 | return 1.e190 ; 40 | } 41 | 42 | /* 43 | -------------------------------------------------------------------------------- 44 | 45 | Constructor, destructor, reset(), add_case() 46 | 47 | -------------------------------------------------------------------------------- 48 | */ 49 | 50 | Logistic::Logistic ( int ncase , int nin ) 51 | { 52 | ncases = ncase ; 53 | ninputs = nin ; 54 | coefs = (double *) malloc ( (ninputs +1) * sizeof(double) ) ; 55 | tset = (double *) malloc ( ncases * (ninputs + 1) * sizeof(double) ) ; 56 | reset () ; 57 | } 58 | 59 | 60 | Logistic::~Logistic () 61 | { 62 | if (tset != NULL) 63 | free ( tset ) ; 64 | if (coefs != NULL) 65 | free ( coefs ) ; 66 | } 67 | 68 | /* 69 | This discards any existing training data. 70 | It does not need to be called after construction, but it must 71 | be called if the user wants to reuse the Logistic object for a new dataset. 72 | */ 73 | 74 | void Logistic::reset () 75 | { 76 | nrows = 0 ; // No rows (via add_case()) yet present 77 | trained = 0 ; // Training not done yet 78 | } 79 | 80 | /* 81 | Build the training set one case at a time. 82 | The user must call this member EXACTLY ncases times after construction 83 | or a call to reset(), and before a call to train(). 84 | */ 85 | 86 | void Logistic::add_case ( double *newcase ) 87 | { 88 | if (nrows >= ncases) // Careful user never lets this happen 89 | return ; // But cheap insurance 90 | 91 | #if 0 92 | printf ( "\n---> " ) ; 93 | for (int i=0 ; i<=ninputs ; i++) 94 | printf ( " %.2lf", newcase[i] ) ; 95 | #endif 96 | memcpy ( tset + nrows * (ninputs + 1) , newcase , 97 | (ninputs + 1) * sizeof(double) ) ; 98 | ++nrows ; 99 | } 100 | 101 | 102 | /* 103 | -------------------------------------------------------------------------------- 104 | 105 | predict() - Given an input vector, compute output using trained model 106 | The output is the linear combination, the log odds ratio 107 | 108 | -------------------------------------------------------------------------------- 109 | */ 110 | 111 | void Logistic::predict ( 112 | double *input , // Input vector 113 | double *output // Returned output 114 | ) 115 | { 116 | int i ; 117 | 118 | *output = coefs[ninputs] ; // Constant term 119 | for (i=0 ; i best_y)) { 196 | first = 0 ; 197 | best_y = y ; 198 | memcpy ( best_wts , test_wts , ninputs * sizeof(double) ) ; 199 | } 200 | 201 | } // For inner loop iterations 202 | memcpy ( center , best_wts , ninputs * sizeof(double) ) ; 203 | std *= 0.7 ; 204 | } // For outer loop iterations 205 | 206 | logit_crit ( best_wts ) ; // Needed to set coefs correctly 207 | trained = 1 ; // Training complete 208 | 209 | #if 0 210 | printf ( "\n" ) ; 211 | for (i=0 ; i<=ninputs ; i++) 212 | printf ( " %.3lf", coefs[i] ) ; 213 | // getch () ; 214 | #endif 215 | 216 | 217 | free ( test_wts ) ; 218 | free ( best_wts ) ; 219 | free ( center ) ; 220 | } 221 | 222 | static double logit_crit ( double *x ) 223 | { 224 | int i ; 225 | double x1, y1, x2, y2, x3, y3 ; 226 | 227 | for (i=0 ; ininputs ; i++) 228 | local_logistic->coefs[i] = safe_exp ( x[i] ) ; 229 | 230 | glob_min ( -20.0 , 20.0 , 5 , 0 , -1.e160 , 
logit_unicrit , 231 | &x1 , &y1 , &x2 , &y2 , &x3 , &y3 ) ; 232 | 233 | y2 = brentmin ( 50 , -1.e160 , 1.e-10 , 1.e-10 , logit_unicrit , 234 | &x1 , &x2 , &x3 , y2 ) ; 235 | 236 | local_logistic->coefs[local_logistic->ninputs] = x2 ; 237 | return -y2 ; 238 | } 239 | 240 | static double logit_unicrit ( double t ) 241 | { 242 | double penalty ; 243 | 244 | penalty = 0.0 ; 245 | if (fabs ( t ) > 20.0) // Rare pathological event 246 | penalty = 1.e10 * (fabs ( t ) - 20.0) ; 247 | 248 | local_logistic->coefs[local_logistic->ninputs] = t ; 249 | return penalty - local_logistic->execute () ; 250 | } 251 | -------------------------------------------------------------------------------- /LOGISTIC.H: -------------------------------------------------------------------------------- 1 | class Logistic { 2 | 3 | public: 4 | 5 | Logistic ( int ncase , int nin ) ; 6 | ~Logistic () ; 7 | void reset () ; 8 | void add_case ( double *newcase ) ; 9 | void train () ; 10 | void predict ( double *input , double *output ) ; 11 | 12 | double execute () ; 13 | 14 | int ncases ; // Number of cases 15 | int ninputs ; // Number of inputs 16 | int nrows ; // How many times has add_case() been called? 17 | int trained ; // Has it been trained yet? 18 | double *tset ; // Ncases by (ninputs+1) matrix of training data 19 | double *coefs ; // Trained coefficient vector ninputs+1 long 20 | } ; 21 | 22 |  -------------------------------------------------------------------------------- /MINIMIZE.H: -------------------------------------------------------------------------------- 1 | extern int glob_min ( double low , double high , int npts , int log_space , 2 | double critlim , double (*criter) (double) , 3 | double *x1, double *y1 , double *x2, double *y2 , double *x3, double *y3 ) ; 4 | 5 | extern double brentmin ( int itmax , double critlim , double eps , 6 | double tol , double (*criter) (double) , 7 | double *x1 , double *x2 , double *x3 , double y ) ; 8 | 9 | extern double powell ( int maxits , double critlim , double tol , 10 | double (*criter) ( double * ) , int n , double *x , double ystart , 11 | double *base , double *p0 , double *direc ) ; 12 | -------------------------------------------------------------------------------- /MI_ONLY.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* MI_ONLY - Mutual information ONLY for continuous predicted and predictors */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "..\info.h" 15 | 16 | /* 17 | These are defined in MEM.CPP 18 | */ 19 | 20 | extern int mem_keep_log ; // Keep a log file? 
21 | extern char mem_file_name[] ; // Log file name 22 | extern int mem_max_used ; // Maximum memory ever in use 23 | 24 | int main ( 25 | int argc , // Number of command line arguments (includes prog name) 26 | char *argv[] // Arguments (prog name is argv[0]) 27 | ) 28 | 29 | { 30 | int i, j, k, nvars, ncases, irep, nreps, ivar, nties, ties ; 31 | int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; 32 | double *data, *work, dtemp, *save_info, criterion, *crits ; 33 | char filename[256], **names, depname[256] ; 34 | FILE *fp ; 35 | MutualInformationAdaptive *mi_adapt ; 36 | 37 | /* 38 | Process command line parameters 39 | */ 40 | 41 | #if 1 42 | if (argc != 5) { 43 | printf ( "\nUsage: MI_ONLY datafile n_indep depname nreps" ) ; 44 | printf ( "\n datafile - name of the text file containing the data" ) ; 45 | printf ( "\n The first line is variable names" ) ; 46 | printf ( "\n Subsequent lines are the data." ) ; 47 | printf ( "\n Delimiters can be space, comma, or tab" ) ; 48 | printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; 49 | printf ( "\n depname - Name of the 'dependent' variable" ) ; 50 | printf ( "\n It must be AFTER the first n_indep variables" ) ; 51 | printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; 52 | exit ( 1 ) ; 53 | } 54 | 55 | strcpy ( filename , argv[1] ) ; 56 | n_indep_vars = atoi ( argv[2] ) ; 57 | strcpy ( depname , argv[3] ) ; 58 | nreps = atoi ( argv[4] ) ; 59 | #else 60 | strcpy ( filename , "..\\SYNTH.TXT" ) ; 61 | n_indep_vars = 7 ; 62 | strcpy ( depname , "SUM1234" ) ; 63 | nreps = 100 ; 64 | #endif 65 | 66 | _strupr ( depname ) ; 67 | 68 | /* 69 | These are used by MEM.CPP for runtime memory validation 70 | */ 71 | 72 | _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; 73 | fp = fopen ( mem_file_name , "wt" ) ; 74 | if (fp == NULL) { // Should never happen 75 | printf ( "\nCannot open MEM.LOG file for writing!" ) ; 76 | return EXIT_FAILURE ; 77 | } 78 | fclose ( fp ) ; 79 | mem_keep_log = 0 ; // Change this to 1 to keep a memory use log (slows execution!) 80 | mem_max_used = 0 ; 81 | 82 | /* 83 | Open the text file to which results will be written 84 | */ 85 | 86 | fp = fopen ( "MI_ONLY.LOG" , "wt" ) ; 87 | if (fp == NULL) { // Should never happen 88 | printf ( "\nCannot open MI_ONLY.LOG file for writing!" ) ; 89 | return EXIT_FAILURE ; 90 | } 91 | 92 | /* 93 | Read the file and locate the index of the dependent variable 94 | */ 95 | 96 | if (readfile ( filename , &nvars , &names , &ncases , &data )) 97 | return EXIT_FAILURE ; 98 | 99 | for (idep=0 ; idep n_indep_vars && ivar != idep) 129 | continue ; // Check only the variables selected by the user 130 | for (i=0 ; i 0.05) { 139 | ++ties ; 140 | fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", 141 | names[ivar], 100.0 * nties / (double) ncases ) ; 142 | } 143 | } // For all variables 144 | if (ties) { 145 | fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; 146 | fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; 147 | } 148 | 149 | /* 150 | Allocate scratch memory and create the MutualInformation object using the 151 | dependent variable 152 | 153 | crits - Mutual information criterion 154 | index - Indices that sort the criterion 155 | save_info - Ditto, this is univariate information, to be sorted 156 | mi_adapt - The MutualInformation object, constructed with the 'dependent' variable 157 | */ 158 | 159 | MEMTEXT ( "MI_ONLY work allocs plus MutualInformation" ) ; 160 | crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 161 | assert ( crits != NULL ) ; 162 | index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 163 | assert ( index != NULL ) ; 164 | mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 165 | assert ( mcpt_max_counts != NULL ) ; 166 | mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 167 | assert ( mcpt_same_counts != NULL ) ; 168 | mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 169 | assert ( mcpt_solo_counts != NULL ) ; 170 | save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 171 | assert ( save_info != NULL ) ; 172 | 173 | for (irep=0 ; irep0) 179 | 180 | if (irep) { // If doing permuted runs, shuffle 181 | i = ncases ; // Number remaining to be shuffled 182 | while (i > 1) { // While at least 2 left to shuffle 183 | j = (int) (unifrand () * i) ; 184 | if (j >= i) 185 | j = i - 1 ; 186 | dtemp = work[--i] ; 187 | work[i] = work[j] ; 188 | work[j] = dtemp ; 189 | } 190 | } 191 | 192 | // Here we use a tiny split theshold (instead of the usual 6.0) so that it picks up 193 | // small amounts of mutual information (perhaps including noise). 194 | // If we used 6.0, nearly all permutations of any reasonably sized dataset 195 | // would have a computed mutual information of zero. It's safe picking up 196 | // some noise because the permutation test will account for this. 197 | 198 | mi_adapt = new MutualInformationAdaptive ( ncases , work , 1 , 0.1 ) ; // Deliberately tiny for low information 199 | assert ( mi_adapt != NULL ) ; 200 | 201 | /* 202 | Compute and save the mutual information for the dependent variable 203 | with each individual independent variable candidate. 
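    The criterion is the adaptive-partitioning estimate of the mutual information
       I(X;Y) = SUM over x,y of  p(x,y) * log( p(x,y) / (p(x) * p(y)) )
    between the (possibly permuted) dependent variable and each candidate.  The
    permuted replications (irep > 0) shuffle the dependent variable, destroying
    any true association, so the counts accumulated in the loops below give
    Monte-Carlo permutation p-values for each candidate's information.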
204 | */ 205 | 206 | for (icand=0 ; icandmut_inf ( work , 1 ) ; 211 | 212 | save_info[icand] = criterion ; // We will sort this when all candidates are done 213 | 214 | if (irep == 0) { // If doing original (unpermuted), save criterion 215 | index[icand] = icand ; // Will need original indices when criteria are sorted 216 | crits[icand] = criterion ; 217 | mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now 218 | } 219 | else { 220 | if (criterion >= crits[icand]) 221 | ++mcpt_solo_counts[icand] ; 222 | } 223 | } // Initial list of all candidates 224 | 225 | delete mi_adapt ; 226 | mi_adapt = NULL ; 227 | 228 | if (irep == 0) // Find the indices that sort the candidates per criterion 229 | qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; 230 | 231 | else { 232 | qsortd ( 0 , n_indep_vars-1 , save_info ) ; 233 | for (icand=0 ; icand= crits[index[icand]]) 235 | ++mcpt_same_counts[index[icand]] ; 236 | if (save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest 237 | ++mcpt_max_counts[index[icand]] ; 238 | } 239 | } 240 | 241 | } // For all reps 242 | 243 | fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); 244 | 245 | fprintf ( fp , "\n" ) ; 246 | fprintf ( fp , "\n" ) ; 247 | fprintf ( fp , "\nPredictors, in order of decreasing mutual information" ) ; 248 | fprintf ( fp , "\n" ) ; 249 | fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; 250 | 251 | for (icand=0 ; icand