├── DEEP.exe
├── Thumbs.db
├── V2_MANUAL.pdf
├── cudart64_65.dll
├── 9781484235904.jpg
├── errata.md
├── README.md
├── Contributing.md
├── V1 Source
    ├── README.TXT
    ├── GENERATIVE.CPP
    ├── RBM_THR1.CPP
    ├── MLFN_CUDA.CPP
    ├── SVDCMP.CPP
    └── MLFN_THR.CPP
├── LICENSE.txt
└── V2 Source
    ├── MRFFT_P.TXT
    ├── CUDA_GRAD.TXT
    ├── SERIES.TXT
    ├── MRFFT.TXT
    ├── MRFFT_K.TXT
    ├── SVDCMP.TXT
    └── THREADED_GRAD.TXT


/DEEP.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/deep-belief-nets-vol-1/HEAD/DEEP.exe


--------------------------------------------------------------------------------
/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/deep-belief-nets-vol-1/HEAD/Thumbs.db


--------------------------------------------------------------------------------
/V2_MANUAL.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/deep-belief-nets-vol-1/HEAD/V2_MANUAL.pdf


--------------------------------------------------------------------------------
/cudart64_65.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/deep-belief-nets-vol-1/HEAD/cudart64_65.dll


--------------------------------------------------------------------------------
/9781484235904.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/deep-belief-nets-vol-1/HEAD/9781484235904.jpg


--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
 1 | # Errata for *Deep Belief Nets in C++ and CUDA C: Volume 1*
 2 | 
 3 | On **page xx** [Summary of error]:
 4 |  
 5 | Details of error here. Highlight key pieces in **bold**.
 6 | 
 7 | ***
 8 | 
 9 | On **page xx** [Summary of error]:
10 |  
11 | Details of error here. Highlight key pieces in **bold**.
12 | 
13 | ***


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Apress Source Code
 2 | 
 3 | This repository accompanies [*Deep Belief Nets in C++ and CUDA C: Volume 1*](http://www.apress.com/9781484235904) by Timothy Masters (Apress, 2018).
 4 | 
 5 | [comment]: #cover
 6 | ![Cover image](9781484235904.jpg)
 7 | 
 8 | Download the files as a zip using the green button, or clone the repository to your machine using Git.
 9 | 
10 | ## Releases
11 | 
12 | Release v1.0 corresponds to the code in the published book, without corrections or updates.
13 | 
14 | ## Contributions
15 | 
16 | See the file Contributing.md for more information on how you can contribute to this repository.


--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Apress Source Code
 2 | 
 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers.
 4 | 
 5 | ## How to Contribute
 6 | 
 7 | 1. Make sure you have a GitHub account.
 8 | 2. Fork the repository for the relevant book.
 9 | 3. Create a new branch on which to make your change, e.g. 
10 | `git checkout -b my_code_contribution`
11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted.
12 | 5. Submit a pull request.
13 | 
14 | Thank you for your contribution!


--------------------------------------------------------------------------------
/V1 Source/README.TXT:
--------------------------------------------------------------------------------
 1 | The code in the book has been simplified by the removal
 2 | of memory allocation, error checking, et cetera,
 3 | in order to provide the clearest presentation.
 4 | 
 5 | Most users would like examples of reasonable ways
 6 | to handle these issues.  Also, few readers would want
 7 | to type in those long subroutines.
 8 | 
 9 | For these reasons I am making available mostly or
10 | entirely complete subroutines in this download.
11 | 
12 | However, because these routines are part of a large program,
13 | they necessarily contain numerous references to external
14 | routines.  These references must be replaced by the user
15 | with routines appropriate to the program the user
16 | is constructing.  This should not be difficult, as these
17 | external references are just to things like memory allocation
18 | and checking if the user has pressed the ESCape key.
19 | 
20 | Also, this code includes numerous references to a Model class
21 | whose declaration is not supplied.  This is because the
22 | complete declaration is large and complex.  But the
23 | references to Model members are all straightforward,
24 | so the reader should have no difficulty adapting these
25 | references to his/her own Model class.


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Freeware License, some rights reserved
 2 | 
 3 | Copyright (c) 2018 Timothy Masters
 4 | 
 5 | Permission is hereby granted, free of charge, to anyone obtaining a copy 
 6 | of this software and associated documentation files (the "Software"), 
 7 | to work with the Software within the limits of freeware distribution and fair use. 
 8 | This includes the rights to use, copy, and modify the Software for personal use. 
 9 | Users are also allowed and encouraged to submit corrections and modifications 
10 | to the Software for the benefit of other users.
11 | 
12 | It is not allowed to reuse,  modify, or redistribute the Software for 
13 | commercial use in any way, or for a user’s educational materials such as books 
14 | or blog articles without prior permission from the copyright holder. 
15 | 
16 | The above copyright notice and this permission notice need to be included 
17 | in all copies or substantial portions of the software.
18 | 
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/V2 Source/MRFFT_P.TXT:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  MRFFT_P - This contains the 'permute' routine called from MRFFT.          */
  4 | /*                                                                            */
  5 | /******************************************************************************/
  6 | 
  7 | void permute ( double *real , double *imag , int ntot , int npts ,   // real, imag: data arrays, reordered in place
  8 |                int nspan , int inc , int n_facs , int n_sq_facs , double *work1 ,   // inc: index step through real/imag; work1: scratch for real values
  9 |                double *work2 , int *index , int *factors , int max_factor )   // work2: scratch for imag values; index: int scratch (overwritten); factors: MODIFIED below
 10 | {
 11 |    int i, j, k, ibase, ip, span_inc, sqfac_index, fac_sum, offset, lfm1 ;
 12 |    int inner_span, inner_span_m1, tot_pts, jmp, jump, jump_save, nearp, farp ;
 13 |    int this_fac, other_fac, current_index, cycle, limit, which_index ;
 14 |    double temp, *wptr1, *wptr2 ;
 15 | 
 16 |    index[0] = inner_span = inc * nspan ;   // Also the first entry of the span table built below
 17 |    inner_span_m1 = inner_span - 1 ;
 18 |    tot_pts = inc * ntot ;
 19 |    jump = inner_span / npts ;
 20 |    current_index = 0 ;                     // Replaced by the cycle search if a cycle longer than 1 is found
 21 | 
 22 | /*
 23 |    Do the square factors: build a span table in index[], then swap elements of real/imag in place
 24 | */
 25 | 
 26 |    if (n_sq_facs) {
 27 |       i = 0 ;
 28 |       j = 2 * n_sq_facs ;
 29 |       if (j >= n_facs)
 30 |          --j ;
 31 |       index[j+1] = jump ;
 32 |       for (;;) {                           // Fill index[] from both ends: quotients going up, products going down
 33 |          index[i+1] = index[i] / factors[i] ;
 34 |          index[j] = index[j+1] * factors[i] ;
 35 |          if (++i >= --j)                   // Stop when the two ends meet
 36 |             break ;
 37 |          }
 38 |       nearp = jump ;
 39 |       farp = span_inc = index[1] ;
 40 |       jump_save = index[j+1] ;             // Becomes 'jump' once the square factors are finished
 41 |       which_index = 1 ;
 42 | 
 43 |       if (npts != ntot) {
 44 | mv_permute:
 45 |          limit = nearp + jump ;  // Permutations for multivariate transform
 46 |          while (nearp < limit) {           // Swap a run of elements; both positions step by inc
 47 |             temp = real[nearp] ;
 48 |             real[nearp] = real[farp] ;
 49 |             real[farp] = temp ;
 50 |             temp = imag[nearp] ;
 51 |             imag[nearp] = imag[farp] ;
 52 |             imag[farp] = temp ;
 53 |             nearp += inc ;
 54 |             farp += inc ;
 55 |             }
 56 | 
 57 |          k = inner_span - jump ;           // Step both positions on to the next run
 58 |          nearp += k ;
 59 |          farp += k ;
 60 |          if (nearp >= tot_pts-1) {         // Went past the end of the data: pull both positions back
 61 |             nearp += jump - tot_pts ;
 62 |             farp += span_inc - tot_pts ;
 63 |             if (farp < inner_span_m1)
 64 |                goto mv_permute ;
 65 |             }
 66 |          else
 67 |             goto mv_permute ;
 68 | 
 69 |          for (;;) {                        // Move farp to its next starting position using the span table
 70 |             ++which_index ;
 71 |             farp += index[which_index] - index[which_index-2] ;
 72 |             if (farp >= index[which_index-1])
 73 |                continue ;                  // Still too large: adjust again with the next table entry
 74 |             which_index = 1 ;
 75 |             for (;;) {
 76 |                if (nearp < farp)           // Swap only when nearp is below farp
 77 |                   goto mv_permute ;
 78 |                nearp += jump ;
 79 |                farp += span_inc ;
 80 |                if (farp >= inner_span_m1)
 81 |                   break ;
 82 |                }
 83 |       
 84 |             if (nearp >= inner_span_m1)    // Every position has been visited
 85 |                break ;
 86 |             }
 87 |          }
 88 | 
 89 | /*
 90 |    Permutation for single-value transform (npts == ntot): one pair of elements is swapped per step
 91 | */
 92 | 
 93 |       else {
 94 | permute_loop:
 95 |          for (;;) {
 96 |             temp = real[nearp] ;
 97 |             real[nearp] = real[farp] ;
 98 |             real[farp] = temp ;
 99 |             temp = imag[nearp] ;
100 |             imag[nearp] = imag[farp] ;
101 |             imag[farp] = temp ;
102 |             nearp += inc ;
103 |             farp += span_inc ;
104 |             if (farp >= inner_span_m1)
105 |                break ;
106 |             }
107 | 
108 |          for (;;) {                        // Move farp to its next starting position using the span table
109 |             ++which_index ;
110 |             farp += index[which_index] - index[which_index-2] ;
111 |             if (farp >= index[which_index-1])
112 |                continue ;                  // Still too large: adjust again with the next table entry
113 |             which_index = 1 ;
114 |             for (;;) {
115 |                if (nearp >= farp) {        // Skip ahead without swapping until nearp is below farp
116 |                   nearp += inc ;
117 |                   farp += span_inc ;
118 |                   if (farp >= inner_span_m1)
119 |                      break ;
120 |                   }
121 |                else
122 |                   goto permute_loop ;
123 |                }
124 |             if (nearp >= inner_span_m1)    // Every position has been visited
125 |                break ;
126 |             }
127 |          }
128 | 
129 |       jump = jump_save ;                   // Value taken from the span table above
130 |       }
131 | 
132 | /*
133 |    The square factors are done.  If that is all of them, we are done.
134 | */
135 | 
136 |    if (2*n_sq_facs+1 >= n_facs)
137 |       return ;
138 | 
139 | /*
140 |    Permutations for square-free factors.  NOTE: the caller's factors[] array is overwritten with running products here.
141 | */
142 | 
143 |    sqfac_index = index[n_sq_facs] ;
144 |    k = n_facs - n_sq_facs ;
145 |    factors[k] = 1 ;                     // Writes element n_facs-n_sq_facs, so factors[] must be at least that long plus one
146 |    while (--k > n_sq_facs)
147 |       factors[k-1] *= factors[k] ;      // Each entry becomes the product of itself and the entries after it
148 | 
149 |    other_fac = factors[n_sq_facs++] ;   // n_sq_facs is passed by value, so this increment is local
150 |    lfm1 = other_fac - 1 ;
151 |    k = n_sq_facs ;
152 |    this_fac = factors[k] ;
153 |    fac_sum = 0 ;
154 |    
155 |    for (i=0 ; i<lfm1 ;) {               // Fill index[0..lfm1-1]; i advances only when an entry is stored
156 |       fac_sum += this_fac ;
157 |       if (fac_sum >= other_fac) {
158 |          fac_sum -= other_fac ;
159 |          other_fac = this_fac ;
160 |          this_fac = factors[++k] ;
161 |          continue ;
162 |          }
163 |       other_fac = factors[n_sq_facs-1] ;
164 |       k = n_sq_facs ;
165 |       this_fac = factors[k] ;
166 |       index[i++] = fac_sum ;
167 |       }
168 | 
169 | /*
170 |    Determine permutation cycles > 1.  Every entry except the starting entry of each such cycle is negated.
171 | */
172 | 
173 |    which_index = 0 ;
174 |    for (;;) {
175 |       while (index[which_index] < 0)    // Skip entries already marked
176 |          ++which_index ;
177 |       cycle = index[which_index++] ;
178 |       if (cycle != which_index) {       // Entry does not map to itself (values are 1-based): follow the cycle
179 |          for (;;) {
180 |             i = cycle - 1 ;
181 |             cycle = index[i] ;
182 |             index[i] = -cycle ;
183 |             if (cycle == which_index)   // Back at the starting entry, which stays positive
184 |                break ;
185 |             }
186 |          current_index = cycle ;        // 1-based position of the start of the most recently found cycle
187 |          continue ;
188 |          }
189 |       index[which_index-1] = -which_index ;   // Entry maps to itself: mark it negative as well
190 |       if (which_index == lfm1)
191 |          break ;
192 |       }
193 | 
194 |    max_factor *= inc ;                  // Local copy only; now in the same units as inner_span
195 | 
196 | /*
197 |    Reorder: per cycle, save one segment in work1/work2, shift the other segments along the cycle, then store the saved one
198 | */
199 | 
200 |    which_index = current_index ;
201 |    tot_pts -= sqfac_index ;
202 | 
203 |    while (tot_pts >= 0) {
204 |       while (index[--which_index] < 0) ;   // Back up to the nearest non-negated entry (the start of a cycle)
205 |       jmp = jump ;
206 |       offset = tot_pts - inc ;
207 | 
208 |       for (;;) {                           // Handle the span in pieces no longer than max_factor
209 |          inner_span = jmp ;
210 |          if (inner_span > max_factor)
211 |             inner_span = max_factor ;
212 |          jmp -= inner_span ;
213 |          cycle = index[which_index] ;
214 |          ibase = offset + jump * cycle + jmp ;
215 |          ip = ibase + inner_span ;
216 |          wptr1 = work1 ;
217 |          wptr2 = work2 ;
218 |          for (;;) {                        // Save the first segment of the cycle in the work arrays
219 |             *wptr1++ = real[ip] ;
220 |             *wptr2++ = imag[ip] ;
221 |             ip -= inc ;
222 |             if (ip == ibase)               // Equality test: assumes inner_span is a positive multiple of inc - TODO confirm
223 |                break ;
224 |             }
225 |          for (;;) {                        // Shift each remaining segment one step along the cycle
226 |             ip = ibase + inner_span ;
227 |             k = jump * (cycle + index[cycle-1]) ;
228 |             wptr1 = real - k ;             // So that wptr1[ip] reads real[ip-k]
229 |             wptr2 = imag - k ;             // So that wptr2[ip] reads imag[ip-k]
230 |             for (;;) {
231 |                real[ip] = wptr1[ip] ;
232 |                imag[ip] = wptr2[ip] ;
233 |                ip -= inc ;
234 |                if (ip == ibase)
235 |                   break ;
236 |                }
237 |             ibase -= k ;
238 |             cycle = -index[cycle-1] ;      // Cycle members were negated above, so this recovers the next position
239 |             if (cycle == which_index+1)    // Back at the starting entry of the cycle
240 |                break ;
241 |             }
242 |          ip = ibase + inner_span ;
243 |          wptr1 = work1 ;
244 |          wptr2 = work2 ;
245 |          for (;;) {                        // Store the saved segment in the slot left open
246 |             real[ip] = *wptr1++ ;
247 |             imag[ip] = *wptr2++ ;
248 |             ip -= inc ;
249 |             if (ip == ibase)
250 |                break ;
251 |             }
252 |          if (! jmp)                        // The whole span has been handled
253 |             break ;
254 |          }
255 |       if (which_index)                     // Lower entries remain to be examined for cycle starts
256 |          continue ;
257 | 
258 |       which_index = current_index ;        // Start over on the next block of data
259 |       tot_pts -= sqfac_index ;
260 |       if (tot_pts < 0)                     // Same test as the while condition above
261 |          break ;
262 |       }
263 |    return ;
264 | }
265 | 
266 | 


--------------------------------------------------------------------------------
/V2 Source/CUDA_GRAD.TXT:
--------------------------------------------------------------------------------
  1 | /*
  2 | --------------------------------------------------------------------------------
  3 | 
  4 |    gradient_cuda - Compute the gradient for the entire training set
  5 | 
  6 | --------------------------------------------------------------------------------
  7 | */
  8 | 
  9 | double CpxAuto::gradient_cuda (   // Returns the error measure plus the weight penalty; grad is filled in
 10 |    int nc ,             // Number of cases
 11 |    int nin ,            // Number of (possibly complex) inputs
 12 |    double *input ,      // Nc by max_neurons input matrix
 13 |    int nout ,           // Number of (possibly complex) outputs
 14 |    double *target ,     // Nc by nout target matrix, or autoencoding if NULL
 15 |    int n_layers ,       // Number of layers, including output (Number of hidden is one less than this)
 16 |    int *nhid ,          // Number of hidden neurons in each layer
 17 |    int n_weights ,      // Total (actual) number of weights, including final layer and bias
 18 |    double *weights[] ,  // Weight matrices for layers
 19 |    int use_final_layer_weights , // Use final_layer_weights (vs last weight layer)?
 20 |    double *grad         // Concatenated gradient vector, which is computed here
 21 |    )
 22 | {
 23 |    int i, k, n, ilayer, ineuron, ivar, ret_val, ibatch, n_in_batch, n_subsets, istart, istop, n_done, max_batch ;  // NOTE(review): k is not used in the code shown
 24 |    int n_prior, gradlen, nin_this_layer, timer, n_last_layer_weights, mult ;  // NOTE(review): timer is not used in the code shown
 25 |    double mse, wpen, *wptr, *gptr, *last_layer_weights ;
 26 |    char msg[256] ;      // Handed to cpx_cuda_init; not examined afterward in this routine
 27 | 
 28 |    assert ( n_layers >= 2 ) ;  // Use CUDA only if at least one hidden layer
 29 | 
 30 |    mult = is_complex  ?  2 : 1 ;  // Each complex quantity takes two doubles
 31 | 
 32 |    if (use_final_layer_weights) {                      // Full CpxAuto model
 33 |       assert ( target != NULL ) ;
 34 |       last_layer_weights = final_layer_weights ;
 35 |       n_last_layer_weights = n_final_layer_weights ;   // Per output, not total; If complex, this is actual
 36 |       }
 37 | 
 38 |    else {                                              // Greedily training an autoencoder
 39 |       assert ( target == NULL ) ;                      // which may or may not be complex
 40 |       last_layer_weights = weights[n_layers-1] ;
 41 |       n_last_layer_weights = mult * (nhid[n_layers-2] + 1) ;  // Inputs from the last hidden layer, plus bias
 42 |       }
 43 | 
 44 | // Setup pointers to gradient for each layer (grad_ptr[ilayer] points into the caller's grad vector)
 45 |    gptr = grad ;
 46 | 
 47 |    for (ilayer=0 ; ilayer<n_layers ; ilayer++) {
 48 |       grad_ptr[ilayer] = gptr ;                      // This layer's slice starts here
 49 | 
 50 |       if (ilayer == 0  &&  n_layers == 1) {          // Direct input to output?  (Cannot occur when the n_layers>=2 assert above holds)
 51 |          n = nout * mult * (nin+1) ;                 // This many inputs to each neuron in this layer
 52 |          gptr += n ;                                 // Not needed, but it illustrates the process
 53 |          }
 54 | 
 55 |       else if (ilayer == 0) {                        // First hidden layer?
 56 |          n = nhid[ilayer] * mult * (nin+1) ;         // This many inputs to each neuron in this layer
 57 |          gptr += n ;
 58 |          }
 59 | 
 60 |       else if (ilayer < n_layers-1) {                   // Subsequent hidden layer?
 61 |          n = nhid[ilayer] * mult * (nhid[ilayer-1]+1) ; // This many inputs to each neuron in this layer
 62 |          gptr += n ;
 63 |          }
 64 | 
 65 |       else
 66 |          n = nout * mult * (nhid[ilayer-1]+1) ;         // Not needed but illustrates process
 67 |       } // For all layers, including output
 68 | 
 69 | /*
 70 |    In order to prevent integer overflow in allocating memory for the gradient
 71 |    we compute the minimum number of batches needed to get each batch small enough.
 72 | */
 73 | 
 74 |    gradlen = 0 ;
 75 |    n_prior = nin ;
 76 |    for (i=0 ; i<n_layers-1 ; i++) {   // Hidden layers
 77 |       gradlen += mult * nhid[i] * (n_prior + 1) ;
 78 |       n_prior = nhid[i] ;
 79 |       }
 80 |    gradlen += mult * nout * (n_prior + 1) ;    // Output layer
 81 |    assert ( gradlen == n_weights ) ;           // The recomputed length must agree with the caller's count
 82 | 
 83 |    max_batch = MAXPOSNUM / (gradlen * sizeof(float)) ;  // Memory allocation size
 84 |    if (max_batch > 65535)                               // Grid dimension
 85 |       max_batch = 65535 ;
 86 |    if (max_batch > nc)                                  // Never more cases per batch than exist
 87 |       max_batch = nc ;
 88 |    n_subsets = nc / max_batch + 1 ;    // The +1 is required
 89 | 
 90 |    if (n_subsets < TrainParams.n_subsets)  // Allow user to increase to prevent Windows timeout
 91 |       n_subsets = TrainParams.n_subsets ;
 92 | 
 93 |    if (n_subsets > nc) {  // Happens only if user specifies a huge model for a tiny dataset
 94 |       ... Issue error message and abort   // NOTE(review): elided pseudo-code in this listing; not compilable as written
 95 |       }
 96 | 
 97 | /*
 98 |    Initialize CUDA device if not yet done for this session
 99 | 
100 |    Programming WARNING... If ANY of the parameters in the call to cpx_cuda_init change,
101 |                           then cpx_cuda_cleanup MUST be called and init redone!
102 | */
103 | 
104 |    if (! cpx_cuda_initialized) {
105 | 
106 |       n_done = 0 ;         // Must find max batch size for cuda init
107 |       for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {           // Same batch-size rule as the gradient loop below
108 |          n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
109 |          if (ibatch == 0  ||  n_in_batch > max_batch)
110 |             max_batch = n_in_batch ;
111 |          n_done += n_in_batch ;
112 |          }
113 | 
114 |       cpx_cuda_init ( is_complex , classifier && (target != NULL) , class_ids ,
115 |                       nc , nin , max_neurons , input ,
116 |                       nout , target , max_batch , n_layers , nhid , msg ) ;  // NOTE(review): any status it reports is not checked here
117 | 
118 |       cpx_cuda_initialized = 1 ;
119 |       }
120 | 
121 | 
122 |    if (cuda_weights_changed) {          // Send weights to the device only when flagged as changed
123 |       cuda_cpx_weights_to_device ( nin , nout , n_layers , nhid , weights , last_layer_weights ) ;
124 |       cuda_weights_changed = 0 ;
125 |       }
126 | 
127 | /*
128 |    Gradient computation starts here
129 | */
130 | 
131 |    for (i=0 ; i<n_weights ; i++)        // Zero before the batch loop
132 |       grad[i] = 0.0 ;
133 | 
134 |    istart = 0 ;         // Batch start = training data start
135 |    n_done = 0 ;         // Number of training cases done in this epoch so far
136 | 
137 |    for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {
138 |       n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
139 |       istop = istart + n_in_batch ;                         // Stop just before this index
140 | 
141 | /*
142 |    Forward pass
143 | */
144 | 
145 |       for (ilayer=0 ; ilayer<n_layers-1 ; ilayer++)        // Hidden layers in order, then the output layer
146 |          cuda_cpx_hidden_activation ( istart , istop , nhid[ilayer] , ilayer , 1 ) ;
147 | 
148 |       cuda_cpx_output_activation ( istart , istop , nout ) ;
149 | 
150 |       if (classifier && (target != NULL))                  // Softmax only when classifying with supplied targets
151 |          cuda_cpx_softmax ( istart , istop ) ;
152 | 
153 | /*
154 |    Backward pass
155 | */
156 | 
157 |       cuda_cpx_output_delta ( istart , istop , classifier && (target != NULL) , nout ) ;
158 |       cuda_cpx_output_gradient ( n_in_batch , nhid[n_layers-2] , n_layers-2 , nout ) ;
159 | 
160 |       for (ilayer=n_layers-2 ; ilayer>0 ; ilayer--)        // Last hidden layer back to the second; the first is done separately
161 |          cuda_cpx_subsequent_hidden_gradient ( n_in_batch , ilayer ,
162 |                  nhid[ilayer] , nhid[ilayer-1] , ilayer==n_layers-2 ) ;
163 | 
164 |       cuda_cpx_first_hidden_gradient ( istart , istop , nin , nhid[0] , n_layers==2 ) ;
165 | 
166 |       cuda_cpx_fetch_gradient ( n_in_batch , grad ) ;      // Presumably adds this batch's result into grad (zeroed above) - TODO confirm
167 | 
168 |       n_done += n_in_batch ;
169 |       istart = istop ;
170 |       }  // For all batches
171 | 
172 |    for (i=0 ; i<n_weights ; i++)
173 |       grad[i] /= nc * nout ;    // Divide by cases times outputs; NOTE(review): nc * nout is an int product
174 | 
175 | 
176 |    if (classifier && (target != NULL)) {
177 |       cuda_cpx_ll ( nc , &mse ) ;
178 |       mse /= nout ;  // cuda_cpx_ll() divided by n but not nout
179 |       }
180 | 
181 |    else
182 |       ret_val = cuda_cpx_mse ( nc * nout , &mse ) ;  // NOTE(review): ret_val is stored but never examined in this routine
183 | 
184 | /*
185 |    Deal with weight penalty
186 |    First block of code does hidden layers, second does output layer
187 | */
188 | 
189 |    wpen = TrainParams.wpen / n_weights ;  // User's penalty factor, scaled by the number of weights
190 |    penalty = 0.0 ;
191 | 
192 |    nin_this_layer = nin ;
193 | 
194 |    for (ilayer=0 ; ilayer<n_layers-1 ; ilayer++) {  // Do all hidden layers
195 | 
196 |       for (ineuron=0 ; ineuron<nhid[ilayer] ; ineuron++) {
197 |          wptr =  weights[ilayer] + ineuron * mult * (nin_this_layer+1) ;  // Weights for this neuron in this layer
198 |          gptr = grad_ptr[ilayer] + ineuron * mult * (nin_this_layer+1) ;  // Ditto grad
199 |          for (ivar=0 ; ivar<mult*nin_this_layer ; ivar++) {               // Do not include bias
200 |             penalty += wptr[ivar] * wptr[ivar] ;
201 |             gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;   // Penalty derivative is subtracted; suggests grad holds the negative gradient - TODO confirm
202 |             }
203 |          }
204 |       nin_this_layer = nhid[ilayer] ;
205 |       }
206 | 
207 |    for (ineuron=0 ; ineuron<nout ; ineuron++) {
208 |       wptr = last_layer_weights + ineuron * n_last_layer_weights ;   // Output-layer rows are n_last_layer_weights apart
209 |       gptr = grad_ptr[n_layers-1] + ineuron * n_last_layer_weights ;
210 |       for (ivar=0 ; ivar<mult*nin_this_layer ; ivar++) {             // Do not include bias
211 |          penalty += wptr[ivar] * wptr[ivar] ;
212 |          gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
213 |          }
214 |       }
215 | 
216 |    penalty *= wpen ;        // Sum of squared non-bias weights times the scaled factor
217 |    return mse + penalty ;   // Error measure plus weight penalty
218 | }


--------------------------------------------------------------------------------
/V2 Source/SERIES.TXT:
--------------------------------------------------------------------------------
  1 | // The routines and code fragments here are related to preprocessing a time series
  2 | 
  3 | 
  4 | /*
  5 | --------------------------------------------------------------------------------
  6 | 
  7 |    Compute first, second, and third-order normalized orthogonal coefs
  8 |    for n data points.
  9 |    Form the dot product of c1 with a series to compute the linear slope.
 10 |    Use c2 or c3 in the same way for a quadratic or cubic fit.
 11 | 
 12 | --------------------------------------------------------------------------------
 13 | */
 14 | 
 15 | // Scale vec (n long) to unit Euclidean length.  Caller ensures vec is not all zero.
 16 | 
 17 | static void leg3_unit_length ( int n , double *vec )
 18 | {
 19 |    int i ;
 20 |    double len ;
 21 | 
 22 |    len = 0.0 ;
 23 |    for (i=0 ; i<n ; i++)
 24 |       len += vec[i] * vec[i] ;
 25 | 
 26 |    len = sqrt ( len ) ;
 27 |    for (i=0 ; i<n ; i++)
 28 |       vec[i] /= len ;
 29 | }
 30 | 
 31 | // Subtract the mean of vec (n long) from each of its elements.
 32 | 
 33 | static void leg3_center ( int n , double *vec )
 34 | {
 35 |    int i ;
 36 |    double mean ;
 37 | 
 38 |    mean = 0.0 ;
 39 |    for (i=0 ; i<n ; i++)
 40 |       mean += vec[i] ;
 41 | 
 42 |    mean /= n ;
 43 |    for (i=0 ; i<n ; i++)
 44 |       vec[i] -= mean ;
 45 | }
 46 | 
 47 | /*
 48 |    legendre_3 - Fill c1, c2, c3 (each n long) with unit-length, mutually
 49 |    orthogonal linear, quadratic, and cubic coefficient vectors.
 50 | 
 51 |    A fit of order k needs at least k+1 points.  When n is too small for an
 52 |    order, that vector is returned as all zeros instead of the NaN that a
 53 |    division by zero would produce.  Nothing is written if n <= 0.
 54 | */
 55 | 
 56 | void legendre_3 ( int n , double *c1 , double *c2 , double *c3 )
 57 | {
 58 |    int i ;
 59 |    double proj ;
 60 | 
 61 |    for (i=0 ; i<n ; i++)      // Default for any order that n cannot support
 62 |       c1[i] = c2[i] = c3[i] = 0.0 ;
 63 | 
 64 |    if (n < 2)                 // A slope needs at least two points
 65 |       return ;
 66 | 
 67 |    for (i=0 ; i<n ; i++)      // c1 is a ramp from -1 to 1, then normalized
 68 |       c1[i] = 2.0 * i / (n - 1.0) - 1.0 ;
 69 |    leg3_unit_length ( n , c1 ) ;
 70 | 
 71 |    if (n < 3)                 // A quadratic needs at least three points
 72 |       return ;
 73 | 
 74 |    for (i=0 ; i<n ; i++)      // c2 is c1 squared, then centered and normalized
 75 |       c2[i] = c1[i] * c1[i] ;
 76 |    leg3_center ( n , c2 ) ;
 77 |    leg3_unit_length ( n , c2 ) ;
 78 | 
 79 |    if (n < 4)                 // A cubic needs at least four points
 80 |       return ;
 81 | 
 82 |    for (i=0 ; i<n ; i++)      // c3 is c1 cubed, then centered and normalized
 83 |       c3[i] = c1[i] * c1[i] * c1[i] ;
 84 |    leg3_center ( n , c3 ) ;
 85 |    leg3_unit_length ( n , c3 ) ;
 86 | 
 87 |    proj = 0.0 ;               // Remove the projection of c3 on c1, then renormalize
 88 |    for (i=0 ; i<n ; i++)
 89 |       proj += c1[i] * c3[i] ;
 90 |    for (i=0 ; i<n ; i++)
 91 |       c3[i] -= proj * c1[i] ;
 92 |    leg3_unit_length ( n , c3 ) ;
 93 | }
 94 | 
 95 | 
 96 | 
 97 | 
 98 | /*
 99 | --------------------------------------------------------------------------------
100 | 
101 |    Local routine computes FFT
102 | 
103 |    There may be an even or odd number of cases,
104 |    and we may or may not be centering the data.
105 | 
106 |    After the transform:
107 | 
108 |       If n is even
109 |          R[0] = sum
110 |          I[0] = 0
111 |          R[n/2] = alternating sum
112 |          I[n/2] = 0
113 |          R[i] = R[n-i]
114 |          I[i] = -I[n-i]
115 |          We have n/2+1 real and n/2-1 imaginary unique values
116 |          and n/2+1 complex numbers with two zero parts
117 | 
118 |       If n is odd
119 |          R[0] = sum
120 |          I[0] = 0
121 |          R[i] = R[n-i]
122 |          I[i] = -I[n-i]
123 |          R[n/2] and I[n/2] are valid almost-Nyquist data
124 |          We have n/2+1 real and n/2 imaginary unique values
125 |          and n/2+1 complex numbers with one zero part
126 | 
127 |       But if we center, the sum is zero, so R[0] = I[0] = 0
128 |          
129 | --------------------------------------------------------------------------------
130 | */
131 | 
132 | void do_fft ( int n , int center , double *in , double *out , double *work , FFT *fft )
133 | {
134 |    int j, half ;
135 |    double *re, *im, *optr, w, w_total, wx_total, ww_total, offset, scale ;
136 | 
137 |    re = work ;          // Real parts use work[0..n-1]
138 |    im = work + n ;      // Imaginary parts use work[n..2n-1]
139 |    half = n / 2 ;
140 | 
141 | /*
142 |    Copy the series into the real work vector (imaginary part zero) and
143 |    accumulate the Welch window sums.  The full-length complex FFT is used;
144 |    the half-length method saves little and complicates odd-length series.
145 | */
146 | 
147 |    w_total = wx_total = ww_total = 0.0 ;
148 |    for (j=0 ; j<n ; j++) {
149 |       re[j] = in[j] ;
150 |       im[j] = 0.0 ;
151 |       w = (j - 0.5 * (n-1)) / (0.5 * (n+1)) ;
152 |       w = 1.0 - w * w ;                        // Welch data window
153 |       w_total += w ;
154 |       wx_total += w * re[j] ;
155 |       ww_total += w * w ;
156 |       }
157 | 
158 |    offset = center  ?  wx_total / w_total  :  0.0 ;   // Weighted mean, or zero if not centering
159 |    scale = 1.0 / sqrt ( n * ww_total ) ;              // Compensate for reduced power
160 | 
161 | /*
162 |    Center (if requested), apply the scaled window, and transform
163 | */
164 | 
165 |    for (j=0 ; j<n ; j++) {
166 |       w = (j - 0.5 * (n-1)) / (0.5 * (n+1)) ;
167 |       w = (1.0 - w * w) * scale ;              // Welch window with power compensation
168 |       re[j] = w * (re[j] - offset) ;           // Window after centering
169 |       }
170 | 
171 |    fft->cpx ( re , im , 1 ) ;                  // Transform to frequency domain
172 | 
173 | /*
174 |    Pack the output: R[0] only if not centering, then (R,I) pairs for
175 |    1..n/2-1, then R[n/2], followed by I[n/2] only when n is odd.
176 | */
177 | 
178 |    optr = out ;
179 |    if (! center)
180 |       *optr++ = re[0] ;
181 | 
182 |    for (j=1 ; j<half ; j++) {
183 |       *optr++ = re[j] ;
184 |       *optr++ = im[j] ;
185 |       }
186 |    *optr++ = re[half] ;
187 |    if (n % 2)
188 |       *optr++ = im[half] ;
189 | }
190 | 
191 | 
192 | 
193 | /*
194 | --------------------------------------------------------------------------------------
195 | 
196 |    Do the Morlet transform
197 | 
198 | --------------------------------------------------------------------------------------
199 | */
200 | 
201 | static void compute_morlet (
202 |    FFT *fft ,        // Does the FFT
203 |    int period ,      // Period (1 / center frequency) of desired filter
204 |    int width ,       // Width on each side of center
205 |    int lag ,         // Lag back from current for center of filter; ideally equals width
206 |    int lookback ,    // Number of samples in input buffer
207 |    int n ,           // Lookback plus padding, bumped up to nearest power of two
208 |    double *buffer ,  // Input data
209 |    double *realval , // Real value returned here
210 |    double *imagval , // Imaginary value returned here
211 |    double *xr ,      // Work vector n long
212 |    double *xi ,      // Ditto
213 |    double *yr ,      // Ditto
214 |    double *yi )      // Ditto
215 | {
216 |    int i, nyquist ;
217 |    double mean, freq, fwidth, multiplier, f, wt ;
218 | 
219 |    nyquist = n / 2 ;   // The transform and function are symmetric around this index
220 |    freq = 1.0 / period ;
221 |    fwidth = 0.8 / width ;
222 | 
223 | 
224 | /*
225 |    Copy the data from the user's series to a local work area, and pad with mean as needed.
226 |    Reverse the time order for slight simplification:
227 |    Lag will be from start of series, and padding is at end.
228 | */
229 | 
230 |    mean = 0.0 ;
231 |    for (i=0 ; i<lookback ; i++) {
232 |       xr[i] = buffer[lookback-1-i] ;
233 |       xi[i] = 0.0 ;
234 |       mean += xr[i] ;
235 |       }
236 | 
237 |    mean /= lookback ;
238 | 
239 |    while (i<n) {
240 |       xr[i] = mean ;
241 |       xi[i++] = 0.0 ;
242 |       }
243 | 
244 | 
245 | /*
246 | -------------------------------------------------------------
247 |    Do the forward transform and multiply by REAL Morlet coefs
248 |    We need 'multiplier' to normalize the magnitude.
249 | -------------------------------------------------------------
250 | */
251 | 
252 |       fft->cpx ( xr , xi , 1 ) ;  // Transform to frequency domain
253 |       multiplier = 1.0 / (morlet_coefs ( freq , freq , fwidth , 1 ) + 1.e-140 ) ;
254 | 
255 |       for (i=1 ; i<nyquist ; i++) {     // Do just symmetric part
256 |          f = (double) i / (double) n ;  // This frequency
257 |          wt = multiplier * morlet_coefs ( f , freq , fwidth , 1 ) ;
258 |          yr[i] = xr[i] * wt ;
259 |          yi[i] = xi[i] * wt ;
260 |          yr[n-i] = xr[n-i] * wt ;
261 |          yi[n-i] = xi[n-i] * wt ;
262 |          } // For all unique frequencies strictly between zero and Nyquist
263 | 
264 |       // The Morlet coef at f=0 is zero, so set yr[0] and yi[0] to zero.
265 |       // Also, the imaginary Nyquist in yi[nyquist] is always zero by definition.
266 |       // If this is a real transform, we need to weight the real Nyquist in
267 |       // yr[nyquist].  But if imaginary, the function is antisymmetric and
268 |       // crosses here.
269 | 
270 |       yr[0] = yi[0] = yi[nyquist] = 0.0 ;  // Always true
271 |       wt = multiplier * morlet_coefs ( 0.5 , freq , fwidth , 1 ) ;
272 |       yr[nyquist] = xr[nyquist] * wt ;
273 | 
274 | /*
275 |    Transform back to the time domain and return lagged value.
276 | */
277 | 
278 |       fft->cpx ( yr , yi , -1 ) ;        // Back to time domain
279 |       *realval = yr[lag] / n ;
280 | 
281 | 
282 | /*
283 | ------------------------------------------------------------------
284 |    Do the forward transform and multiply by IMAGINARY Morlet coefs
285 |    We need 'multiplier' to normalize the magnitude.
286 | ------------------------------------------------------------------
287 | */
288 | 
289 |       multiplier = 1.0 / (morlet_coefs ( freq , freq , fwidth , 0 ) + 1.e-140 ) ;
290 | 
291 |       for (i=1 ; i<nyquist ; i++) {     // Do just symmetric part
292 |          f = (double) i / (double) n ;  // This frequency
293 |          wt = multiplier * morlet_coefs ( f , freq , fwidth , 0 ) ;
294 |          yr[i] = -xi[i] * wt ;
295 |          yi[i] = xr[i] * wt ;
296 |          yr[n-i] = xi[n-i] * wt ;
297 |          yi[n-i] = -xr[n-i] * wt ;
298 |          } // For all unique frequencies strictly between zero and Nyquist
299 | 
300 |       // The Morlet coef at f=0 is zero, so set yr[0] and yi[0] to zero.
301 |       // Also, the imaginary Nyquist in yi[nyquist] is always zero by definition.
302 |       // The imaginary function is antisymmetric and crosses here.
303 | 
304 |       yr[0] = yi[0] = yr[nyquist] = yi[nyquist] = 0.0 ;
305 | 
306 | /*
307 |    Transform back to the time domain and return lagged value.
308 | */
309 | 
310 |       fft->cpx ( yr , yi , -1 ) ;        // Back to time domain
311 |       *imagval = -yr[lag] / n ;
312 | } ;
313 | 


--------------------------------------------------------------------------------
/V2 Source/MRFFT.TXT:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /* MRFFT - This is the constructor, destructor, and external entry points     */
  4 | /*         for the FFT class, which implements a mixed-radix Fast Fourier     */
  5 | /*         Transform.  Two large subroutines are called from here.            */
  6 | /*         MRFFT_K has 'kernels' which does transforms for all prime kernels. */
  7 | /*         MRFFT_P contains 'permute' which does the final permutations.      */
  8 | /*                                                                            */
  9 | /* When the user constructs an FFT object, working storage is allocated.      */
 10 | /* If there is a problem, the public member variable 'ok' is set to zero.     */
 11 | /* Normally it is set to one.                                                 */
 12 | /*                                                                            */
 13 | /* The constructor call supplies all of the dimensions of the transform.      */
 14 | /* The member functions that perform the transform are:                       */
 15 | /*                                                                            */
 16 | /*   cpx ( double *real , double *imag , int isign ) - Compute the fully      */
 17 | /*         general multivariate complex Fourier transform.                    */
 18 | /*                                                                            */
 19 | /*   rv ( double *real , double *imag ) - Compute the forward transform       */
 20 | /*        (isign=1) of a real vector containing an even number of points.     */
 21 | /*                                                                            */
 22 | /*   irv ( double *real , double *imag ) - Compute the inverse transform      */
 23 | /*         (isign=-1) of the transform of a real vector.                      */
 24 | /*                                                                            */
 25 | /*                                                                            */
 26 | /* This algorithm is heavily inspired by Singleton's famous FORTRAN version.  */
 27 | /* The following changes have been made relative to the version of the        */
 28 | /* program on which this is based:                                            */
 29 | /*   1) There was a severe bug in the method for computing the maximum amount */
 30 | /*      of working storage required.  Some values of N cause it to be         */
 31 | /*      underestimated.  This has been fixed.                                 */
 32 | /*   2) Like most efficient implementations, that version uses a trig         */
 33 | /*      identity to compute successive angle functions across spans.          */
 34 | /*      However, he reverts to direct sine and cosine computation every       */
 35 | /*      32 passes to avoid buildup of error.  Unfortunately, the logic for    */
 36 | /*      doing that contains a flaw that results in an endless loop for a      */
 37 | /*      few rare values of N.  (The first appears to be 2*2*3*3*11*11.)       */
 38 | /*      Modern math coprocessors work at an internal precision that exceeds   */
 39 | /*      external precision.  Thus, error does not build up as much as it did  */
 40 | /*      in early mainframes.  The recomputation logic has been removed from   */
 41 | /*      the version here.                                                     */
 42 | /*   3) Modern processors rely heavily on pipelines.  Therefore, great        */
 43 | /*      effort was expended to reorder all loops so as to avoid breaking      */
 44 | /*      pipelines and to take advantage of branch prediction as much as       */
 45 | /*      possible.  Sometimes that has required repetition of some code.       */
 46 | /*      So be it.  It is worthwhile.                                          */
 47 | /*   4) The order of many floating-point operations has been revised to       */
 48 | /*      keep repeatedly used operands physically close in the code.           */
 49 | /*      This helps optimizing compilers, as well as facilitating use of       */
 50 | /*      the fpt stack in assembler versions.                                  */
 51 | /*   5) Many subscript operations have been changed to pointer references.    */
 52 | /*      Those that are kept as subscripts (mainly in the permutation code     */
 53 | /*      because the subscripts are intimately related to the algorithm)       */
 54 | /*      have been revised to reflect the 0-origin of C as opposed to the      */
 55 | /*      1-origin of FORTRAN.  This occasionally made it necessary to make     */
 56 | /*      significant changes to the algorithm relative to Singleton's version. */
 57 | /*                                                                            */
 58 | /******************************************************************************/
 59 | 
 60 | 
 61 | void kernels ( double *real , double *imag , int ntot , int npts , int nspan ,
 62 |                int isign , int n_facs , double *rwork , double *iwork ,
 63 |                double *cosines , double *sines , int *factors ) ;
 64 | 
 65 | void permute ( double *real , double *imag , int ntot , int npts ,
 66 |                int nspan , int inc , int n_facs , int n_sq_facs , double *work1 ,
 67 |                double *work2 , int *index , int *factors , int max_factor ) ;
 68 | 
 69 | /*
 70 | --------------------------------------------------------------------------------
 71 | 
 72 |    Constructor
 73 | 
 74 |    If there is insufficient memory, it leaves public ok=0.
 75 |    The user should check for this after allocating with new.
 76 | 
 77 | --------------------------------------------------------------------------------
 78 | */
 79 | 
 80 | FFT::FFT (
 81 |    int ndim ,       // Dimension of current variable, N for a vector
 82 |    int spacing ,    // Spacing of consecutive points, 1 for a vector
 83 |    int n_segments   // Number of ndim*spacing segments, 1 for a vector
 84 |    )
 85 | {
 86 |    int i, kernel, trial, trial_sq, max_permute ;
 87 | 
 88 |    rwork = NULL ;
 89 |    iwork = NULL ;
 90 |    ok = 1 ;  // In case early return due to parameters
 91 | 
 92 |    npts = ndim ;
 93 |    if (npts == 1)  // FFT of a single point is itself
 94 |       return ;
 95 | 
 96 |    nspan = ndim * spacing ;
 97 |    ntot = nspan * n_segments ;
 98 | 
 99 |    if (ntot == 0)
100 |       return ;  // error if any of these are zero
101 | 
102 | /*
103 |    Determine the factors of n
104 | */
105 | 
106 |    max_permute = 1 ; // Added 5/97 per LINT (Not a bug)
107 |    kernel = ndim ;   // Successively divide this as it is factored
108 |    n_facs = 0 ;      // Number of factors so far
109 | 
110 |    while (kernel % 16 == 0) {   // Factors of 4*4
111 |       all_factors[n_facs++] = 4 ;
112 |       kernel /= 16 ;
113 |       }
114 | 
115 |    trial = 3 ;
116 |    trial_sq = 9 ;
117 | 
118 |    while (trial_sq <= kernel) {     // Factors of 3*3, 5*5, 7*7, etc.
119 |       while (kernel % trial_sq == 0) {
120 |          all_factors[n_facs++] = trial ;
121 |          kernel /= trial_sq ;
122 |          }
123 |       trial += 2 ;
124 |       trial_sq = trial * trial ;
125 |       }
126 | 
127 |    if (kernel <= 4) {               // Is just this one last factor left?
128 |       n_sq_facs = n_facs ;
129 |       if (kernel != 1)
130 |          all_factors[n_facs++] = kernel ;
131 |       }
132 | 
133 |    else {                           // More factors remain
134 |       if (kernel % 4 == 0) {        // Factor of 2*2
135 |          all_factors[n_facs++] = 2 ;
136 |          kernel /= 4 ;
137 |          }
138 | 
139 | /*
140 |    All square factors are out.  Now do the rest.
141 | */
142 | 
143 |       n_sq_facs = n_facs ;          // Preserve number of square factors
144 | 
145 |       max_permute = n_sq_facs + n_sq_facs + 2 ;  // Length of work area
146 |       if (kernel-1 > max_permute)
147 |          max_permute = kernel-1 ;
148 | 
149 |       trial = 2 ;
150 |       while (trial <= kernel) {      // trial=2, 3, 5, 7, 9, ...
151 |          if (kernel % trial == 0) {
152 |             all_factors[n_facs++] = trial ;
153 |             kernel /= trial ;
154 |             }
155 |          trial = (trial == 2)  ?  3  :  trial+2 ;
156 |          }
157 |       }
158 | 
159 |    if (n_facs <= n_sq_facs+1)          // Length of work area
160 |       max_permute = n_facs + n_sq_facs + 1 ;
161 | 
162 | /*
163 |    The factoring is done.
164 |    The square factors are up front, followed by the rest.
165 |    Copy the square factors to the end in reverse order.
166 | */
167 | 
168 |    i = n_sq_facs ;
169 |    while (i)
170 |       all_factors[n_facs++] = all_factors[--i] ;
171 | 
172 | /*
173 |    We will need working storage of length equal to the largest factor.
174 |    Find that value.
175 | */
176 | 
177 |    max_factor = 0 ;
178 |    for (i=0 ; i<n_facs ; i++) {
179 |       if (all_factors[i] > max_factor)
180 |          max_factor = all_factors[i] ;
181 |       }
182 | 
183 | /*
184 |    Allocate work areas
185 | */
186 | 
187 |    rwork = (double *) malloc ( 4 * max_factor * sizeof(double) ) ;
188 |    iwork = (int *) malloc ( max_permute * sizeof(int) ) ;
189 |    if ((rwork == NULL)  ||  (iwork == NULL)) {
190 |       if (rwork != NULL)
191 |          free ( rwork ) ;
192 |       if (iwork != NULL)
193 |          free ( iwork ) ;
194 |       rwork = NULL ;
195 |       iwork = NULL ;
196 |       ok = 0 ;
197 |       return ;
198 |       }
199 | }
200 | 
201 | /*
202 | --------------------------------------------------------------------------------
203 | 
204 |    Destructor
205 | 
206 | --------------------------------------------------------------------------------
207 | */
208 | 
209 | FFT::~FFT ()
210 | {
211 |    if (! ok)
212 |       return ;
213 |    if (rwork != NULL)
214 |       free ( rwork ) ;
215 |    if (iwork != NULL)
216 |       free ( iwork ) ;
217 | }
218 | 
219 | /*
220 | --------------------------------------------------------------------------------
221 | 
222 |    Compute a full complex multivariate transform
223 | 
224 | --------------------------------------------------------------------------------
225 | */
226 | 
227 | void FFT::cpx ( double *real , double *imag , int isign ) // Complex array
228 | {
229 |    int i, factors[64] ;
230 | 
231 |    if (npts == 1)
232 |       return ;
233 | 
234 |    for (i=0 ; i<n_facs ; i++)
235 |       factors[i] = all_factors[i] ;
236 | 
237 |    kernels ( real , imag , ntot , npts , nspan , isign , n_facs ,
238 |              rwork , rwork+max_factor , rwork+2*max_factor ,
239 |              rwork+3*max_factor , factors ) ;
240 | 
241 |    permute ( real , imag , ntot , npts , nspan , abs(isign) , n_facs ,
242 |              n_sq_facs , rwork , rwork+max_factor , iwork , factors ,
243 |              max_factor ) ;
244 | }
245 | 
246 | /*
247 | --------------------------------------------------------------------------------
248 | 
249 |    Compute a forward transform (positive sign) of a real vector 2*N long
250 |    alternately arranged in the real and imaginary inputs.
251 | 
252 |    The constructor must have been called with ndim equal to half the length
253 |    of the real series.
254 | 
255 |    Note that the real part of the Nyquist point is returned in imag[0];
256 |    the true imaginary part of term zero is always zero, so that slot is free.
257 | 
258 | --------------------------------------------------------------------------------
259 | */
260 | 
261 | void FFT::rv (
262 |    double *real ,  // In: 0,2,4,... Out:Real parts
263 |    double *imag    // In: 1,3,5,... Out: Imaginary parts
264 |    )
265 | {
266 |    int i, j, lim ;
267 |    double theta, wr, wi, wkr, wki, t, h1r, h1i, h2r, h2i ;
268 | 
269 |    cpx ( real , imag , 1 ) ;
270 | 
271 | /*
272 |    Use the guaranteed zero imag[0] to actually return real[n]
273 | */
274 | 
275 |    t = real[0] ;
276 |    real[0] = t + imag[0] ;
277 |    imag[0] = t - imag[0] ;
278 | 
279 | /*
280 |    Now do the remainder through n-1
281 | */
282 | 
283 |    theta = PI / (double) npts ;
284 |    t = sin ( 0.5 * theta ) ;
285 |    wr = 1.0 + (wkr = -2.0 * t * t) ;
286 |    wi = wki = sin ( theta ) ;
287 | 
288 |    lim = (npts % 2)  ?  npts/2+1 : npts/2 ;
289 |    for (i=1 ; i<lim ; i++) {
290 |       j = npts - i ;
291 |       h1r =  0.5 * (real[i] + real[j]) ;
292 |       h1i =  0.5 * (imag[i] - imag[j]) ;
293 |       h2r =  0.5 * (imag[i] + imag[j]) ;
294 |       h2i = -0.5 * (real[i] - real[j]) ;
295 |       real[i] =  wr * h2r  -  wi * h2i  +  h1r ;
296 |       imag[i] =  wr * h2i  +  wi * h2r  +  h1i ;
297 |       real[j] = -wr * h2r  +  wi * h2i  +  h1r ;
298 |       imag[j] =  wr * h2i  +  wi * h2r  -  h1i ;
299 |       t = wr ;
300 |       wr += t * wkr  -  wi * wki ;
301 |       wi += t * wki  +  wi * wkr ;
302 |       }
303 | }
304 | 
305 | 
306 | /*
307 | --------------------------------------------------------------------------------
308 | 
309 |    Compute an inverse transform (negative sign) of an N+1 complex vector
310 |    in the real and imaginary inputs.  This routine assumes that this vector
311 |    is the Fourier transform of a real vector.  Thus, it is symmetric and
312 |    the imaginary part of its first and Nyquist terms are zero.  Therefore,
313 |    the user must place real[N] in imag[0] before calling this routine.
314 |    The transform is done in place using only N elements of each vector.
315 |    The output series is returned alternating in real[i] and imag[i], and
316 |    divided by N.
317 |    In other words, this routine exactly reverses the operation of rv.
318 | 
319 |    The constructor must have been called with ndim equal to half the length
320 |    of the real output series.
321 | 
322 | --------------------------------------------------------------------------------
323 | */
324 | 
325 | void FFT::irv (
326 |    double *real ,  // In: Real parts         Out: 0,2,4,...
327 |    double *imag    // In: Imaginary parts    Out: 1,3,5,...
328 |    )
329 | {
330 |    int i, j, lim ;
331 |    double theta, wr, wi, wkr, wki, t, h1r, h1i, h2r, h2i ;
332 | 
333 |    theta = -PI / (double) npts ;
334 |    t = sin ( 0.5 * theta ) ;
335 |    wr = 1.0 + (wkr = -2.0 * t * t) ;
336 |    wi = wki = sin ( theta ) ;
337 | 
338 |    lim = (npts % 2)  ?  npts/2+1 : npts/2 ;
339 |    for (i=1 ; i<lim ; i++) {
340 |       j = npts - i ;
341 |       h1r =  0.5 * (real[i] + real[j]) ;
342 |       h1i =  0.5 * (imag[i] - imag[j]) ;
343 |       h2r = -0.5 * (imag[i] + imag[j]) ;
344 |       h2i =  0.5 * (real[i] - real[j]) ;
345 |       real[i] =  wr * h2r  -  wi * h2i  +  h1r ;
346 |       imag[i] =  wr * h2i  +  wi * h2r  +  h1i ;
347 |       real[j] = -wr * h2r  +  wi * h2i  +  h1r ;
348 |       imag[j] =  wr * h2i  +  wi * h2r  -  h1i ;
349 |       t = wr ;
350 |       wr += t * wkr  -  wi * wki ;
351 |       wi += t * wki  +  wi * wkr ;
352 |       }
353 | 
354 |    t = real[0] ;
355 |    real[0] = 0.5 * (t + imag[0]) ;
356 |    imag[0] = 0.5 * (t - imag[0]) ;
357 | 
358 |    cpx ( real , imag , -1 ) ;
359 | 
360 |    t = 1.0 / npts ;
361 |    for (i=0 ; i<npts ; i++) {
362 |       real[i] *= t ;
363 |       imag[i] *= t ;
364 |       }
365 | }
366 | 
367 | 


--------------------------------------------------------------------------------
/V2 Source/MRFFT_K.TXT:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  MRFFT_K - This contains the 'kernels' routine called from MRFFT.          */
  4 | /*                                                                            */
  5 | /******************************************************************************/
  6 | 
  7 | 
  8 | #if ! defined ( PI )
  9 | #define PI 3.141592653589793
 10 | #endif
 11 | 
 12 | void kernels ( double *real , double *imag , int ntot , int npts , int nspan ,
 13 |                int isign , int n_facs , double *rwork , double *iwork ,
 14 |                double *cosines , double *sines , int *factors )
 15 | {
 16 |    int j, k, l, m, ibase, inner_span, full_span, facnum, itrig ;
 17 | #if defined (_WIN64)
 18 |    __int64 offset ;
 19 | #else
 20 |    int offset ;
 21 | #endif
 22 |    int last_point, previous_kernel, limit, kernel, inc, jump, tot_pts ;
 23 |    double c0, s0, c1, s1, c2, s2, c3, s3, r0, i0 ;
 24 |    double sin_third, sin_fifth, cos_fifth ;
 25 |    double angle, rtemp, itemp, rtemp2, itemp2, temp ;
 26 |    double real_sum, real_diff, real_sum2, real_diff2 ;
 27 |    double imag_sum, imag_diff, imag_sum2, imag_diff2 ;
 28 |    double *rptr0, *iptr0, *rptr1, *iptr1, *rptr2, *iptr2, *rptr3, *iptr3 ;
 29 |    double *rptr4, *iptr4, *flagptr, *flagptr2, *endptr, *finalptr ;
 30 | 
 31 |    inc = abs ( isign ) ;
 32 |    tot_pts = inc * ntot ;
 33 |    last_point = tot_pts - inc ;
 34 |    inner_span = inc * nspan ;
 35 |    jump = inner_span / npts ;
 36 | 
 37 | /*
 38 |    Compute some trig values that may be used later.
 39 |    These are for a third and a fifth of a circle,
 40 |    for the kernels 3 and 5, respectively.
 41 | */
 42 | 
 43 |    sin_third = sin ( 2.0 * PI / 3.0 ) ;
 44 |    sin_fifth = sin ( 2.0 * PI / 5.0 ) ;
 45 |    cos_fifth = cos ( 2.0 * PI / 5.0 ) ;
 46 | 
 47 |    if (isign < 0) {
 48 |       sin_third = -sin_third ;
 49 |       sin_fifth = -sin_fifth ;
 50 |       }
 51 | 
 52 | /*
 53 |    Main Fourier loop
 54 | */
 55 | 
 56 |    facnum = 0 ;            // Indexes the factors
 57 |    previous_kernel = 0 ;   // Only recompute trig if kernel changes
 58 | 
 59 | kernel_loop:
 60 |    angle = 2.0 * PI * jump / (double) inner_span ;
 61 |    if (isign < 0)
 62 |       angle = -angle ;
 63 |    temp = sin ( 0.5 * angle ) ;
 64 |    c0 = 2.0 * temp * temp ;
 65 |    s0 = sin ( angle ) ;
 66 |    ibase = 0 ;
 67 |    kernel = factors[facnum++] ;
 68 | 
 69 | /*
 70 |    Kernel of 2
 71 | */
 72 | 
 73 |    if (kernel == 2) {
 74 |       inner_span /= 2 ;
 75 | 
 76 |       rptr0 = real ;
 77 |       iptr0 = imag ;
 78 |       flagptr = real + last_point ;
 79 |       endptr = real + jump ;
 80 |       for (;;) {
 81 |          rptr1 = rptr0 + inner_span ;
 82 |          temp = *rptr1 ;
 83 |          *rptr1 = *rptr0 - temp ;
 84 |          *rptr0 += temp ;
 85 |          iptr1 = iptr0 + inner_span ;
 86 |          temp = *iptr1 ;
 87 |          *iptr1 = *iptr0 - temp ;
 88 |          *iptr0 += temp ;
 89 |          rptr0 = rptr1 + inner_span ;
 90 |          iptr0 = iptr1 + inner_span ;
 91 |          if (rptr0 < flagptr)
 92 |             continue ;
 93 |          rptr0 -= last_point ;
 94 |          iptr0 -= last_point ;
 95 |          if (rptr0 >= endptr)
 96 |             break ;
 97 |          }
 98 | 
 99 |       if (rptr0 - real >= inner_span)
100 |          return ;
101 | 
102 |       k = inner_span + 2 ;
103 |       for (;;) {
104 |          c1 = 1.0 - c0 ;
105 |          s1 = s0 ;
106 |          limit = k / 2 ;
107 | 
108 |          flagptr = real + tot_pts - 1 ;
109 |          endptr = real + limit - jump ;
110 |          for (;;) {
111 |             rptr1 = rptr0 + inner_span ;
112 |             rtemp = *rptr0 - *rptr1 ;
113 |             *rptr0 += *rptr1 ;
114 |             iptr1 = iptr0 + inner_span ;
115 |             itemp = *iptr0 - *iptr1 ;
116 |             *iptr0 += *iptr1 ;
117 |             *rptr1 = c1 * rtemp - s1 * itemp ;
118 |             *iptr1 = s1 * rtemp + c1 * itemp ;
119 |             rptr0 += 2 * inner_span ;
120 |             iptr0 += 2 * inner_span ;
121 |             if (rptr0 < flagptr)
122 |                continue ;
123 |             offset = rptr0 - real ;
124 |             offset = k + tot_pts - 2 * offset - 2 ;
125 |             rptr0 += offset ;
126 |             iptr0 += offset ;
127 |             c1 = -c1 ;
128 |             if (tot_pts + offset  >  0)
129 |                continue ;
130 |             if (rptr0 >= endptr)
131 |                break ;
132 |             rptr0 += jump ;
133 |             iptr0 += jump ;
134 |             temp = c1 - (c0 * c1 + s0 * s1) ;
135 |             s1 = (s0 * c1 - c0 * s1) + s1 ;
136 |             c1 = temp ;
137 |             }
138 | 
139 |          k += inc * 2 ;
140 |          ibase = (k - inner_span) / 2 + jump - 1 ;
141 |          if (ibase >= 2 * jump)
142 |             break ;
143 |          rptr0 = real + ibase ;
144 |          iptr0 = imag + ibase ;
145 |          }
146 |       } // Kernel 2
147 | 
148 | /*
149 |    Kernel of 4
150 | */
151 | 
152 |    else if (kernel == 4) {
153 |       inner_span /= 4 ;
154 | 
155 |       rptr0 = real ;
156 |       iptr0 = imag ;
157 |       finalptr = real + jump + inner_span - inc ;
158 | 
159 |       for (;;) {
160 |          c1 = c2 = c3 = 1.0 ;
161 |          s1 = s2 = s3 = 0.0 ;
162 | 
163 |          flagptr = real + tot_pts ;
164 |          endptr = real + inner_span ;
165 | 
166 |          for (;;) {
167 |             rptr1 = rptr0 + inner_span ;
168 |             rptr2 = rptr1 + inner_span ;
169 |             rptr3 = rptr2 + inner_span ;
170 |             real_sum = *rptr0 + *rptr2 ;
171 |             real_diff = *rptr0 - *rptr2 ;
172 |             real_sum2 = *rptr1 + *rptr3 ;
173 |             real_diff2 = *rptr1 - *rptr3 ;
174 |             *rptr0 = real_sum + real_sum2 ;
175 |             real_sum2 = real_sum - real_sum2 ;
176 |             iptr1 = iptr0 + inner_span ;
177 |             iptr2 = iptr1 + inner_span ;
178 |             iptr3 = iptr2 + inner_span ;
179 |             imag_sum = *iptr0 + *iptr2 ;
180 |             imag_diff = *iptr0 - *iptr2 ;
181 |             imag_sum2 = *iptr1 + *iptr3 ;
182 |             imag_diff2 = *iptr1 - *iptr3 ;
183 |             *iptr0 = imag_sum + imag_sum2 ;
184 |             imag_sum2 = imag_sum - imag_sum2 ;
185 |             if (isign < 0) {
186 |                real_sum = real_diff + imag_diff2 ;
187 |                real_diff -= imag_diff2 ;
188 |                imag_sum = imag_diff - real_diff2 ;
189 |                imag_diff += real_diff2 ;
190 |                }
191 |             else {
192 |                real_sum = real_diff - imag_diff2 ;
193 |                real_diff += imag_diff2 ;
194 |                imag_sum = imag_diff + real_diff2 ;
195 |                imag_diff -= real_diff2 ;
196 |                }
197 | 
198 |             *rptr1 = real_sum * c1 - imag_sum * s1 ;
199 |             *rptr2 = real_sum2 * c2 - imag_sum2 * s2 ;
200 |             *rptr3 = real_diff * c3 - imag_diff * s3 ;
201 |             *iptr1 = real_sum * s1 + imag_sum * c1 ;
202 |             *iptr2 = real_sum2 * s2 + imag_sum2 * c2 ;
203 |             *iptr3 = real_diff * s3 + imag_diff * c3 ;
204 | 
205 |             rptr0 += 4 * inner_span ;
206 |             iptr0 += 4 * inner_span ;
207 |             if (rptr0 < flagptr)
208 |                continue ;
209 | 
210 |             rptr0 += jump - tot_pts ;
211 |             iptr0 += jump - tot_pts ;
212 |             if (rptr0 >= endptr)
213 |                break ;
214 | 
215 |             temp = c1 - (c0 * c1 + s0 * s1) ;
216 |             s1 = (s0 * c1 - c0 * s1) + s1 ;
217 |             c1 = temp ;
218 |             c2 = c1 * c1 - s1 * s1 ;
219 |             s2 = 2.0 * c1 * s1 ;
220 |             c3 = c2 * c1 - s2 * s1 ;
221 |             s3 = c2 * s1 + s2 * c1 ;
222 |             }
223 | 
224 |          if (rptr0 >= finalptr)
225 |             break ;
226 |          rptr0 += inc - inner_span ;
227 |          iptr0 += inc - inner_span ;
228 |          }
229 | 
230 |       if (inner_span == jump)
231 |          return ;
232 |       }
233 | 
234 | /*
235 |    All odd kernels
236 | */
237 | 
238 |    else {
239 |       full_span = inner_span ;
240 |       inner_span /= kernel ;
241 | 
242 | /*
243 |    Kernel 3
244 | */
245 | 
246 |       if (kernel == 3) {
247 |          rptr0 = real ;
248 |          iptr0 = imag ;
249 |          flagptr = real + last_point - 1 ;
250 |          endptr = flagptr + inner_span + 1 ;
251 |          for (;;) {
252 |             rptr1 = rptr0 + inner_span ;
253 |             rptr2 = rptr1 + inner_span ;
254 |             rtemp = *rptr0 ;
255 |             real_sum = *rptr1 + *rptr2 ;
256 |             *rptr0 = rtemp + real_sum ;
257 |             rtemp -= 0.5 * real_sum ;
258 |             rtemp2 = (*rptr1 - *rptr2) * sin_third ;
259 |             iptr1 = iptr0 + inner_span ;
260 |             iptr2 = iptr1 + inner_span ;
261 |             itemp = *iptr0 ;
262 |             imag_sum = *iptr1 + *iptr2 ;
263 |             *iptr0 = itemp + imag_sum ;
264 |             itemp -= 0.5 * imag_sum ;
265 |             itemp2 = (*iptr1 - *iptr2) * sin_third ;
266 |             *rptr1 = rtemp - itemp2 ;
267 |             *rptr2 = rtemp + itemp2 ;
268 |             *iptr1 = itemp + rtemp2 ;
269 |             *iptr2 = itemp - rtemp2 ;
270 |             rptr0 += 3 * inner_span ;
271 |             iptr0 += 3 * inner_span ;
272 |             if (rptr0 < flagptr)
273 |                continue ;
274 |             if (rptr0 >= endptr)
275 |                break ;
276 |             rptr0 -= last_point ;
277 |             iptr0 -= last_point ;
278 |             }
279 |          } // If kernel == 3
280 | 
281 | /*
282 |    Kernel 5
283 | */
284 | 
285 |       else if (kernel == 5) {
286 |          c2 = cos_fifth * cos_fifth - sin_fifth * sin_fifth ;
287 |          s2 = 2.0 * cos_fifth * sin_fifth ;
288 |          rptr0 = real ;
289 |          iptr0 = imag ;
290 |          flagptr = real + last_point - 1 ;
291 |          endptr = flagptr + inner_span + 1 ;
292 |          for (;;) {
293 |             rptr1 = rptr0 + inner_span ;
294 |             rptr2 = rptr1 + inner_span ;
295 |             rptr3 = rptr2 + inner_span ;
296 |             rptr4 = rptr3 + inner_span ;
297 |             real_sum = *rptr1 + *rptr4 ;
298 |             real_diff = *rptr1 - *rptr4 ;
299 |             real_sum2 = *rptr2 + *rptr3 ;
300 |             real_diff2 = *rptr2 - *rptr3 ;
301 |             r0 = *rptr0 ;
302 |             *rptr0 = r0 + real_sum + real_sum2 ;
303 |             rtemp = real_sum * cos_fifth + real_sum2 * c2 + r0 ;
304 |             iptr1 = iptr0 + inner_span ;
305 |             iptr2 = iptr1 + inner_span ;
306 |             iptr3 = iptr2 + inner_span ;
307 |             iptr4 = iptr3 + inner_span ;
308 |             imag_sum = *iptr1 + *iptr4 ;
309 |             imag_diff = *iptr1 - *iptr4 ;
310 |             imag_sum2 = *iptr2 + *iptr3 ;
311 |             imag_diff2 = *iptr2 - *iptr3 ;
312 |             i0 = *iptr0 ;
313 |             *iptr0 = i0 + imag_sum + imag_sum2 ;
314 |             itemp = imag_sum * cos_fifth + imag_sum2 * c2 + i0 ;
315 |             rtemp2 = real_diff * sin_fifth + real_diff2 * s2 ;
316 |             itemp2 = imag_diff * sin_fifth + imag_diff2 * s2 ;
317 |             *rptr1 = rtemp - itemp2 ;
318 |             *rptr4 = rtemp + itemp2 ;
319 |             *iptr1 = itemp + rtemp2 ;
320 |             *iptr4 = itemp - rtemp2 ;
321 |             rtemp = real_sum * c2 + real_sum2 * cos_fifth + r0 ;
322 |             itemp = imag_sum * c2 + imag_sum2 * cos_fifth + i0 ;
323 |             rtemp2 = real_diff * s2 - real_diff2 * sin_fifth ;
324 |             itemp2 = imag_diff * s2 - imag_diff2 * sin_fifth ;
325 |             *rptr2 = rtemp - itemp2 ;
326 |             *rptr3 = rtemp + itemp2 ;
327 |             *iptr2 = itemp + rtemp2 ;
328 |             *iptr3 = itemp - rtemp2 ;
329 |             rptr0 += 5 * inner_span ;
330 |             iptr0 += 5 * inner_span ;
331 |             if (rptr0 < flagptr)
332 |                continue ;
333 |             if (rptr0 >= endptr)
334 |                break ;
335 |             rptr0 -= last_point ;
336 |             iptr0 -= last_point ;
337 |             }
338 |          }
339 | 
340 | /*
341 |    Arbitrary odd kernel
342 | */
343 | 
344 |       else {
345 |          if (kernel != previous_kernel) {  // If different from last time, compute trig
346 |             previous_kernel = kernel ;
347 |             angle = 2.0 * PI / kernel ;
348 |             if (isign < 0)
349 |                angle = -angle ;
350 |             c1 = cos ( angle ) ;
351 |             s1 = sin ( angle ) ;
352 |             cosines[kernel-1] = 1.0 ;
353 |             sines[kernel-1] = 0.0 ;
354 |             rptr0 = cosines ;
355 |             rptr1 = cosines + kernel - 1 ;
356 |             iptr0 = sines ;
357 |             iptr1 = sines + kernel - 1 ;
358 |             for (;;) {
359 |                *rptr0 = *rptr1 * c1 + *iptr1 * s1 ;
360 |                *iptr0 = *rptr1 * s1 - *iptr1 * c1 ;
361 |                *(--rptr1) = *(rptr0++) ;
362 |                *(--iptr1) = -*(iptr0++) ;
363 |                if (rptr0 >= rptr1)
364 |                   break ;
365 |                }
366 |             } // Compute trig if changed from last kernel
367 | 
368 |          rptr0 = real ;
369 |          iptr0 = imag ;
370 |          flagptr = real + last_point ;
371 |          endptr = flagptr + inner_span ;
372 |          for (;;) {    // Trig is ready.  Do the odd kernel.
373 |             rptr1 = rptr0 + inner_span ;
374 |             rptr2 = rptr0 + full_span ;
375 |             rtemp = r0 = *rptr0 ;
376 |             iptr1 = iptr0 + inner_span ;
377 |             iptr2 = iptr0 + full_span ;
378 |             itemp = i0 = *iptr0 ;
379 |             rptr3 = rwork ;
380 |             iptr3 = iwork ;
381 |             for (;;) {
382 |                rptr2 -= inner_span ;
383 |                *rptr3 = *rptr1 + *rptr2 ;
384 |                rtemp += *(rptr3++) ;
385 |                *(rptr3++) = *rptr1 - *rptr2 ;
386 |                rptr1 += inner_span ;
387 |                iptr2 -= inner_span ;
388 |                *iptr3 = *iptr1 + *iptr2 ;
389 |                itemp += *(iptr3++) ;
390 |                *(iptr3++) = *iptr1 - *iptr2 ;
391 |                iptr1 += inner_span ;
392 |                if (rptr1 >= rptr2)
393 |                   break ;
394 |                }
395 |             *rptr0 = rtemp ;
396 |             *iptr0 = itemp ;
397 |             rptr1 = rptr0 + inner_span ;
398 |             rptr2 = rptr0 + full_span ;
399 |             iptr1 = iptr0 + inner_span ;
400 |             iptr2 = iptr0 + full_span ;
401 |             j = 1 ;
402 |             for (;;) {
403 |                rtemp = r0 ;
404 |                itemp = i0 ;
405 |                rtemp2 = itemp2 = 0.0 ;
406 |                rptr3 = rwork ;
407 |                iptr3 = iwork ;
408 |                finalptr = rwork + kernel - 1 ;
409 |                itrig = j - 1 ;
410 |                for (;;) {
411 |                   rtemp += *(rptr3++) * cosines[itrig] ;
412 |                   rtemp2 += *(rptr3++) * sines[itrig] ;
413 |                   itemp += *(iptr3++) * cosines[itrig] ;
414 |                   itemp2 += *(iptr3++) * sines[itrig] ;
415 |                   if (rptr3 >= finalptr)
416 |                      break ;
417 |                   itrig = (itrig + j) % kernel ;
418 |                   }
419 |                rptr2 -= inner_span ;
420 |                iptr2 -= inner_span ;
421 |                *rptr1 = rtemp - itemp2 ;
422 |                *rptr2 = rtemp + itemp2 ;
423 |                *iptr1 = itemp + rtemp2 ;
424 |                *iptr2 = itemp - rtemp2 ;
425 |                if (2 * (++j) > kernel)
426 |                   break ;
427 |                rptr1 += inner_span ;
428 |                iptr1 += inner_span ;
429 |                }
430 |             rptr0 += full_span ;
431 |             iptr0 += full_span ;
432 |             if (rptr0 < flagptr)
433 |                continue ;
434 |             if (rptr0 >= endptr)
435 |                break ;
436 |             rptr0 -= last_point ;
437 |             iptr0 -= last_point ;
438 |             }
439 |          } // Else arbitrary odd kernel
440 | 
441 |       if (facnum == n_facs)
442 |          return ;
443 | 
444 | /*
445 |    Multiply by rotation factor
446 | */
447 | 
448 |       rptr0 = real + jump ;
449 |       iptr0 = imag + jump ;
450 |       j = inner_span - tot_pts ;
451 |       k = full_span + tot_pts - jump ;
452 |       l = inner_span - k ;
453 |       m = jump + inc - k ;
454 |       flagptr = real + tot_pts ;
455 |       flagptr2 = real + k ;
456 |       endptr = real + full_span - j ;
457 |       finalptr = real + full_span + tot_pts - inc ;
458 | 
459 |       for (;;) {
460 |          c2 = 1.0 - c0 ;
461 |          s1 = s0 ;
462 | 
463 |          for (;;) {
464 |             c1 = c2 ;
465 |             s2 = s1 ;
466 |             rptr0 += inner_span ;
467 |             iptr0 += inner_span ;
468 |             for (;;) {
469 |                rtemp = *rptr0 ;
470 |                *rptr0 = c2 * rtemp - s2 * *iptr0 ;
471 |                *iptr0 = s2 * rtemp + c2 * *iptr0 ;
472 |                rptr0 += full_span ;
473 |                iptr0 += full_span ;
474 |                if (rptr0 < flagptr)
475 |                   continue ;
476 |                rtemp = s1 * s2 ;
477 |                s2 = s1 * c2 + c1 * s2 ;
478 |                c2 = c1 * c2 - rtemp ;
479 |                if (rptr0 >= endptr)
480 |                   break ;
481 |                rptr0 += j ;
482 |                iptr0 += j ;
483 |                }
484 |             if (rptr0 >= flagptr2)
485 |                break ;
486 |             rptr0 += l ;
487 |             iptr0 += l ;
488 |             c2 = c1 - (c0 * c1 + s0 * s1) ;
489 |             s1 += (s0 * c1 - c0 * s1) ;
490 |             }
491 | 
492 |          if (rptr0 >= finalptr)
493 |             break ;
494 |          rptr0 += m ;
495 |          iptr0 += m ;
496 |          }
497 |       } // All odd factors
498 | 
499 |    goto kernel_loop ;
500 | }
501 | 


--------------------------------------------------------------------------------
/V1 Source/GENERATIVE.CPP:
--------------------------------------------------------------------------------
  1 | /************************************************************/
  2 | /*                                                          */
  3 | /*  GENERATIVE - Display generative samples                 */
  4 | /*                                                          */
  5 | /*  Computation fragment only; no display                   */
  6 | /*                                                          */
  7 | /************************************************************/
  8 | 
  9 | 
 10 | class GenerativeChild {
 11 | 
 12 | public:
 13 |    GenerativeChild ( int first_case , int nrows , int ncols , int nchain ) ;
 14 |    ~GenerativeChild () ;
 15 | 
 16 |    int ok ;
 17 |    int first_case ;
 18 |    int nrows ;
 19 |    int ncols ;
 20 |    int nchain ;
 21 |    DIBimage *dib ;       /* The image is here */
 22 | } ;
 23 | 
 24 | 
 25 | /*
 26 | --------------------------------------------------------------------------------
 27 | 
 28 |    Workhorse routine that computes a single generative sample
 29 | 
 30 | --------------------------------------------------------------------------------
 31 | */
 32 | 
 33 | static void gen_threaded (
 34 |    int nvis ,                // Number of inputs to the first (bottom) layer
 35 |    int max_neurons ,         // Maximum number of neurons in any layer, as well as nvis
 36 |    int n_unsup ,             // Number of unsupervised layers
 37 |    int *nhid_unsup ,         // N_unsup vector containing the number of hidden neurons in each layer
 38 |    double **weights_unsup ,  // N_unsup pointers to weight matrices, each being nhid sets of nvis weights
 39 |    double *in_bias ,         // Input bias vectors; n_unsup sets of max_neurons each
 40 |    double *hid_bias ,        // Hidden bias vectors; n_unsup sets of max_neurons each
 41 |    int nchain ,              // Length of Gibbs chain, 0 to return raw data
 42 |    int input_vis ,           // Start with visible (as opposed to hidden)?
 43 |    double *workvec1 ,        // Work vector max_neurons long, also inputs starting case if input_vis
 44 |    double *workvec2 ,        // Work vector max_neurons long, also inputs starting hidden if ! input_vis
 45 |    unsigned char *image      // Computed image, 0-255 returned here
 46 |    )
 47 | {
 48 |    int i, k, ichain, ivis, nin, ihid, nhid, i_layer, randnum ;
 49 |    double *vis_layer, *hid_layer, *w, *wptr, *ibptr, *hbptr, sum, Q, frand ;
 50 | 
 51 |    vis_layer = workvec1 ;
 52 |    hid_layer = workvec2 ;
 53 |       
 54 |    // Either a training set image is in workvec1 (input_vis),
 55 |    // or a hidden weight vector is in workvec2 (! input_vis).
 56 | 
 57 |    if (nchain == 0) {   // User wants original image?  This overrides input_vis.
 58 |       for (i=0 ; i<nvis ; i++)
 59 |          image[i] = (unsigned char) (255.9999 * vis_layer[i]) ; 
 60 |       return ;
 61 |       }
 62 | 
 63 |    if (input_vis) {
 64 | 
 65 |       randnum = 1 ;               // Get a somewhat random seed
 66 |       for (i=0 ; i<nvis ; i++) {
 67 |          if (vis_layer[i] > 0.5)
 68 |             ++randnum ;
 69 |          }
 70 |             
 71 |    // Propagate up until we reach the RBM
 72 | 
 73 |       nin = nvis ;
 74 |       for (i_layer=0 ; i_layer<n_unsup-1 ; i_layer++) {
 75 |          nhid = nhid_unsup[i_layer] ;
 76 |          w = weights_unsup[i_layer] ;
 77 |          hbptr = hid_bias + i_layer * max_neurons ;
 78 |           for (ihid=0 ; ihid<nhid ; ihid++) {
 79 |             wptr = w + ihid * nin ;          // Weight vector for this neuron
 80 |             sum = hbptr[ihid] ;              // This hidden neuron's bias
 81 |             for (ivis=0 ; ivis<nin ; ivis++)
 82 |                sum += wptr[ivis] * vis_layer[ivis] ;
 83 |             hid_layer[ihid] = 1.0 / (1.0 + exp(-sum)) ;
 84 |             }
 85 |          nin = nhid ;
 86 |          if (vis_layer == workvec1) {
 87 |             vis_layer = workvec2 ;
 88 |             hid_layer = workvec1 ;
 89 |             }
 90 |          else {
 91 |             vis_layer = workvec1 ;
 92 |             hid_layer = workvec2 ;
 93 |             }
 94 |          } // For i_layer, propagating up until the RBM
 95 |       } // If input_vis
 96 | 
 97 |    else { // Not input_vis, so user is inputting hidden layer of RBM
 98 |       randnum = 1 ;               // Get a somewhat random seed
 99 |       for (i=0 ; i<nhid_unsup[n_unsup-1] ; i++) {
100 |          if (hid_layer[i] > 0.5)
101 |             ++randnum ;
102 |          }
103 | 
104 |       if (n_unsup == 1)
105 |          nin = nvis ;
106 |       else
107 |          nin = nhid_unsup[n_unsup-2] ;
108 |       } // If not input_vis
109 | 
110 | 
111 |    // Gibbs chain in the RBM
112 | 
113 |    nhid = nhid_unsup[n_unsup-1] ;
114 |    w = weights_unsup[n_unsup-1] ;
115 |    hbptr = hid_bias + (n_unsup-1) * max_neurons ;
116 |    ibptr = in_bias + (n_unsup-1) * max_neurons ;
117 | 
118 |    for (ichain=0 ; ichain<nchain ; ichain++) {
119 | 
120 |       if (ichain  ||  input_vis) {           // Skip first visible-to-hidden if user inputs hidden
121 |          for (ihid=0 ; ihid<nhid ; ihid++) { // Visible to hidden, with sampling
122 |             wptr = w + ihid * nin ;          // Weight vector for this neuron
123 |             sum = hbptr[ihid] ;              // This hidden neuron's bias
124 |             for (ivis=0 ; ivis<nin ; ivis++)
125 |                sum += wptr[ivis] * vis_layer[ivis] ;
126 |             Q = 1.0 / (1.0 + exp(-sum)) ;
127 |             k = randnum / IQ ;
128 |             randnum = IA * (randnum - k * IQ) - IR * k ;
129 |             if (randnum < 0)
130 |                randnum += IM ;
131 |             frand = AM * randnum ;
132 |             hid_layer[ihid] = (frand < Q) ? 1.0 : 0.0 ;
133 |             }
134 |          }
135 |    
136 |       for (ivis=0 ; ivis<nin ; ivis++) {   // Hidden to visible, without sampling
137 |          sum = ibptr[ivis] ;
138 |          for (ihid=0 ; ihid<nhid ; ihid++)
139 |             sum += w[ihid*nin+ivis] * hid_layer[ihid] ;
140 |          vis_layer[ivis] = 1.0 / (1.0 + exp(-sum)) ;
141 |          }
142 | 
143 |       if (escape_key_pressed)
144 |          break ;
145 | 
146 |       } // For ichain
147 | 
148 |    // The Gibbs chain is complete.  Work back down to the input.
149 | 
150 |    for (i_layer=n_unsup-2 ; i_layer>=0 ; i_layer--) {
151 |       nhid = nin ;
152 |       assert ( nhid == nhid_unsup[i_layer] ) ;
153 |       if (i_layer == 0)
154 |          nin = nvis ;
155 |       else
156 |          nin = nhid_unsup[i_layer-1] ;
157 |       w = weights_unsup[i_layer] ;
158 |       ibptr = in_bias + i_layer * max_neurons ;
159 | 
160 |       if (vis_layer == workvec1) {
161 |          vis_layer = workvec2 ;
162 |          hid_layer = workvec1 ;
163 |          }
164 |       else {
165 |          vis_layer = workvec1 ;
166 |          hid_layer = workvec2 ;
167 |          }
168 | 
169 |       for (ivis=0 ; ivis<nin ; ivis++) {   // Hidden to visible, without sampling
170 |          sum = ibptr[ivis] ;
171 |          for (ihid=0 ; ihid<nhid ; ihid++)
172 |             sum += w[ihid*nin+ivis] * hid_layer[ihid] ;
173 |          vis_layer[ivis] = 1.0 / (1.0 + exp(-sum)) ;
174 |          }
175 |       } // For i_layer, propagating down until the data input
176 | 
177 |    for (i=0 ; i<nvis ; i++)
178 |       image[i] = (unsigned char) (255.9999 * vis_layer[i]) ; 
179 | }
180 | 
181 | 
182 | /*
183 | --------------------------------------------------------------------------------
184 | 
185 |    Thread stuff...
186 |       Structure for passing information to/from threaded code
187 |       Threaded code called by the main subroutine
188 | 
189 | --------------------------------------------------------------------------------
190 | */
191 | 
/*
   One of these per thread carries the complete argument list of gen_threaded().
   The fields appear in the same order as that routine's parameters.
*/

typedef struct {
   int nvis ;                // Number of inputs to the first (bottom) layer
   int max_neurons ;         // Maximum number of neurons in any layer; spacing of the bias vector sets
   int n_unsup ;             // Number of unsupervised layers in the model
   int *nhid_unsup ;         // N_unsup vector: number of hidden neurons in each layer
   double **weights_unsup ;  // N_unsup pointers to weight matrices
   double *in_bias ;         // Input bias vectors; n_unsup sets of max_neurons each
   double *hid_bias ;        // Hidden bias vectors; n_unsup sets of max_neurons each
   int nchain ;              // Length of Gibbs chain, 0 to return raw data
   int input_vis ;           // Nonzero to start from a visible case, zero to start from a hidden vector
   double *workvec1 ;        // This thread's work vector; holds the starting case if input_vis
   double *workvec2 ;        // This thread's work vector; holds the starting hidden layer if ! input_vis
   unsigned char *image ;    // Computed image, 0-255, is returned here
} RBM_GENER_PARAMS ;
206 | 
207 | static unsigned int __stdcall gen_wrapper ( LPVOID dp )
208 | {
209 |    gen_threaded (
210 |        ((RBM_GENER_PARAMS *) dp)->nvis ,
211 |        ((RBM_GENER_PARAMS *) dp)->max_neurons ,
212 |        ((RBM_GENER_PARAMS *) dp)->n_unsup ,
213 |        ((RBM_GENER_PARAMS *) dp)->nhid_unsup ,
214 |        ((RBM_GENER_PARAMS *) dp)->weights_unsup ,
215 |        ((RBM_GENER_PARAMS *) dp)->in_bias ,
216 |        ((RBM_GENER_PARAMS *) dp)->hid_bias ,
217 |        ((RBM_GENER_PARAMS *) dp)->nchain ,
218 |        ((RBM_GENER_PARAMS *) dp)->input_vis ,
219 |        ((RBM_GENER_PARAMS *) dp)->workvec1 ,
220 |        ((RBM_GENER_PARAMS *) dp)->workvec2 ,
221 |        ((RBM_GENER_PARAMS *) dp)->image ) ;
222 |    return 0 ;
223 | }
224 | 
225 | 
226 | /*
227 | --------------------------------------------------------------------------------
228 | 
229 |    Child members
230 | 
231 | --------------------------------------------------------------------------------
232 | */
233 | 
234 | GenerativeChild::GenerativeChild ( int c_first_case , int c_nrows , int c_ncols , int c_nchain  )
235 | {
236 |    int i, k, irow, icol, nr, nc, irnum, icnum, ir, ic, nvis, icase, ret_val, n_threads ;
237 |    int image_number, data_index, save_data_index, empty_slot ;
238 |    double *inptr, *workvec1, *workvec2 ;
239 |    char msg[256] ;
240 |    unsigned char *raw_image, *data, *dptr ;
241 |    RBM_GENER_PARAMS params[MAX_THREADS] ;
242 |    HANDLE threads[MAX_THREADS] ;
243 | 
244 |    first_case = c_first_case ;
245 |    nrows = c_nrows ;  // These refer to the grid of images displayed
246 |    ncols = c_ncols ;
247 |    nchain = c_nchain ;
248 | 
249 |    nvis = model->n_data_inputs ;
250 | 
251 | /*
252 |    Allocate memory
253 | */
254 | 
255 |    raw_image = NULL ;
256 |    data = NULL ;
257 |    dib = NULL ;
258 | 
259 |    nr = MNIST_rows * nrows + (nrows-1) * 3 + 2 * 2 ;
260 |    nc = MNIST_cols * ncols + (ncols-1) * 3 + 2 * 2 ;
261 | 
262 |    ok = 1 ;
263 | 
264 |    raw_image = (unsigned char *) MALLOC ( 3 * nr * nc ) ;
265 |    data = (unsigned char *) MALLOC ( nrows * ncols * nvis * sizeof(unsigned char) ) ;
266 |    workvec1 = (double *) MALLOC ( model->max_neurons * max_threads * sizeof(double) ) ;
267 |    workvec2 = (double *) MALLOC ( model->max_neurons * max_threads * sizeof(double) ) ;
268 | 
269 |    if (raw_image == NULL  ||  data == NULL  ||  workvec1 == NULL  ||  workvec2 == NULL) {
270 |       if (raw_image != NULL)
271 |          FREE ( raw_image ) ;
272 |       if (data != NULL)
273 |          FREE ( data ) ;
274 |       if (workvec1 != NULL)
275 |          FREE ( workvec1 ) ;
276 |       if (workvec2 != NULL)
277 |          FREE ( workvec2 ) ;
278 |       ok = 0 ;
279 |       audit ( "" ) ;
280 |       audit ( "ERROR... Insufficient memory to display generative samples" ) ;
281 |       return ;
282 |       }
283 | 
284 | /*
285 |    Initialize parameters that will not change for threads.
286 | */
287 | 
288 |    for (i=0 ; i<max_threads ; i++) {
289 |       params[i].nvis = model->n_data_inputs ;
290 |       params[i].max_neurons = model->max_neurons ;
291 |       params[i].n_unsup = model->n_unsup ;
292 |       params[i].nhid_unsup = model->nhid_unsup ;
293 |       params[i].weights_unsup = model->weights_unsup ;
294 |       params[i].in_bias = model->in_bias ;
295 |       params[i].hid_bias = model->hid_bias ;
296 |       params[i].nchain = nchain ;
297 |       params[i].input_vis = (first_case > 0) ;
298 |       params[i].workvec1 = workvec1 + i * model->max_neurons ;
299 |       params[i].workvec2 = workvec2 + i * model->max_neurons ;
300 |       }
301 | 
302 | /*
303 |    Compute the generated images
304 | */
305 | 
306 |    n_threads = 0 ;                    // Counts threads that are active
307 |    for (i=0 ; i<max_threads ; i++)
308 |       threads[i] = NULL ;
309 | 
310 |    image_number = 0 ; // Index of generated image (nrows*ncols of them)
311 |    empty_slot = -1 ;  // After full, will identify the thread that just completed
312 | 
313 |    for (;;) {         // Main thread loop processes all images
314 | 
315 | /*
316 |    Handle user ESCape
317 | */
318 | 
319 |       if (escape_key_pressed  ||  user_pressed_escape ()) {
320 |          audit ( "" ) ;
321 |          audit ( "WARNING: User pressed ESCape during generative sampling" ) ;
322 |          MEMTEXT ( "GENERATIVE.CPP: ESCape detected" ) ;
323 |          user_pressed_escape () ;
324 |          for (i=0, k=0 ; i<max_threads ; i++) {
325 |             if (threads[i] != NULL)
326 |                threads[k++] = threads[i] ;
327 |             }
328 |          ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
329 |          if (ret_val == WAIT_TIMEOUT)
330 |             audit ( "Timeout waiting for generative computation user ESCape" ) ;
331 |          sprintf ( msg, "GENERATIVE.CPP: User abort; n_threads=%d  k=%d  Wait retval=%d", n_threads, k, ret_val ) ;
332 |          MEMTEXT ( msg ) ;
333 |          for (i=0 ; i<n_threads ; i++)
334 |             CloseHandle ( threads[i] ) ;
335 |          ok = 0 ;
336 |          escape_key_pressed = 0 ;
337 |          return ;
338 |          }
339 | 
340 | /*
341 |    Start a new thread if we still have work to do
342 | */
343 | 
344 |       if (image_number < nrows*ncols) { // If there are still some to do
345 | 
346 |          if (empty_slot < 0)    // Negative while we are initially filling the queue
347 |             k = n_threads ;
348 |          else
349 |             k = empty_slot ;
350 | 
351 |          if (first_case > 0) {  // We start with a visible layer from training set
352 |             icase = (first_case + image_number - 1) % n_cases ;
353 |             inptr = database + icase * n_vars ;     // Point to this case in the database
354 |             for (i=0 ; i<nvis ; i++) {              // Put starting case in workvec1
355 |                if (TrainParams.binary_input)
356 |                   params[k].workvec1[i] = (inptr[model->inputs[i]] > model->in_mean[i]) ? 1.0 : 0.0 ;
357 |                else {
358 |                   params[k].workvec1[i] = (inptr[model->inputs[i]] - model->in_min[i]) / (model->in_max[i] - model->in_min[i]) ;
359 |                   assert ( workvec1[i] >= 0.0 ) ;
360 |                   assert ( workvec1[i] <= 1.0 ) ;
361 |                   }
362 |                }
363 |             }
364 | 
365 |          else {  // We start with a random top hidden layer (the RBM)
366 |             for (i=0 ; i<model->nhid_unsup[model->n_unsup-1] ; i++)
367 |                params[k].workvec2[i] = (unifrand_fast() >= 0.5)  ?  1.0 : 0.0 ;
368 |             }
369 | 
370 |          params[k].image = data + image_number * nvis ;
371 | 
372 |          threads[k] = (HANDLE) _beginthreadex ( NULL , 0 , gen_wrapper , &params[k] , 0 , NULL ) ;
373 |          if (threads[k] == NULL) {
374 |             audit ( "Internal ERROR: bad thread creation in GENERATIVE.CPP" ) ;
375 |             for (i=0 ; i<n_threads ; i++) {
376 |                if (threads[i] != NULL)
377 |                   CloseHandle ( threads[i] ) ;
378 |                }
379 |             ok = 0 ;
380 |             return ;
381 |             }
382 |          ++n_threads ;
383 |          ++image_number ;
384 |          } // if (image_number < nrows*ncols)
385 | 
386 |       if (n_threads == 0)  // Are we done?
387 |          break ;
388 | 
389 | /*
390 |    Handle full suite of threads running and more threads to add as soon as some are done.
391 |    Wait for just one thread to finish.
392 | */
393 | 
394 |       if (n_threads == max_threads  &&  image_number < nrows*ncols) {
395 |          ret_val = WaitForMultipleObjects ( n_threads , threads , FALSE , 1200000 ) ;
396 |          if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
397 |             sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 1 failed (%d) in GENERATIVE", ret_val ) ;
398 |             audit ( msg ) ;
399 |             MEMTEXT ( msg ) ;
400 |             if (ret_val == WAIT_TIMEOUT)
401 |                audit ( "Timeout waiting for generative computation to finish; problem too large" ) ;
402 |             ok = 0 ;
403 |             return ;
404 |             }
405 | 
406 |          empty_slot = ret_val ;
407 |          CloseHandle ( threads[empty_slot] ) ;
408 |          threads[empty_slot] = NULL ;
409 |          --n_threads ;
410 |          }
411 | 
412 | /*
413 |    Handle all work has been started and now we are just waiting for threads to finish
414 | */
415 | 
416 |       else if (image_number == nrows*ncols) {
417 |          ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
418 |          if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
419 |             sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 2 failed (%d) in GENERATIVE", ret_val ) ;
420 |             audit ( msg ) ;
421 |             MEMTEXT ( msg ) ;
422 |             if (ret_val == WAIT_TIMEOUT)
423 |                audit ( "Timeout waiting for generative computation to finish; problem too large" ) ;
424 |             ok = 0 ;
425 |             return ;
426 |             }
427 | 
428 |          for (i=0 ; i<n_threads ; i++)
429 |             CloseHandle ( threads[i] ) ;
430 | 
431 |          break ;
432 |          } // Waiting for final threads to finish
433 |       } // Endless loop which threads computation of criterion for all random tries
434 | 
435 | /*
436 |    All computation is finished.  Build the display.
437 | */
438 | 
439 | // Display as desired
440 | }
441 | 


--------------------------------------------------------------------------------
/V1 Source/RBM_THR1.CPP:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  RBM_THR1 - Restricted Boltzmann Machine trains a single hidden layer      */
  4 | /*                                                                            */
  5 | /*  This is the first half: find good initial weights                         */
  6 | /*                                                                            */
  7 | /******************************************************************************/
  8 | 
  9 | #define STRICT
 10 | #include <windows.h>
 11 | #include <commctrl.h>
 12 | #include <assert.h>
 13 | #include <stdlib.h>
 14 | #include <stdio.h>
 15 | #include <math.h>
 16 | #include <string.h>
 17 | #include <ctype.h>
 18 | #include <malloc.h>
 19 | #include <new.h>
 20 | #include <float.h>
 21 | #include <process.h>
 22 | 
 23 | #include "deep.rh"
 24 | #include "const.h"
 25 | #include "classes.h"
 26 | #include "extern.h"
 27 | #include "funcdefs.h"
 28 | 
 29 | 
 30 | /*
 31 | --------------------------------------------------------------------------------
 32 | 
 33 |    Workhorse routine that computes the criterion (reproduction error)
 34 |    for a weight matrix
 35 | 
 36 | --------------------------------------------------------------------------------
 37 | */
 38 | 
 39 | static double rbm1_threaded (
 40 |    int nc ,                // Number of cases
 41 |    int n_inputs ,          // Number of inputs
 42 |    int max_neurons ,       // Maximum number of neurons in any layer, as well as nin
 43 |    double *data ,          // Nc rows by max_neurons columns (n_inputs used) of input data; 0-1
 44 |    int nhid ,              // Number of hidden neurons
 45 |    double *w ,             // Computed weight matrix, nhid sets of n_inputs weights
 46 |    double *in_bias ,       // Computed input bias vector
 47 |    double *hid_bias ,      // Computed hidden bias vector
 48 |    double *visible1 ,      // Work vector n_inputs long
 49 |    double *hidden1         // Work vector nhid long
 50 |    )
 51 | {
 52 |    int icase, ihid, ivis ;
 53 |    double error, sum, *wptr, *dptr, P ;
 54 | 
 55 |    error = 0.0 ;  // Will cumulate reconstruction error, which is our criterion for best parameters here
 56 | 
 57 |    for (icase=0 ; icase<nc ; icase++) {    // Pass through all cases, cumulating error
 58 |       dptr = data + icase * max_neurons ;  // Point to this case in the data
 59 |       for (ivis=0 ; ivis<n_inputs ; ivis++)
 60 |          visible1[ivis] = dptr[ivis] ;
 61 | 
 62 |       // For each hidden neuron, compute Q[h=1|visible1].  Do not sample.
 63 | 
 64 |       for (ihid=0 ; ihid<nhid ; ihid++) {
 65 | 
 66 |          wptr = w + ihid * n_inputs ;      // Weight vector for this neuron
 67 |          sum = hid_bias[ihid] ;
 68 |          for (ivis=0 ; ivis<n_inputs ; ivis++)
 69 |             sum += wptr[ivis] * visible1[ivis] ;
 70 |          hidden1[ihid] = 1.0 / (1.0 + exp(-sum)) ;
 71 |          }
 72 | 
 73 |       // For each visible neuron, compute P[x=1|hidden layer]
 74 |       // and then find reconstruction error
 75 | 
 76 |       for (ivis=0 ; ivis<n_inputs ; ivis++) {
 77 |          sum = in_bias[ivis] ;
 78 |          for (ihid=0 ; ihid<nhid ; ihid++)
 79 |             sum += w[ihid*n_inputs+ivis] * hidden1[ihid] ;
 80 |          P = 1.0 / (1.0 + exp(-sum)) ;
 81 | #if RECON_ERR_XENT
 82 |          error -= visible1[ivis] * log(P+1.e-10) + (1.0 - visible1[ivis]) * log(1.0-P+1.e-10) ;
 83 | #else
 84 |          double diff ;
 85 |          diff = visible1[ivis] - P ;
 86 |          error += diff * diff ;
 87 | #endif
 88 |          }
 89 | 
 90 |       } // For icase
 91 | 
 92 |    return error ;
 93 | }
 94 | 
 95 | 
 96 | /*
 97 | --------------------------------------------------------------------------------
 98 | 
 99 |    Thread stuff...
100 |       Structure for passing information to/from threaded code
101 |       Threaded code called by the main subroutine
102 | 
103 | --------------------------------------------------------------------------------
104 | */
105 | 
/*
   One of these per thread carries the argument list of rbm1_threaded()
   into the thread and its result back out.
*/

typedef struct {
   int nc ;                // Number of cases
   int n_inputs ;          // Number of inputs
   int max_neurons ;       // Maximum number of neurons in any layer, including input; row length of data
   double *data ;          // Nc rows by max_neurons columns of input data; 0-1
   int nhid ;              // Number of hidden neurons
   double *w ;             // This thread's trial weight matrix; nhid sets of n_inputs weights
   double *in_bias ;       // This thread's trial input bias vector
   double *hid_bias ;      // This thread's trial hidden bias vector
   double *visible1 ;      // Work vector n_inputs long
   double *hidden1 ;       // Work vector nhid long
   double crit ;           // Output: the criterion computed by rbm1_threaded()
} RBM_THR1_PARAMS ;
119 | 
120 | static unsigned int __stdcall rbm1_wrapper ( LPVOID dp )
121 | {
122 |    ((RBM_THR1_PARAMS *) dp)->crit = rbm1_threaded (
123 |                           ((RBM_THR1_PARAMS *) dp)->nc ,
124 |                           ((RBM_THR1_PARAMS *) dp)->n_inputs ,
125 |                           ((RBM_THR1_PARAMS *) dp)->max_neurons ,
126 |                           ((RBM_THR1_PARAMS *) dp)->data ,
127 |                           ((RBM_THR1_PARAMS *) dp)->nhid ,
128 |                           ((RBM_THR1_PARAMS *) dp)->w ,
129 |                           ((RBM_THR1_PARAMS *) dp)->in_bias ,
130 |                           ((RBM_THR1_PARAMS *) dp)->hid_bias ,
131 |                           ((RBM_THR1_PARAMS *) dp)->visible1 ,
132 |                           ((RBM_THR1_PARAMS *) dp)->hidden1 ) ;
133 |    return 0 ;
134 | }
135 | 
136 | 
137 | /*
138 | --------------------------------------------------------------------------------
139 | 
140 |    Main routine called from greedy()
141 | 
142 | --------------------------------------------------------------------------------
143 | */
144 | 
145 | double rbm_thr1 (
146 |    int nc ,                // Number of cases
147 |    int n_inputs ,          // Number of inputs
148 |    int max_neurons ,       // Maximum number of neurons in any layer, including input
149 |    double *data ,          // Nc rows by max_neurons columns of input data; 0-1
150 |    int nhid ,              // Number of hidden neurons
151 |    double *w ,             // Returned weight matrix, nhid sets of n_inputs weights; max_threads sets
152 |    double *in_bias ,       // Returned input bias vector; max_threads sets
153 |    double *hid_bias ,      // Returned hidden bias vector; max_threads sets
154 |    double *visible1 ,      // Work vector n_inputs long; max_threads sets
155 |    double *hidden1 ,       // Work vector nhid long; max_threads sets
156 |    double *in_bias_best ,  // Work vector n_inputs long
157 |    double *hid_bias_best , // Work vector nhid long
158 |    double *w_best ,        // Work vector n_inputs * nhid long
159 |    double *data_mean       // Work vector n_inputs long
160 |    )
161 | 
162 | {
163 |    int irand, ivis, ihid ;
164 |    int i, k, n_rand, n_threads, empty_slot, ret_val ;
165 |    double error, best_err ;
166 |    double sum, wt, *dptr, *wptr, *hid_bias_ptr, *in_bias_ptr, diff ;
167 |    char msg[4096] ;
168 |    RBM_THR1_PARAMS params[MAX_THREADS] ;
169 |    HANDLE threads[MAX_THREADS] ;
170 | 
171 |    user_pressed_escape () ;
172 |    escape_key_pressed = 0 ;  // Allow subsequent operations
173 | 
174 | /*
175 |    Find the mean of the data for each input.
176 |    This is used to initialize visible bias terms to reasonable values.
177 | */
178 | 
179 |    for (ivis=0 ; ivis<n_inputs ; ivis++)
180 |       data_mean[ivis] = 0.0 ;
181 | 
182 |    for (i=0 ; i<nc ; i++) {            // Pass through all cases, cumulating mean vector
183 |       dptr = data + i * max_neurons ;  // Point to this case in the data
184 |       for (ivis=0 ; ivis<n_inputs ; ivis++)
185 |          data_mean[ivis] += dptr[ivis] ;
186 |       }
187 | 
188 |    for (ivis=0 ; ivis<n_inputs ; ivis++) {
189 |       data_mean[ivis] /= nc ;
190 |       if (data_mean[ivis] < 1.e-8)
191 |          data_mean[ivis] = 1.e-8 ;
192 |       if (data_mean[ivis] > 1.0 - 1.e-8)
193 |          data_mean[ivis] = 1.0 - 1.e-8 ;
194 |       }
195 | 
196 | 
197 | /*
198 |    Get the training parameters from the global storage
199 |    Initialize parameters that will not change for threads.
200 | */
201 | 
202 |    n_rand = TrainParams.n_rand ;
203 | 
204 |    for (i=0 ; i<max_threads ; i++) {
205 |       params[i].nc = nc ;
206 |       params[i].n_inputs = n_inputs ;
207 |       params[i].max_neurons = max_neurons ;
208 |       params[i].nhid = nhid ;
209 |       params[i].data = data ;
210 |       params[i].visible1 = visible1 + i * max_neurons ;
211 |       params[i].hidden1 = hidden1 + i * max_neurons ;
212 |       params[i].w = w + i * nhid * n_inputs ;
213 |       params[i].hid_bias = hid_bias + i * max_neurons ;
214 |       params[i].in_bias = in_bias + i * max_neurons ;
215 |       }
216 | 
217 | 
218 | /*
219 | ------------------------------------------------------------------------------------------------
220 | 
221 |    Try some small weight vectors and choose as starter the one with minimum reconstruction error.
222 |    We also initialize all bias vectors to minus half of the weight sum for rough balance.
223 | 
224 | ------------------------------------------------------------------------------------------------
225 | */
226 | 
227 |    n_threads = 0 ;                    // Counts threads that are active
228 |    for (i=0 ; i<max_threads ; i++)
229 |       threads[i] = NULL ;
230 | 
231 |    irand = 0 ;        // Index of try
232 |    empty_slot = -1 ;  // After full, will identify the thread that just completed
233 |    best_err = 1.e40 ;
234 | 
235 |    for (;;) {         // Main thread loop processes all tries
236 | 
237 | /*
238 |    Handle user ESCape
239 | */
240 | 
241 |       if (irand  &&  (escape_key_pressed  ||  user_pressed_escape ())) { // Make sure at least one tried
242 |          user_pressed_escape () ;
243 |          escape_key_pressed = 0 ;  // Allow subsequent operations
244 |          for (i=0, k=0 ; i<max_threads ; i++) {
245 |             if (threads[i] != NULL)
246 |                threads[k++] = threads[i] ;
247 |             }
248 |          ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 12000000 ) ;
249 |          if (ret_val == WAIT_TIMEOUT)
250 |             audit ( "Timeout waiting for computation to finish; problem too large" ) ;
251 |          sprintf ( msg, "RBM_THR1.CPP: User abort; n_threads=%d  k=%d  Wait retval=%d", n_threads, k, ret_val ) ;
252 |          MEMTEXT ( msg ) ;
253 |          for (i=0 ; i<n_threads ; i++)
254 |             CloseHandle ( threads[i] ) ;
255 |          audit ( "" ) ;
256 |          audit ( "WARNING: User pressed ESCape during initial search for RBM starting weights" ) ;
257 |          audit ( "         Results may be substandard" ) ;
258 |          return best_err ;  // Let greedy() continue with whatever we have so far
259 |          }
260 | 
261 | /*
262 |    Start a new thread if we still have work to do
263 | */
264 | 
265 |       if (irand < n_rand) {     // If there are still some to do
266 |          if (empty_slot < 0)    // Negative while we are initially filling the queue
267 |             k = n_threads ;
268 |          else
269 |             k = empty_slot ;
270 | 
271 |          // Generate the trial weight matrix and bias vectors
272 | 
273 |          wptr = params[k].w ;
274 |          hid_bias_ptr = params[k].hid_bias ;
275 |          in_bias_ptr = params[k].in_bias ;
276 |          
277 |          diff = 4.0 * unifrand_fast() / sqrt ( sqrt ( (double) n_inputs * nhid ) ) ;
278 | 
279 |          for (ihid=0 ; ihid<nhid ; ihid++) {
280 |             sum = 0.0 ;
281 | 
282 |             for (ivis=0 ; ivis<n_inputs ; ivis++) {   // Get all visible weights for this hidden neuron
283 |                wt = diff * (unifrand_fast() - 0.5) ;  // This is symmetric with heavy-ish tails
284 |                wptr[ihid*n_inputs+ivis] = wt ;
285 |                sum += data_mean[ivis] * wt ;          // We'll need this for this hidden neuron's bias
286 |                }
287 | 
288 |             hid_bias_ptr[ihid] = -sum ;               // Center the distribution
289 |             } // For ihid
290 | 
291 | 
292 |          for (ivis=0 ; ivis<n_inputs ; ivis++) {      // Also center the visible
293 |             sum = 0.0 ;
294 |             for (ihid=0 ; ihid<nhid ; ihid++)
295 |                sum += wptr[ihid*n_inputs+ivis] ;            
296 |             in_bias_ptr[ivis] = log ( data_mean[ivis] / (1.0 - data_mean[ivis]) ) - 0.5 * sum ;
297 |             }
298 | 
299 |          // Start the thread for this trial
300 | 
301 |          threads[k] = (HANDLE) _beginthreadex ( NULL , 0 , rbm1_wrapper , &params[k] , 0 , NULL ) ;
302 |          if (threads[k] == NULL) {
303 |             audit ( "Internal ERROR: bad thread creation in RBM_THR1" ) ;
304 |             for (i=0 ; i<n_threads ; i++) {
305 |                if (threads[i] != NULL)
306 |                   CloseHandle ( threads[i] ) ;
307 |                }
308 |             return -best_err ;  // Signal greedy() that a catastrophic error occurred
309 |             }
310 |          ++n_threads ;
311 |          ++irand ;
312 |          } // if (irand < n_rand)
313 | 
314 |       if (n_threads == 0)  // Are we done?
315 |          break ;
316 | 
317 | /*
318 |    Handle full suite of threads running and more threads to add as soon as some are done.
319 |    Wait for just one thread to finish.
320 | */
321 | 
322 |       if (n_threads == max_threads  &&  irand < n_rand) {
323 |          ret_val = WaitForMultipleObjects ( n_threads , threads , FALSE , 12000000 ) ;
324 |          if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
325 |             sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 1 failed (%d) in RBM_THR1", ret_val ) ;
326 |             audit ( msg ) ;
327 |             MEMTEXT ( msg ) ;
328 |             if (ret_val == WAIT_TIMEOUT)
329 |                audit ( "Timeout waiting for computation to finish; problem too large" ) ;
330 |             return -best_err ;  // Signal greedy() that a catastrophic error occurred
331 |             }
332 | 
333 |          error = params[ret_val].crit ;
334 | 
335 |          // If we just improved, save the best-so-far parameters
336 | 
337 |          if (error < best_err) {
338 |             best_err = error ;
339 |             for (ihid=0 ; ihid<nhid ; ihid++) {
340 |                hid_bias_best[ihid] = params[ret_val].hid_bias[ihid] ;
341 |                for (ivis=0 ; ivis<n_inputs ; ivis++)
342 |                   w_best[ihid*n_inputs+ivis] = params[ret_val].w[ihid*n_inputs+ivis] ;
343 |                }
344 | 
345 |             for (ivis=0 ; ivis<n_inputs ; ivis++)
346 |                in_bias_best[ivis] = params[ret_val].in_bias[ivis] ;
347 |             }
348 | 
349 | #if RECON_ERR_XENT
350 |          sprintf ( msg, "%d of %d  XENT=%7.4lf  Best=%7.4lf",
351 |                    irand-max_threads+1, n_rand, error / (n_inputs * nc),
352 |                    best_err / (n_inputs * nc) ) ;
353 | #else
354 |          sprintf ( msg, "%d of %d  RMS Err=%7.4lf  Best=%7.4lf",
355 |                    irand-max_threads+1, n_rand, sqrt ( error / (n_inputs * nc) ),
356 |                    sqrt ( best_err / (n_inputs * nc) ) ) ;
357 | #endif
358 | 
359 |          empty_slot = ret_val ;
360 |          CloseHandle ( threads[empty_slot] ) ;
361 |          threads[empty_slot] = NULL ;
362 |          --n_threads ;
363 |          }
364 | 
365 | /*
366 |    Handle all work has been started and now we are just waiting for threads to finish
367 | */
368 | 
369 |       else if (irand == n_rand) {
370 |          ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
371 |          if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
372 |             sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 2 failed (%d) in RBM_THR1.CPP", ret_val ) ;
373 |             audit ( msg ) ;
374 |             MEMTEXT ( msg ) ;
375 |             if (ret_val == WAIT_TIMEOUT)
376 |                audit ( "Timeout waiting for computation to finish; problem too large" ) ;
377 |             return -best_err ;  // Signal greedy() that a catastrophic error occurred
378 |             }
379 | 
380 |          for (i=0 ; i<n_threads ; i++) {
381 | 
382 |             error = params[i].crit ;
383 | 
384 |             // If we just improved, save the best-so-far parameters
385 | 
386 |             if (error < best_err) {
387 |                for (ihid=0 ; ihid<nhid ; ihid++) {
388 |                   hid_bias_best[ihid] = params[i].hid_bias[ihid] ;
389 |                   best_err = error ;
390 |                   for (ivis=0 ; ivis<n_inputs ; ivis++)
391 |                      w_best[ihid*n_inputs+ivis] = params[i].w[ihid*n_inputs+ivis] ;
392 |                   }
393 |    
394 |                for (ivis=0 ; ivis<n_inputs ; ivis++)
395 |                   in_bias_best[ivis] = params[i].in_bias[ivis] ;
396 |                }
397 | 
398 |             CloseHandle ( threads[i] ) ;
399 | 
400 | #if RECON_ERR_XENT
401 |             sprintf ( msg, "%d of %d  XENT=%7.4lf  Best=%7.4lf",
402 |                       n_rand-n_threads+i+1, n_rand, error / (n_inputs * nc),
403 |                       best_err / (n_inputs * nc) ) ;
404 | #else
405 |             sprintf ( msg, "%d of %d  RMS Err=%7.4lf  Best=%7.4lf",
406 |                       n_rand-n_threads+i+1, n_rand, sqrt ( error / (n_inputs * nc) ),
407 |                       sqrt ( best_err / (n_inputs * nc) ) ) ;
408 | #endif
409 |             } // For i, processing all threads just returned
410 |          break ;
411 |          } // Waiting for final threads to finish
412 |       } // Endless loop which threads computation of criterion for all random tries
413 | 
414 | /*
415 |    Copy the best parameters (in ?_best) into the weights.
416 |    Since the error is stochastic, we cannot expect an exact match with what we will get
417 |    on the first epoch, which uses the 'best' weights.  But they should usually be close.
418 | */
419 | 
420 |    for (ihid=0 ; ihid<nhid ; ihid++) {
421 |       hid_bias[ihid] = hid_bias_best[ihid] ;
422 |       for (ivis=0 ; ivis<n_inputs ; ivis++)
423 |          w[ihid*n_inputs+ivis] = w_best[ihid*n_inputs+ivis] ;
424 |       }
425 | 
426 |    for (ivis=0 ; ivis<n_inputs ; ivis++)
427 |       in_bias[ivis] = in_bias_best[ivis] ;
428 | 
429 |    return best_err / (nc * n_inputs) ;
430 | }


--------------------------------------------------------------------------------
/V1 Source/MLFN_CUDA.CPP:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  MLFN_CUDA - MLFN routines modified for CUDA processing                    */
  4 | /*                                                                            */
  5 | /******************************************************************************/
  6 | 
  7 | #define STRICT
  8 | #include <windows.h>
  9 | #include <commctrl.h>
 10 | #include <assert.h>
 11 | #include <stdlib.h>
 12 | #include <stdio.h>
 13 | #include <math.h>
 14 | #include <string.h>
 15 | #include <ctype.h>
 16 | #include <malloc.h>
 17 | #include <new.h>
 18 | #include <float.h>
 19 | #include <process.h>
 20 | 
 21 | #include "deep.rh"
 22 | #include "const.h"
 23 | #include "classes.h"
 24 | #include "extern.h"
 25 | #include "funcdefs.h"
 26 | 
 27 | 
 28 | /*
 29 | --------------------------------------------------------------------------------
 30 | 
 31 |    trial_error_cuda - Compute the mean square error for the entire training set
 32 | 
 33 | --------------------------------------------------------------------------------
 34 | */
 35 | 
 36 | double Model::trial_error_cuda (
 37 |    int nc ,             // Number of cases
 38 |    double *input ,      // Input matrix, nc by Model::n_model_inputs
 39 |    double *target       // Target matrix, nc by ntarg
 40 |    )
 41 | {
 42 |    int i, ilayer, ineuron, ivar, ret_val, ibatch, n_in_batch, n_subsets, max_batch, istart, istop, n_done ;
 43 |    int n_prior, gradlen, nin_this_layer, timer ;
 44 |    double mse, *wptr ;
 45 |    char msg[256] ;
 46 | 
 47 |    assert ( n_all >= 2 ) ;  // Use CUDA only if at least one hidden layer
 48 | 
 49 | /*
 50 |    In order to prevent integer overflow in allocating memory for the gradient
 51 |    we compute the minimum number of batches needed to get each batch small enough.
 52 | */
 53 | 
 54 |    gradlen = 0 ;
 55 |    n_prior = n_model_inputs ;
 56 |    for (i=0 ; i<n_all-1 ; i++) {
 57 |       gradlen += nhid_all[i] * (n_prior + 1) ;
 58 |       n_prior = nhid_all[i] ;
 59 |       }
 60 |    gradlen += ntarg * (n_prior + 1) ;
 61 |    assert ( gradlen == n_all_weights ) ;
 62 | 
 63 |    max_batch = MAXPOSNUM / (gradlen * sizeof(float)) ;  // Memory allocation size
 64 |    if (max_batch > 65535)                               // Grid dimension
 65 |       max_batch = 65535 ;
 66 |    n_subsets = nc / max_batch + 1 ;
 67 | 
 68 |    if (n_subsets < TrainParams.n_subsets)
 69 |       n_subsets = TrainParams.n_subsets ;
 70 | 
 71 |    else if (n_subsets > TrainParams.n_subsets  &&  ! mlfn_cuda_initialized) {
 72 |       sprintf ( msg, "MLFN CUDA increased n_subsets to %d", n_subsets ) ;
 73 |       MEMTEXT ( msg ) ;
 74 |       cudalog ( msg ) ;
 75 |       audit ( "" ) ;
 76 |       sprintf ( msg, "NOTE... Number of subsets had to be increased to %d", n_subsets ) ;
 77 |       audit ( msg ) ;
 78 |       }
 79 | 
 80 | 
 81 | /*
 82 |    Initialize CUDA device if not yet done for this session
 83 | 
 84 |    Programming WARNING... If ANY of the parameters in the call to mlfn_cuda_init change,
 85 |                           then mlfn_cuda_cleanup MUST be called and init redone!
 86 | */
 87 | 
 88 |    if (! mlfn_cuda_initialized) {
 89 | 
 90 |       n_done = 0 ;         // Must find max batch size for cuda init
 91 |       for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {
 92 |          n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
 93 |          if (ibatch == 0  ||  n_in_batch > max_batch)
 94 |             max_batch = n_in_batch ;
 95 |          n_done += n_in_batch ;
 96 |          }
 97 | 
 98 |       assert ( max_batch * sizeof(float) <= MAXPOSNUM / gradlen ) ;
 99 | 
100 |       ret_val =  mlfn_cuda_init ( classifier , class_ids , nc , n_model_inputs , max_neurons , input ,
101 |                                   ntarg , target , max_batch , n_all , nhid_all , msg ) ;
102 | 
103 |       if (ret_val == ERROR_INSUFFICIENT_MEMORY) {
104 |          audit ( "" ) ;
105 |          audit ( "ERROR... Host computer has insufficient memory" ) ;
106 |          }
107 |       if (ret_val == ERROR_CUDA_MEMORY) {
108 |          audit ( "" ) ;
109 |          audit ( "ERROR... CUDA device has insufficient memory" ) ;
110 |          }
111 |       if (ret_val == ERROR_CUDA_ERROR) {
112 |          audit ( "" ) ;
113 |          audit ( "ERROR... CUDA device had unexpected serious error" ) ;
114 |          }
115 |       if (ret_val) {
116 |          audit ( "" ) ;
117 |          audit ( "ERROR... Unrecoverable serious error... aborting" ) ;
118 |          return -1.e40 ;
119 |          }
120 | 
121 |       mlfn_cuda_initialized = 1 ;
122 |       }
123 | 
124 | 
125 |    if (cuda_weights_changed) {
126 |       ++CudaTimers.mlfn_ncalls_weights ;
127 |       timer = timeGetTime() ;
128 |       ret_val = cuda_weights_to_device ( n_model_inputs , ntarg ,
129 |                   n_all , nhid_all , weights_opt , final_layer_weights ) ;
130 |       if (ret_val) {
131 |          audit ( "" ) ;
132 |          audit ( "ERROR - Serious CUDA error" ) ;
133 |          return -1.e40 ;
134 |          }
135 |       CudaTimers.mlfn_weights += timeGetTime() - timer ;
136 |       cuda_weights_changed = 0 ;
137 |       }
138 | 
139 |    istart = 0 ;         // Batch start = training data start
140 |    n_done = 0 ;         // Number of training cases done in this epoch so far
141 | 
142 |    for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {
143 |       n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
144 |       istop = istart + n_in_batch ;                         // Stop just before this index
145 | 
146 |       for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {
147 |          ++CudaTimers.mlfn_ncalls_hidden[ilayer] ;
148 |          timer = timeGetTime() ;
149 |          ret_val = cuda_hidden_activation ( istart , istop , nhid_all[ilayer] , ilayer ) ;
150 |          if (ret_val) {
151 |             audit ( "" ) ;
152 |             sprintf ( msg, "ERROR - Serious CUDA error (1 - %d) in MLFN_CUDA.CPP trial_error_cuda", ilayer ) ;
153 |             audit ( msg ) ;
154 |             return -1.e40 ;
155 |             }
156 |          CudaTimers.mlfn_hidden[ilayer] += timeGetTime() - timer ;
157 |          }
158 | 
159 |       ++CudaTimers.mlfn_ncalls_outact ;
160 |       timer = timeGetTime() ;
161 |       ret_val = cuda_output_activation ( istart , istop , nhid_all[n_all-2] , ntarg , n_all-2 ) ;
162 |       if (ret_val) {
163 |          audit ( "" ) ;
164 |          audit ( "ERROR - Serious CUDA error (2) in MLFN_CUDA.CPP trial_error_cuda" ) ;
165 |          return -1.e40 ;
166 |          }
167 |       CudaTimers.mlfn_outact += timeGetTime() - timer ;
168 | 
169 |       if (classifier) {
170 |          ++CudaTimers.mlfn_ncalls_softmax ;
171 |          timer = timeGetTime() ;
172 |          ret_val = cuda_softmax ( istart , istop ) ;
173 |          if (ret_val) {
174 |             audit ( "" ) ;
175 |             audit ( "ERROR - Serious CUDA error (3) in MLFN_CUDA.CPP trial_error_cuda" ) ;
176 |             return -1.e40 ;
177 |             }
178 |          CudaTimers.mlfn_softmax += timeGetTime() - timer ;
179 |          }
180 | 
181 |       n_done += n_in_batch ;
182 |       istart = istop ;
183 |       }  // For all batches
184 | 
185 |    if (classifier) {
186 |       ++CudaTimers.mlfn_ncalls_ll ;
187 |       timer = timeGetTime() ;
188 |       ret_val = cuda_ll ( nc , &mse ) ;
189 |       CudaTimers.mlfn_ll += timeGetTime() - timer ;
190 |       mse /= ntarg ;
191 |       }
192 |    else {
193 |       ++CudaTimers.mlfn_ncalls_mse ;
194 |       timer = timeGetTime() ;
195 |       ret_val = cuda_mse ( nc * ntarg , &mse ) ;
196 |       CudaTimers.mlfn_mse += timeGetTime() - timer ;
197 |       }
198 | 
199 |    if (ret_val) {
200 |       audit ( "" ) ;
201 |       audit ( "ERROR - Serious CUDA error (4) in MLFN_CUDA.CPP trial_error_cuda" ) ;
202 |       return -1.e40 ;
203 |       }
204 | 
205 | 
206 | /*
207 |    Deal with weight penalty
208 | */
209 | 
210 |    ++CudaTimers.mlfn_ncalls_wpen ;
211 |    timer = timeGetTime() ;
212 |    penalty = 0.0 ;
213 |    nin_this_layer = n_model_inputs ;
214 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {  // Do all hidden layers
215 |       for (ineuron=0 ; ineuron<nhid_all[ilayer] ; ineuron++) {
216 |          wptr = weights_opt[ilayer]+ineuron*(nin_this_layer+1) ;  // Weights for this neuron in this layer
217 |          for (ivar=0 ; ivar<nin_this_layer ; ivar++)              // Do not include bias
218 |             penalty += wptr[ivar] * wptr[ivar] ;
219 |          }
220 |       nin_this_layer = nhid_all[ilayer] ;
221 |       }
222 | 
223 |    for (ineuron=0 ; ineuron<ntarg ; ineuron++) {
224 |       wptr = final_layer_weights + ineuron * n_final_layer_weights ;
225 |       for (ivar=0 ; ivar<nin_this_layer ; ivar++)
226 |          penalty += wptr[ivar] * wptr[ivar] ;
227 |       }
228 |    CudaTimers.mlfn_wpen += timeGetTime() - timer ;
229 | 
230 |    penalty *= TrainParams.wpen / n_all_weights ;
231 |    return mse + penalty ;
232 | }
233 | 
234 | 
235 | /*
236 | --------------------------------------------------------------------------------
237 | 
238 |    gradient_cuda - Compute the gradient for the entire training set
239 | 
240 | --------------------------------------------------------------------------------
241 | */
242 | 
243 | double Model::gradient_cuda (
244 |    int nc ,             // Number of cases
245 |    double *input ,      // Input matrix, nc by Model::n_model_inputs
246 |    double *target ,     // Target matrix, nc by ntarg
247 |    double *grad         // Complete gradient
248 |    )
249 | {
250 |    int i, k, n, ilayer, ineuron, ivar, ret_val, ibatch, n_in_batch, n_subsets, istart, istop, n_done, max_batch ;
251 |    int n_prior, gradlen, nin_this_layer, timer ;
252 |    double mse, wpen, *wptr, *gptr ;
253 |    char msg[256] ;
254 | 
255 |    assert ( n_all >= 2 ) ;  // Use CUDA only if at least one hidden layer
256 | 
257 | // Setup pointers to gradient for each layer
258 |    gptr = grad ;  // CONJGRAD.CPP allocated this
259 | 
260 |    k = 0 ;
261 |    for (ilayer=0 ; ilayer<n_all ; ilayer++) {
262 |       grad_ptr[ilayer] = gptr ;
263 | 
264 |       if (ilayer == 0  &&  n_all == 1) {             // Direct input to output?
265 |          n = ntarg * (n_model_inputs+1) ;            // This many inputs to each neuron in this layer
266 |          gptr += n ;                                 // Not needed, but it illustrates the process
267 |          k += n ;   // Can remove this when final assert is assured
268 |          }
269 | 
270 |       else if (ilayer == 0) {                        // First hidden layer?
271 |          n = nhid_all[ilayer] * (n_model_inputs+1) ; // This many inputs to each neuron in this layer
272 |          gptr += n ;
273 |          k += n ;   // Can remove this when final assert is assured
274 |          }
275 | 
276 |       else if (ilayer < n_all-1) {                       // Subsequent hidden layer?
277 |          n = nhid_all[ilayer] * (nhid_all[ilayer-1]+1) ; // This many inputs to each neuron in this layer
278 |          gptr += n ;
279 |          k += n ;   // Can remove this when final assert is assured
280 |          }
281 | 
282 |       else {
283 |          assert ( (nhid_all[ilayer-1]+1) == n_final_layer_weights ) ;
284 |          n = ntarg * (nhid_all[ilayer-1]+1) ; // This many inputs to each neuron in this layer
285 |          k += n ;   // Can remove this when final assert is assured
286 |          }
287 |       } // For all layers, including output
288 | 
289 |    assert ( k == n_all_weights ) ;
290 | 
291 | /*
292 |    In order to prevent integer overflow in allocating memory for the gradient
293 |    we compute the minimum number of batches needed to get each batch small enough.
294 | */
295 | 
296 |    gradlen = 0 ;
297 |    n_prior = n_model_inputs ;
298 |    for (i=0 ; i<n_all-1 ; i++) {   // Hidden layers
299 |       gradlen += nhid_all[i] * (n_prior + 1) ;
300 |       n_prior = nhid_all[i] ;
301 |       }
302 |    gradlen += ntarg * (n_prior + 1) ;    // Output layer
303 |    assert ( gradlen == n_all_weights ) ;
304 | 
305 |    max_batch = MAXPOSNUM / (gradlen * sizeof(float)) ;  // Memory allocation size
306 |    if (max_batch > 65535)                               // Grid dimension
307 |       max_batch = 65535 ;
308 |    n_subsets = nc / max_batch + 1 ;
309 | 
310 |    if (n_subsets < TrainParams.n_subsets)
311 |       n_subsets = TrainParams.n_subsets ;
312 | 
313 |    else if (n_subsets > TrainParams.n_subsets  &&  ! mlfn_cuda_initialized) {
314 |       sprintf ( msg, "WARNING... MLFN CUDA increased n_subsets to %d", n_subsets ) ;
315 |       MEMTEXT ( msg ) ;
316 |       cudalog ( msg ) ;
317 |       audit ( "" ) ;
318 |       sprintf ( msg, "NOTE... Number of batches had to be increased to %d", n_subsets ) ;
319 |       audit ( msg ) ;
320 |       }
321 | 
322 | 
323 | /*
324 |    Initialize CUDA device if not yet done for this session
325 | 
326 |    Programming WARNING... If ANY of the parameters in the call to mlfn_cuda_init change,
327 |                           then mlfn_cuda_cleanup MUST be called and init redone!
328 | */
329 | 
330 |    if (! mlfn_cuda_initialized) {
331 | 
332 |       n_done = 0 ;         // Must find max batch size for cuda init
333 |       for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {
334 |          n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
335 |          if (ibatch == 0  ||  n_in_batch > max_batch)
336 |             max_batch = n_in_batch ;
337 |          n_done += n_in_batch ;
338 |          }
339 | 
340 |       assert ( max_batch * sizeof(float) <= MAXPOSNUM / gradlen ) ;
341 | 
342 |       ret_val =  mlfn_cuda_init ( classifier , class_ids , nc , n_model_inputs , max_neurons , input ,
343 |                                   ntarg , target , max_batch , n_all , nhid_all , msg ) ;
344 | 
345 |       if (ret_val == ERROR_INSUFFICIENT_MEMORY) {
346 |          audit ( "" ) ;
347 |          audit ( "ERROR... Host computer has insufficient memory" ) ;
348 |          }
349 |       if (ret_val == ERROR_CUDA_MEMORY) {
350 |          audit ( "" ) ;
351 |          audit ( "ERROR... CUDA device has insufficient memory" ) ;
352 |          }
353 |       if (ret_val == ERROR_CUDA_ERROR) {
354 |          audit ( "" ) ;
355 |          audit ( "ERROR... CUDA device had unexpected serious error" ) ;
356 |          }
357 |       if (ret_val) {
358 |          audit ( "" ) ;
359 |          audit ( "ERROR... Unrecoverable serious error... aborting" ) ;
360 |          return -1.e40 ;
361 |          }
362 | 
363 |       mlfn_cuda_initialized = 1 ;
364 |       }
365 | 
366 | 
367 |    if (cuda_weights_changed) {
368 |       ++CudaTimers.mlfn_ncalls_weights ;
369 |       timer = timeGetTime() ;
370 |       ret_val = cuda_weights_to_device ( n_model_inputs , ntarg ,
371 |                   n_all , nhid_all , weights_opt , final_layer_weights ) ;
372 |       if (ret_val) {
373 |          audit ( "" ) ;
374 |          audit ( "ERROR - Serious CUDA error" ) ;
375 |          return -1.e40 ;
376 |          }
377 |       CudaTimers.mlfn_weights += timeGetTime() - timer ;
378 |       cuda_weights_changed = 0 ;
379 |       }
380 | 
381 | /*
382 |    Gradient computation starts here
383 | */
384 | 
385 |    for (i=0 ; i<n_all_weights ; i++)
386 |       grad[i] = 0.0 ;
387 | 
388 |    istart = 0 ;         // Batch start = training data start
389 |    n_done = 0 ;         // Number of training cases done in this epoch so far
390 | 
391 |    for (ibatch=0 ; ibatch<n_subsets ; ibatch++) {
392 |       n_in_batch = (nc - n_done) / (n_subsets - ibatch) ;   // Cases left to do / batches left to do
393 |       istop = istart + n_in_batch ;                         // Stop just before this index
394 | 
395 | /*
396 |    Forward pass
397 | */
398 | 
399 |       for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {
400 |          ++CudaTimers.mlfn_ncalls_hidden[ilayer] ;
401 |          timer = timeGetTime() ;
402 |          ret_val = cuda_hidden_activation ( istart , istop , nhid_all[ilayer] , ilayer ) ;
403 |          if (ret_val) {
404 |             audit ( "" ) ;
405 |             sprintf ( msg, "ERROR - Serious CUDA error (1 - %d) in MLFN_CUDA.CPP gradient_cuda", ilayer ) ;
406 |             audit ( msg ) ;
407 |             return -1.e40 ;
408 |             }
409 |          CudaTimers.mlfn_hidden[ilayer] += timeGetTime() - timer ;
410 |          }
411 | 
412 |       ++CudaTimers.mlfn_ncalls_outact ;
413 |       timer = timeGetTime() ;
414 |       ret_val = cuda_output_activation ( istart , istop , nhid_all[n_all-2] , ntarg , n_all-2 ) ;
415 |       if (ret_val) {
416 |          audit ( "" ) ;
417 |          audit ( "ERROR - Serious CUDA error (2) in MLFN_CUDA.CPP gradient_cuda" ) ;
418 |          return -1.e40 ;
419 |          }
420 |       CudaTimers.mlfn_outact += timeGetTime() - timer ;
421 | 
422 |       if (classifier) {
423 |          ++CudaTimers.mlfn_ncalls_softmax ;
424 |          timer = timeGetTime() ;
425 |          ret_val = cuda_softmax ( istart , istop ) ;
426 |          if (ret_val) {
427 |             audit ( "" ) ;
428 |             audit ( "ERROR - Serious CUDA error (3) in MLFN_CUDA.CPP trial_error_cuda" ) ;
429 |             return -1.e40 ;
430 |             }
431 |          CudaTimers.mlfn_softmax += timeGetTime() - timer ;
432 |          }
433 | 
434 | /*
435 |    Backward pass
436 | */
437 | 
438 |       ++CudaTimers.mlfn_ncalls_outdelta ;
439 |       timer = timeGetTime() ;
440 |       ret_val = cuda_output_delta ( istart , istop , classifier , ntarg ) ;
441 |       if (ret_val) {
442 |          audit ( "" ) ;
443 |          audit ( "ERROR - Serious CUDA error (4) in MLFN_CUDA.CPP gradient_cuda" ) ;
444 |          return -1.e40 ;
445 |          }
446 |       CudaTimers.mlfn_outdelta += timeGetTime() - timer ;
447 | 
448 |       ++CudaTimers.mlfn_ncalls_outgrad ;
449 |       timer = timeGetTime() ;
450 |       ret_val = cuda_output_gradient ( n_in_batch , nhid_all[n_all-2] , n_all-2 , ntarg ) ;
451 |       if (ret_val) {
452 |          audit ( "" ) ;
453 |          audit ( "ERROR - Serious CUDA error (5) in MLFN_CUDA.CPP gradient_cuda" ) ;
454 |          return -1.e40 ;
455 |          }
456 |       CudaTimers.mlfn_outgrad += timeGetTime() - timer ;
457 | 
458 |       for (ilayer=n_all-2 ; ilayer>0 ; ilayer--) {
459 |          ++CudaTimers.mlfn_ncalls_subgrad[ilayer-1] ;
460 |          timer = timeGetTime() ;
461 |          ret_val = cuda_subsequent_hidden_gradient ( n_in_batch , ilayer ,
462 |                               nhid_all[ilayer] , nhid_all[ilayer-1] , ilayer==n_all-2 ) ;
463 |          if (ret_val) {
464 |             audit ( "" ) ;
465 |             sprintf ( msg, "ERROR - Serious CUDA error (6 - %d) in MLFN_CUDA.CPP gradient_cuda", ilayer ) ;
466 |             audit ( msg ) ;
467 |             return -1.e40 ;
468 |             }
469 |          CudaTimers.mlfn_subgrad[ilayer-1] += timeGetTime() - timer ;
470 |          }
471 | 
472 |       ++CudaTimers.mlfn_ncalls_firstgrad ;
473 |       timer = timeGetTime() ;
474 |       ret_val = cuda_first_hidden_gradient ( istart , istop , n_model_inputs , nhid_all[0] , n_all==2 ) ;
475 |       if (ret_val) {
476 |          audit ( "" ) ;
477 |          audit ( "ERROR - Serious CUDA error (7) in MLFN_CUDA.CPP gradient_cuda" ) ;
478 |          return -1.e40 ;
479 |          }
480 |       CudaTimers.mlfn_firstgrad += timeGetTime() - timer ;
481 | 
482 |       ++CudaTimers.mlfn_ncalls_fetchgrad ;
483 |       timer = timeGetTime() ;
484 |       ret_val = cuda_fetch_gradient ( n_in_batch , grad ) ;
485 |       if (ret_val) {
486 |          audit ( "" ) ;
487 |          audit ( "ERROR - Serious CUDA error (8) in MLFN_CUDA.CPP gradient_cuda" ) ;
488 |          return -1.e40 ;
489 |          }
490 |       CudaTimers.mlfn_fetchgrad += timeGetTime() - timer ;
491 | 
492 |       n_done += n_in_batch ;
493 |       istart = istop ;
494 |       }  // For all batches
495 | 
496 |    for (i=0 ; i<n_all_weights ; i++)
497 |       grad[i] /= nc * ntarg ;
498 | 
499 | 
500 |    if (classifier) {
501 |       ++CudaTimers.mlfn_ncalls_ll ;
502 |       timer = timeGetTime() ;
503 |       ret_val = cuda_ll ( nc , &mse ) ;
504 |       CudaTimers.mlfn_ll += timeGetTime() - timer ;
505 |       mse /= ntarg ;  // cuda_ll() divided by n but not ntarg
506 |       }
507 |    else {
508 |       ++CudaTimers.mlfn_ncalls_mse ;
509 |       timer = timeGetTime() ;
510 |       ret_val = cuda_mse ( nc * ntarg , &mse ) ;
511 |       CudaTimers.mlfn_mse += timeGetTime() - timer ;
512 |       }
513 | 
514 |    if (ret_val) {
515 |       audit ( "" ) ;
516 |       audit ( "ERROR - Serious CUDA error (9) in MLFN_CUDA.CPP gradient_cuda" ) ;
517 |       return -1.e40 ;
518 |       }
519 | 
520 | 
521 | /*
522 |    Deal with weight penalty
523 |    First block of code does hidden layers, second does output layer
524 | */
525 | 
526 |    wpen = TrainParams.wpen / n_all_weights ;
527 |    penalty = 0.0 ;
528 |    nin_this_layer = n_model_inputs ;
529 | 
530 |    ++CudaTimers.mlfn_ncalls_wpen ;
531 |    timer = timeGetTime() ;
532 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {  // Do all hidden layers
533 |       for (ineuron=0 ; ineuron<nhid_all[ilayer] ; ineuron++) {
534 |          wptr = weights_opt[ilayer] + ineuron*(nin_this_layer+1) ;  // Weights for this neuron in this layer
535 |          gptr = grad_ptr[ilayer] + ineuron*(nin_this_layer+1) ;     // Ditto grad
536 |          for (ivar=0 ; ivar<nin_this_layer ; ivar++) {              // Do not include bias
537 |             penalty += wptr[ivar] * wptr[ivar] ;
538 |             gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
539 |             }
540 |          }
541 |       nin_this_layer = nhid_all[ilayer] ;
542 |       }
543 | 
544 |    for (ineuron=0 ; ineuron<ntarg ; ineuron++) {
545 |       wptr = final_layer_weights + ineuron * n_final_layer_weights ;
546 |       gptr = grad_ptr[n_all-1] + ineuron * n_final_layer_weights ;
547 |       for (ivar=0 ; ivar<nin_this_layer ; ivar++) {                 // Do not include bias
548 |          penalty += wptr[ivar] * wptr[ivar] ;
549 |          gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
550 |          }
551 |       }
552 |    CudaTimers.mlfn_wpen += timeGetTime() - timer ;
553 | 
554 |    penalty *= wpen ;
555 |    return mse + penalty ;
556 | }


--------------------------------------------------------------------------------
/V2 Source/SVDCMP.TXT:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  SVDCMP  -  SingularValueDecomp class for computing the singular value     */
  4 | /*             decomposition of a rectangular matrix having at least as many  */
  5 | /*             rows as columns.                                               */
  6 | /*             This also includes a back-substitution routine for computing   */
  7 | /*             solutions to linear systems.                                   */
  8 | /*                                                                            */
  9 | /******************************************************************************/
 10 | 
 11 | 
 12 | /*
 13 | --------------------------------------------------------------------------------
 14 | 
 15 |    SingularValueDecomp - Singular value decomposition
 16 | 
 17 |    The following steps are needed to compute a least-squares solution
 18 |    to a (possibly overdetermined) linear system:
 19 |      1) Create a SingularValueDecomp object.  The constructor will allocate
 20 |         memory for the design matrix 'a', the right-hand-side 'b', and all
 21 |         scratch memory that it needs.  Optionally, the user can flag the
 22 |         constructor to preserve 'a' and return the decomposition in 'u'.
 23 |         Normally, 'a' is overwritten.
 24 |      2) The design matrix must be placed in 'a' and svdcmp called.
 25 |      3) Place the right-hand-side in 'b'
 26 |      4) Allocate a vector where the solution is to be placed.
 27 |         Call backsub with a pointer to this vector.
 28 | 
 29 | --------------------------------------------------------------------------------
 30 | */
 31 | 
 32 | inline double root_ss ( double x , double y )
 33 | {
 34 |    // Length of the hypotenuse having legs x and y.  The smaller magnitude
 35 |    // is divided by the larger, and only that quotient gets squared.
 36 |    const double ax = (x < 0.0)  ?  -x  :  x ;
 37 |    const double ay = (y < 0.0)  ?  -y  :  y ;
 38 |    const bool x_is_larger = ax > ay ;
 39 |    const double larger = x_is_larger  ?  ax  :  ay ;
 40 |    const double lesser = x_is_larger  ?  ay  :  ax ;
 41 |    double quotient ;
 42 | 
 43 |    // When ax does not exceed ay and ay is zero there is nothing to divide by
 44 |    if (! x_is_larger  &&  ay == 0.0)
 45 |       return 0.0 ;
 46 | 
 47 |    quotient = lesser / larger ;
 48 |    return larger * sqrt ( quotient * quotient + 1.0 ) ;
 49 | }
 50 | 
 51 | /*
 52 | --------------------------------------------------------------------------------
 53 | 
 54 |    Constructor - Allocate input/output and scratch memory.
 55 |       nr, nc - Rows and columns of the design matrix; nc may not exceed nr
 56 |       save_a - If nonzero, 'u' is also allocated so that 'a' can be preserved
 57 |    Normally, this returns ok=1.  If not, the user called it with more columns
 58 |    than rows (or a negative count), or there was insufficient memory.  In that
 59 |    case rows=cols=ok=0, no memory is kept, and every array pointer is NULL.
 60 | 
 61 | --------------------------------------------------------------------------------
 62 | */
 63 | 
 64 | SingularValueDecomp::SingularValueDecomp ( int nr , int nc , int save_a )
 65 | {
 66 |    size_t n_elem ;
 67 | 
 68 |    a = w = v = b = work = u = NULL ;  // No exit path may leave a wild pointer
 69 |    rows = cols = ok = 0 ;             // Failure is assumed until the very end
 70 | 
 71 |    if (nc > nr  ||  nc < 0)           // Illegal
 72 |       return ;
 73 | 
 74 |    // Form the sizes in size_t; as an int product, nr * nc could overflow
 75 |    n_elem = (size_t) nr * (size_t) nc ;
 76 | 
 77 |    a = (double *) malloc ( n_elem * sizeof(double) ) ;
 78 |    w = (double *) malloc ( (size_t) nc * sizeof(double) ) ;
 79 |    v = (double *) malloc ( (size_t) nc * (size_t) nc * sizeof(double) ) ;
 80 |    b = (double *) malloc ( (size_t) nr * sizeof(double) ) ;
 81 |    work = (double *) malloc ( (size_t) nc * sizeof(double) ) ;
 82 |    if (save_a)
 83 |       u = (double *) malloc ( n_elem * sizeof(double) ) ;
 84 | 
 85 |    if ((a == NULL)  ||  (w == NULL)  ||  (v == NULL)  ||  (b == NULL)  ||
 86 |        (work == NULL)  ||  (save_a && (u == NULL))) {
 87 |       free ( a ) ;       // free() ignores NULL, so each may be passed as is
 88 |       free ( w ) ;
 89 |       free ( v ) ;
 90 |       free ( b ) ;
 91 |       free ( work ) ;
 92 |       free ( u ) ;
 93 |       a = w = v = b = work = u = NULL ;
 94 | //      MEMTEXT ( "ERROR: SingularValueDecomp failed" ) ;
 95 |       return ;
 96 |       }
 97 | 
 98 |    // Everything was allocated
 99 |    ok = 1 ;       // Flag to user that all went well
100 |    rows = nr ;
101 |    cols = nc ;
102 | }
103 | 
104 | /*
105 | --------------------------------------------------------------------------------
106 | 
107 |    Destructor - Release everything the constructor allocated.
108 |                 When ok is zero the constructor kept nothing, so do nothing.
109 | --------------------------------------------------------------------------------
110 | */
111 | 
112 | SingularValueDecomp::~SingularValueDecomp ()
113 | {
114 |    if (ok) {
115 |       double *owned[5] = { a , w , v , b , work } ;   // Always allocated together
116 |       int k ;
117 | 
118 |       for (k=0 ; k<5 ; k++)
119 |          free ( owned[k] ) ;
120 | 
121 |       if (u)             // Present only if the copy of 'a' was requested
122 |          free ( u ) ;
123 |       }
124 | }
125 | 
126 | 
127 | /*
128 | --------------------------------------------------------------------------------
129 |    svdcmp - Singular value decomposition of 'a'
130 |             Operates on 'u' (a copy of 'a') when 'u' exists, else on 'a' itself.
131 |             Each singular value gets at most 50 passes; a converged w is made >= 0.
132 | --------------------------------------------------------------------------------
133 | */
134 | 
135 | void SingularValueDecomp::svdcmp ()
136 | {
137 |    int sv, cut, pass, row ;
138 |    double *target = a ;      // Decompose 'a' in place unless 'u' exists
139 | 
140 |    if (u != NULL) {          // 'a' must survive: copy it and use the copy
141 |       memcpy ( u , a , rows * cols * sizeof(double) ) ;
142 |       target = u ;
143 |       }
144 | 
145 |    bidiag ( target ) ;       // Reduce to bidiagonal
146 |    right ( target ) ;        // Accumulate right transforms
147 |    left ( target ) ;         // And left
148 | 
149 |    for (sv=cols-1 ; sv>=0 ; sv--) {       // Singular values in reverse order
150 |       for (pass=0 ; pass<50 ; pass++) {   // Avoid nearly endless loop (very rare!)
151 | 
152 |          // Step 'cut' down from sv until a term vanishes next to norm;
153 |          // cut reaches 0 if none does.
154 |          for (cut=sv ; cut>0 ; cut--) {
155 |             if (norm + fabs (work[cut]) == norm)
156 |                break ;
157 |             if (norm + fabs (w[cut-1]) == norm) {
158 |                cancel ( cut , sv , target ) ;
159 |                break ;
160 |                }
161 |             }
162 | 
163 |          if (cut == sv) {                 // Converged?
164 |             if (w[sv] < 0.0) {            // Keep them nonnegative
165 |                w[sv] = -w[sv] ;
166 |                for (row=0 ; row<cols ; row++)
167 |                   v[row*cols+sv] = -v[row*cols+sv] ;
168 |                }
169 |             break ;
170 |             }
171 | 
172 |          // Not yet: do another qr pass over cut..sv
173 |          qr ( cut , sv , target ) ;
174 |          }
175 |       }
176 | }
177 | 
178 | /*
179 | --------------------------------------------------------------------------------
180 |    bidiag - Householder reduction to bidiagonal
181 |             For each column this does a column step (bid1) then a row step
182 |             (bid2), filling w[] and work[] and leaving max |w|+|work| in norm.
183 | --------------------------------------------------------------------------------
184 | */
185 | 
186 | void SingularValueDecomp::bidiag ( double *matrix )
187 | {
188 |    int col, k ;
189 |    double col_sum, row_sum, row_result, bound ;
190 | 
191 |    norm = 0.0 ;
192 |    row_sum = row_result = 0.0 ;    // Nothing precedes the first column
193 | 
194 |    for (col=0 ; col<cols ; col++) {
195 | 
196 |       // Carry over the product left by the previous column's row step
197 |       work[col] = row_sum * row_result ;
198 | 
199 |       // Column step: absolute sum on and below the diagonal
200 |       col_sum = 0.0 ;
201 |       for (k=col ; k<rows ; k++)
202 |          col_sum += fabs ( matrix[k*cols+col] ) ;
203 |       w[col] = (col_sum > 0.0)
204 |              ?  col_sum * bid1 ( col , matrix , col_sum )  :  0.0 ;
205 | 
206 |       // Row step: absolute sum to the right of the diagonal
207 |       row_sum = 0.0 ;
208 |       for (k=col+1 ; k<cols ; k++)
209 |          row_sum += fabs ( matrix[col*cols+k] ) ;
210 |       row_result = (row_sum > 0.0)
211 |                  ?  bid2 ( col , matrix , row_sum )  :  0.0 ;
212 | 
213 |       // norm tracks the largest |w| + |work| over all columns
214 |       // (svdcmp and cancel compare small terms against it)
215 |       bound = fabs (w[col]) + fabs (work[col]) ;
216 |       if (bound > norm)
217 |          norm = bound ;
218 |       }
219 | }
220 | 
221 | double SingularValueDecomp::bid1 ( int col , double *matrix , double scale )
222 | {
223 |    // Column step of bidiag for column 'col'; 'scale' is the caller's sum of
224 |    // absolute values.  Returns the signed root of the scaled sum of squares.
225 |    int r, c ;
226 |    double ssq, elem, head, root, recip, dot ;
227 | 
228 |    ssq = 0.0 ;
229 |    for (r=col ; r<rows ; r++) {          // Divide by scale, sum the squares
230 |       matrix[r*cols+col] /= scale ;
231 |       elem = matrix[r*cols+col] ;
232 |       ssq += elem * elem ;
233 |       }
234 | 
235 |    head = matrix[col*cols+col] ;
236 |    root = (head > 0.0)  ?  -sqrt ( ssq )  :  sqrt ( ssq ) ;  // Negative if head > 0
237 |    recip = 1.0 / (head * root - ssq) ;
238 |    matrix[col*cols+col] = head - root ;
239 | 
240 |    for (c=col+1 ; c<cols ; c++) {        // Adjust every column to the right
241 |       for (dot=0.0, r=col ; r<rows ; r++)
242 |          dot += matrix[r*cols+col] * matrix[r*cols+c] ;
243 |       dot *= recip ;
244 |       for (r=col ; r<rows ; r++)
245 |          matrix[r*cols+c] += dot * matrix[r*cols+col] ;
246 |       }
247 | 
248 |    for (r=col ; r<rows ; r++)            // Multiply the column back by scale
249 |       matrix[r*cols+col] *= scale ;
250 |    return root ;
251 | }
252 | 
253 | double SingularValueDecomp::bid2 ( int col , double *matrix , double scale )
254 | {
255 |    // Row step of bidiag for row 'col', right of the diagonal; 'scale' is the
256 |    // caller's sum of absolute values.  work[col+1..cols-1] is overwritten.
257 |    int r, c ;
258 |    double ssq, elem, head, root, recip, dot ;
259 |    double *pivot = matrix + col * cols, *below ;   // pivot[c] is element (col,c)
260 |    ssq = 0.0 ;
261 |    for (c=col+1 ; c<cols ; c++) {          // Divide by scale, sum the squares
262 |       pivot[c] /= scale ;
263 |       elem = pivot[c] ;
264 |       ssq += elem * elem ;
265 |       }
266 | 
267 |    head = pivot[col+1] ;
268 |    root = (head > 0.0)  ?  -sqrt ( ssq )  :  sqrt ( ssq ) ;  // Negative if head > 0
269 |    pivot[col+1] = head - root ;
270 |    recip = 1.0 / (head * root - ssq) ;
271 |    for (c=col+1 ; c<cols ; c++)
272 |       work[c] = recip * pivot[c] ;
273 | 
274 |    for (r=col+1 ; r<rows ; r++) {          // Adjust every row below
275 |       below = matrix + r * cols ;
276 |       for (dot=0.0, c=col+1 ; c<cols ; c++)
277 |          dot += below[c] * pivot[c] ;
278 |       for (c=col+1 ; c<cols ; c++)
279 |          below[c] += dot * work[c] ;
280 |       }
281 |    for (c=col+1 ; c<cols ; c++)            // Multiply the row back by scale
282 |       pivot[c] *= scale ;
283 |    return root ;
284 | }
285 | 
286 | 
287 | /*
288 | --------------------------------------------------------------------------------
289 |    Cumulate right and left transforms
290 |    right() fills 'v' from the rows of 'matrix' and from 'work', last column
291 |    first; left() then rewrites 'matrix' in place, column by column, using 'w'.
292 | --------------------------------------------------------------------------------
293 | */
294 | 
295 | void SingularValueDecomp::right ( double *matrix )
296 | {
297 |    int col, i, j ;
298 |    double temp, denom, sum ;
299 | 
300 |    denom = 0.0 ;              // Makes the first pass (last column) skip the update
301 |    col = cols ;
302 |    while (col--) {            // col runs from cols-1 down to 0
303 |       if (denom != 0.0) {     // denom is work[col+1], saved by the previous pass
304 |          temp = 1.0 / matrix[col*cols+col+1] ;
305 |          for (i=col+1 ; i<cols ; i++)  // Double division avoids underflow
306 |             v[i*cols+col] = temp * matrix[col*cols+i] / denom ;
307 |          for (i=col+1 ; i<cols ; i++) {   // Update columns col+1.. of v
308 |             sum = 0.0 ;
309 |             for (j=col+1 ; j<cols ; j++)
310 |                sum += v[j*cols+i] * matrix[col*cols+j] ;
311 |             for (j=col+1 ; j<cols ; j++)
312 |                v[j*cols+i] += sum * v[j*cols+col] ;  // Uses the column filled just above
313 |             }
314 |          }
315 | 
316 |       denom = work[col] ;     // Kept for the next pass, which handles col-1
317 | 
318 |       for (i=col+1 ; i<cols ; i++)       // Clear row and column col past the diagonal
319 |          v[col*cols+i] = v[i*cols+col] = 0.0 ;
320 |       v[col*cols+col] = 1.0 ;            // and put 1 on the diagonal
321 |       }
322 | }
323 | // left - Rewrites 'matrix' in place, last column first, using w[col] per column
324 | void SingularValueDecomp::left ( double *matrix )
325 | {
326 |    int col, i, j ;
327 |    double temp, fac, sum ;
328 | 
329 |    col = cols ;
330 |    while (col--) {            // col runs from cols-1 down to 0
331 | 
332 |       for (i=col+1 ; i<cols ; i++)       // Clear row col to the right of the diagonal
333 |          matrix[col*cols+i] = 0.0 ;
334 | 
335 |       if (w[col] == 0.0) {               // Cannot divide by w[col]: clear the column
336 |          for (i=col ; i<rows ; i++)
337 |             matrix[i*cols+col] = 0.0 ;
338 |          }
339 | 
340 |       else {
341 |          fac = 1.0 / w[col] ;
342 |          temp = fac / matrix[col*cols+col]  ;   // NOTE(review): diagonal is not tested for zero
343 | 
344 |          for (i=col+1 ; i<cols ; i++) {  // Update each column to the right of col
345 |             sum = 0.0 ;
346 |             for (j=col+1 ; j<rows ; j++)       // Sum starts below the diagonal...
347 |                sum += matrix[j*cols+col] * matrix[j*cols+i] ;
348 |             sum *= temp ;
349 |             for (j=col ; j<rows ; j++)         // ...but the update includes it
350 |                matrix[j*cols+i] += sum * matrix[j*cols+col] ;
351 |             }
352 |          for (i=col ; i<rows ; i++)      // Scale column col by 1/w[col]
353 |             matrix[i*cols+col] *= fac ;
354 |          }
355 | 
356 |       matrix[col*cols+col] += 1.0 ;      // Done in both branches
357 |       }
358 | }
359 | 
360 | 
361 | /*
362 | --------------------------------------------------------------------------------
363 |    cancel - Called by svdcmp when w[low-1] is negligible next to norm.
364 |             Rotates column low-1 of 'matrix' against columns low..high,
365 |             replacing w[col] by root_ss of itself and a multiple of work[col].
366 | --------------------------------------------------------------------------------
367 | */
368 | 
369 | void SingularValueDecomp::cancel (
370 |    int low ,
371 |    int high ,
372 |    double *matrix
373 |    )
374 | {
375 |    const int partner = low - 1 ;   // Fixed second column of every rotation
376 |    int col, row ;
377 |    double s, c, f, g, h, at_col, at_partner, *rowptr ;
378 | 
379 |    s = 1.0 ;
380 |    for (col=low ; col<=high ; col++) {
381 |       f = s * work[col] ;
382 |       if (fabs (f) + norm == norm)    // Vanishes next to norm: no rotation
383 |          continue ;
384 |       g = w[col] ;
385 |       h = root_ss ( f , g ) ;
386 |       w[col] = h ;
387 |       s = -f / h ;
388 |       c = g / h ;
389 |       for (row=0 ; row<rows ; row++) {
390 |          rowptr = matrix + row * cols ;
391 |          at_col = rowptr[col] ;
392 |          at_partner = rowptr[partner] ;
393 |          rowptr[col] = at_col * c  -  at_partner * s ;
394 |          rowptr[partner] = at_col * s  +  at_partner * c ;
395 |          }
396 |       }
397 | }
398 | 
399 | /*
400 | --------------------------------------------------------------------------------
401 |    qr - One pass over columns low..high; svdcmp repeats it until convergence.
402 |         Updates w[low..high] and work[low..high], rotating the columns of 'v'
403 |         (qr_vrot) and of 'matrix' (qr_mrot) along the way.
404 | --------------------------------------------------------------------------------
405 | */
406 | 
407 | void SingularValueDecomp::qr (
408 |    int low ,
409 |    int high ,
410 |    double *matrix )
411 | {
412 |    int col ;
413 |    double sine, cosine, wk, tx, ty, x, y, svhypot, temp, ww, wh, wkh, whm1, wkhm1;
414 | 
415 |    wh = w[high] ;            // The starting value of wk is built from the last
416 |    whm1 = w[high-1] ;        // two entries of w and of work, plus w[low]
417 |    wkh = work[high] ;
418 |    wkhm1 = work[high-1] ;
419 |    temp = 2.0 * wkh * whm1 ;
420 |    if (temp != 0.0)          // Guard the division
421 |       temp = ((whm1+wh) * (whm1-wh) + (wkhm1+wkh) * (wkhm1-wkh)) / temp ;
422 |    else
423 |       temp = 0.0 ;
424 | 
425 |    svhypot = root_ss ( temp , 1.0 ) ;
426 |    if (temp < 0.0)           // Give svhypot the sign of temp
427 |       svhypot = -svhypot ;
428 | 
429 |    ww = w[low] ;
430 |    wk = wkh * (whm1 / (temp + svhypot) - wkh)  +  (ww+wh) * (ww-wh) ;
431 |    if (ww != 0.0)            // Guard the division
432 |       wk /= ww ;
433 |    else
434 |       wk = 0.0 ;
435 | 
436 |    sine = cosine = 1.0 ;
437 | 
438 |    for (col=low ; col<high ; col++) {   // Each step works on the pair (col, col+1)
439 |       x = work[col+1] ;
440 |       ty = sine * x ;
441 |       x *= cosine ;
442 |       svhypot = root_ss ( wk , ty ) ;
443 |       work[col] = svhypot ;
444 |       cosine = wk / svhypot ;           // NOTE(review): svhypot is not tested for zero here
445 |       sine = ty / svhypot ;
446 |       tx = ww * cosine  +  x * sine ;
447 |       x = x * cosine  -  ww * sine ;
448 |       y = w[col+1] ;
449 |       ty = y * sine ;
450 |       y *= cosine ;
451 |       qr_vrot ( col , sine , cosine ) ; // Rotate columns col, col+1 of v
452 |       w[col] = svhypot = root_ss ( tx , ty ) ;
453 |       if (svhypot != 0.0) {             // If zero, the sine/cosine above are reused
454 |          cosine = tx / svhypot ;
455 |          sine = ty / svhypot ;
456 |          }
457 |       qr_mrot ( col , sine , cosine , matrix ) ;  // Rotate columns col, col+1 of matrix
458 |       wk = cosine * x  +  sine * y ;    // Carried into the next step
459 |       ww = cosine * y  -  sine * x ;
460 |       }
461 |    work[low] = 0.0 ;
462 |    work[high] = wk ;                    // Values left over from the final step
463 |    w[high] = ww ;
464 | }
465 | 
466 | void SingularValueDecomp::qr_vrot ( int col , double sine , double cosine )
467 | {
468 |    // Rotate columns col and col+1 of v (cols rows) by the given sine, cosine
469 |    int k ;
470 |    double first, second, *pair ;
471 |    for (k=0 ; k<cols ; k++) {
472 |       pair = v + k * cols + col ;     // Row k: entries in columns col, col+1
473 |       first = pair[0] ;
474 |       second = pair[1] ;
475 |       pair[0] = first * cosine  +  second * sine ;
476 |       pair[1] = second * cosine  -  first * sine ;
477 |       }
478 | }
479 | 
480 | void SingularValueDecomp::qr_mrot ( int col , double sine , double cosine ,
481 |                                     double *matrix )
482 | {
483 |    // Rotate columns col and col+1 of 'matrix' across all of its 'rows' rows
484 |    int k ;
485 |    double first, second, *pair ;
486 |    for (k=0 ; k<rows ; k++) {
487 |       pair = matrix + k * cols + col ;   // Row k: entries in columns col, col+1
488 |       first = pair[0] ;
489 |       second = pair[1] ;
490 |       pair[0] = first * cosine  +  second * sine ;
491 |       pair[1] = second * cosine  -  first * sine ;
492 |       }
493 | }
494 | 
495 | /*
496 | --------------------------------------------------------------------------------
497 | 
498 |    Backsubstitution algorithm for solving Ax=b where A generated u, w, v
499 |    Inputs are not destroyed, so it may be called with several b's.
500 |    The user must have filled in the public RHS 'b' before calling this.
501 |    'work' is used as scratch.  With cols==0 nothing is read or written.
502 | --------------------------------------------------------------------------------
503 | */
504 | 
505 | void SingularValueDecomp::backsub (
506 |    double limit ,  // SV limit (about sqrt machine precision is good)
507 |    double *soln    // Output: solution
508 |    )
509 | {
510 |    int i, j ;
511 |    double sum, wmax, *matrix ;
512 | 
513 |    if (u != NULL)    // If we preserved 'a', use 'u'
514 |       matrix = u ;
515 |    else              // Else 'u' is in 'a'
516 |       matrix = a ;
517 | 
518 | /*
519 |    Scale the threshold to make it relative to the largest singular value.
520 |    wmax always gets a defined value, so cols==0 reads nothing uninitialized.
521 | */
522 |    wmax = (cols > 0)  ?  w[0]  :  0.0 ;
523 |    for (i=1 ; i<cols ; i++) {
524 |       if (w[i] > wmax)
525 |          wmax = w[i] ;
526 |       }
527 |    limit = limit * wmax  +  1.e-60 ;
528 | 
529 | /*
530 |    Find U'b
531 | */
532 | 
533 |    for (i=0 ; i<cols ; i++) {
534 |       sum = 0.0 ;
535 |       if (w[i] > limit) {          // Singular values at or below limit contribute 0
536 |          for (j=0 ; j<rows ; j++)
537 |             sum += matrix[j*cols+i] * b[j] ;
538 |          sum /= w[i] ;
539 |          }
540 |       work[i] = sum ;
541 |       }
542 | 
543 | /*
544 |    Multiply by V to complete the solution
545 | */
546 | 
547 |    for (i=0 ; i<cols ; i++) {
548 |       sum = 0.0 ;
549 |       for (j=0 ; j<cols ; j++)
550 |          sum += v[i*cols+j] * work[j] ;
551 |       soln[i] = sum ;
552 |       }
553 | }
554 | 
555 | 
556 | #if 0
557 | /*
558 | --------------------------------------------------------------------------------
559 |    Optional main to test it (disabled by the surrounding #if 0)
560 |    Usage: test rows cols reps.  Each rep fills 'a' and 'b', decomposes, and
561 |    prints the w range plus the Rep, Orthog, Back (square only) and Save errors.
562 | --------------------------------------------------------------------------------
563 | */
564 | 
565 | #define RANDMAX 32767   // presumably the library's RAND_MAX - TODO confirm
566 | 
567 | void main ( int argc , char *argv[] )
568 | {
569 |    int rep, m, n, i, j, k, reps ;
570 |    double *x, *sa, sum, err, wmin, wmax ;
571 |    SingularValueDecomp *s ;
572 | 
573 |    if (argc != 4) {
574 |       printf ( "\nUSAGE: test rows cols reps" ) ;
575 |       exit ( 0 ) ;
576 |       }
577 | 
578 |    m = atoi ( argv[1] ) ;      // Rows
579 |    n = atoi ( argv[2] ) ;      // Columns
580 |    reps = atoi ( argv[3] ) ;   // Number of test matrices
581 | 
582 |    if (m <= 0  ||  n <= 0  ||  reps <= 0)
583 |       exit ( 0 ) ;
584 | 
585 |    sa = (double *) malloc ( m * n * sizeof(double) ) ;   // Saved copy of each test matrix
586 |    x = (double *) malloc ( n * sizeof(double) ) ;        // Solution from backsub
587 |    s = new SingularValueDecomp ( m , n , 1 ) ;           // save_a=1: 'a' preserved, result in 'u'
588 | 
589 |    if (! s->ok) {
590 |       printf ( "\nError" ) ;
591 |       exit ( 1 ) ;
592 |       }
593 | 
594 |    for (rep=0 ; rep < reps ; rep++) {
595 | 
596 |       if (_kbhit()) {             // A pending key equal to 27 ends the run
597 |          if (_getch() == 27)
598 |             exit ( 0 ) ;
599 |          }
600 | 
601 |       if ((m == n)  &&  ! rep) {  // Ill cond
602 |          for (i=0 ; i<m ; i++) {
603 |             for (j=0 ; j<n ; j++)
604 |                sa[i*n+j] = s->a[i*n+j] = 1.0 / (i + j + 1.0) ;
605 |             s->b[i] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
606 |             }
607 |          }
608 |       else {                      // Random matrix; past column 100 some columns are degenerate
609 |          for (i=0 ; i<m ; i++) {
610 |             for (j=0 ; j<n ; j++) {
611 |                if (j > 100  &&  j % 10 == 0)        // A column of zeros
612 |                   s->a[i*n+j] = 0.0 ;
613 |                else if (j > 100  &&  j % 10 == 5)   // Sum of the two columns to its left
614 |                   s->a[i*n+j] = s->a[i*n+j-1] + s->a[i*n+j-2] ;
615 |                else
616 |                   s->a[i*n+j] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
617 |                sa[i*n+j] = s->a[i*n+j] ;
618 |                }
619 |             s->b[i] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
620 |             }
621 |          }
622 | 
623 |       s->svdcmp () ;
624 | 
625 |       wmin = 1.e30 ;              // Find the smallest and largest singular values
626 |       wmax = -1.e30 ;
627 |       for (i=0 ; i<n ; i++) {
628 |          if (s->w[i] < wmin)
629 |             wmin = s->w[i] ;
630 |          if (s->w[i] > wmax)
631 |             wmax = s->w[i] ;
632 |          }
633 | 
634 |       printf ( "\n%d %d (%.2le %.2le)", m, n, wmin, wmax ) ;
635 | 
636 |       err = 0.0 ;                 // Rep: sum of |u * diag(w) * v' - saved matrix|
637 |       for (i=0 ; i<m ; i++) {
638 |          for (j=0 ; j<n ; j++) {
639 |             sum = 0.0 ;
640 |             for (k=0 ; k<n ; k++)
641 |                sum += s->u[i*n+k] * s->w[k] * s->v[j*n+k] ;
642 |             err += fabs ( sum - sa[i*n+j] ) ;
643 |             }
644 |          }
645 | 
646 |       printf ( " Rep=%.8lf", err ) ;
647 | //      if (fabs(err) > 1.e-10) {
648 | //         printf ( "\a" ) ;
649 | //         _getch() ;
650 | //         }
651 | 
652 |       err = 0.0 ;                 // Orthog: departure of u'u and of v'v from the identity
653 |       for (i=0 ; i<n ; i++) {
654 |          for (j=0 ; j<n ; j++) {
655 |             sum = 0.0 ;
656 |             for (k=0 ; k<m ; k++)
657 |                sum += s->u[k*n+i] * s->u[k*n+j] ;
658 |             if (i == j)
659 |                err += fabs ( sum - 1.0 ) ;
660 |             else 
661 |                err += fabs ( sum ) ;
662 |             }
663 |          for (j=0 ; j<n ; j++) {
664 |             sum = 0.0 ;
665 |             for (k=0 ; k<n ; k++)
666 |                sum += s->v[k*n+i] * s->v[k*n+j] ;
667 |             if (i == j)
668 |                err += fabs ( sum - 1.0 ) ;
669 |             else 
670 |                err += fabs ( sum ) ;
671 |             }
672 |          }
673 |       printf ( " Orthog=%.8lf", err ) ;
674 | //      if (fabs(err) > 1.e-10) {
675 | //         printf ( "\a" ) ;
676 | //         _getch() ;
677 | //         }
678 | 
679 |       if (m == n) {               // Back: residual of the backsub solution, square case only
680 |          s->backsub ( 1.e-8 , x ) ;
681 |          err = 0.0 ;
682 | 
683 |          for (i=0 ; i<m ; i++) {
684 |             sum = 0.0 ;
685 |             for (j=0 ; j<n ; j++)
686 |                sum += x[j] * sa[i*n+j] ;
687 |             err += fabs ( sum - s->b[i] ) ;
688 |             }
689 | 
690 |          printf ( " Back=%.8lf", err ) ;
691 | //         if (fabs(err) > 1.e-10) {
692 | //            printf ( "\a" ) ;
693 | //            _getch() ;
694 | //            }
695 |          }
696 | 
697 |       err = 0.0 ;                 // Save: confirms that 'a' still equals the saved copy
698 |       for (i=0 ; i<m ; i++) {
699 |          for (j=0 ; j<n ; j++)
700 |             err += fabs ( sa[i*n+j] - s->a[i*n+j] ) ;
701 |          }
702 |       printf ( " Save=%.8lf", err ) ;
703 | //      if (fabs(err) > 1.e-10) {
704 | //         printf ( "\a" ) ;
705 | //         _getch() ;
706 | //         }
707 |       }
708 | 
709 |    free ( sa ) ;
710 |    free ( x ) ;
711 |    delete s ;
712 | }
713 | #endif
714 | 


--------------------------------------------------------------------------------
/V1 Source/SVDCMP.CPP:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  SVDCMP  -  SingularValueDecomp class for computing the singular value     */
  4 | /*             decomposition of a rectangular matrix having at least as many  */
  5 | /*             rows as columns.                                               */
  6 | /*             This also includes a back-substitution routine for computing   */
  7 | /*             solutions to linear systems.                                   */
  8 | /*                                                                            */
  9 | /*  This is based on the implementation in Press "Numerical Recipes."         */
 10 | /*                                                                            */
 11 | /******************************************************************************/
 12 | 
 13 | #define STRICT
 14 | #include <windows.h>
 15 | #include <commctrl.h>
 16 | #include <assert.h>
 17 | #include <stdlib.h>
 18 | #include <stdio.h>
 19 | #include <math.h>
 20 | #include <string.h>
 21 | #include <ctype.h>
 22 | #include <malloc.h>
 23 | #include <new.h>
 24 | #include <float.h>
 25 | #include <process.h>
 26 | 
 27 | #include "deep.rh"
 28 | #include "const.h"
 29 | #include "classes.h"
 30 | #include "extern.h"
 31 | #include "funcdefs.h"
 32 | 
 33 | /*
 34 | --------------------------------------------------------------------------------
 35 | 
 36 |    SingularValueDecomp - Singular value decomposition
 37 | 
 38 |    The following steps are needed to compute a least-squares solution
 39 |    to a (possibly overdetermined) linear system:
 40 |      1) Create a SingularValueDecomp object.  The constructor will allocate
 41 |         memory for the design matrix 'a', the right-hand-side 'b', and all
 42 |         scratch memory that it needs.  Optionally, the user can flag the
 43 |         constructor to preserve 'a' and return the decomposition in 'u'.
 44 |         Normally, 'a' is overwritten.
 45 |      2) The design matrix must be placed in 'a' and svdcmp called.
 46 |      3) Place the right-hand-side in 'b'
 47 |      4) Allocate a vector where the solution is to be placed.
 48 |         Call backsub with a pointer to this vector.
 49 | 
 50 | --------------------------------------------------------------------------------
 51 | */
 52 | 
 53 | inline double root_ss ( double x , double y )
 54 | {
 55 |    // Length of the hypotenuse having legs x and y.  The smaller magnitude
 56 |    // is divided by the larger, and only that quotient gets squared.
 57 |    const double ax = (x < 0.0)  ?  -x  :  x ;
 58 |    const double ay = (y < 0.0)  ?  -y  :  y ;
 59 |    const bool x_is_larger = ax > ay ;
 60 |    const double larger = x_is_larger  ?  ax  :  ay ;
 61 |    const double lesser = x_is_larger  ?  ay  :  ax ;
 62 |    double quotient ;
 63 | 
 64 |    // When ax does not exceed ay and ay is zero there is nothing to divide by
 65 |    if (! x_is_larger  &&  ay == 0.0)
 66 |       return 0.0 ;
 67 | 
 68 |    quotient = lesser / larger ;
 69 |    return larger * sqrt ( quotient * quotient + 1.0 ) ;
 70 | }
 71 | 
 72 | /*
 73 | --------------------------------------------------------------------------------
 74 |    Constructor - Allocate input/output and scratch memory.
 75 |       nr, nc - Rows and columns of the design matrix; nc may not exceed nr
 76 |       save_a - If nonzero, 'u' is also allocated so that 'a' can be preserved
 77 |    Normally ok=1.  On an illegal shape or insufficient memory ok=rows=cols=0,
 78 |    no memory is kept, and every array pointer is NULL.
 79 | --------------------------------------------------------------------------------
 80 | */
 81 | 
 82 | SingularValueDecomp::SingularValueDecomp ( int nr , int nc , int save_a )
 83 | {
 84 |    size_t n_elem ;
 85 | 
 86 |    a = w = v = b = work = u = NULL ;  // No exit path may leave a wild pointer
 87 |    rows = cols = ok = 0 ;             // Failure is assumed until the very end
 88 |    if (nc > nr  ||  nc < 0)           // Illegal
 89 |       return ;
 90 | 
 91 |    // Form the sizes in size_t; as an int product, nr * nc could overflow
 92 |    n_elem = (size_t) nr * (size_t) nc ;
 93 |    a = (double *) memallocX ( n_elem * sizeof(double) ) ;
 94 |    w = (double *) memallocX ( (size_t) nc * sizeof(double) ) ;
 95 |    v = (double *) memallocX ( (size_t) nc * (size_t) nc * sizeof(double) ) ;
 96 |    b = (double *) memallocX ( (size_t) nr * sizeof(double) ) ;
 97 |    work = (double *) memallocX ( (size_t) nc * sizeof(double) ) ;
 98 |    if (save_a)
 99 |       u = (double *) memallocX ( n_elem * sizeof(double) ) ;
100 | 
101 |    if ((a == NULL)  ||  (w == NULL)  ||  (v == NULL)  ||  (b == NULL)  ||
102 |        (work == NULL)  ||  (save_a && (u == NULL))) {
103 |       if (a != NULL)
104 |          memfreeX ( a ) ;
105 |       if (w != NULL)
106 |          memfreeX ( w ) ;
107 |       if (v != NULL)
108 |          memfreeX ( v ) ;
109 |       if (b != NULL)
110 |          memfreeX ( b ) ;
111 |       if (work != NULL)
112 |          memfreeX ( work ) ;
113 |       if (u != NULL)
114 |          memfreeX ( u ) ;
115 |       a = w = v = b = work = u = NULL ;   // Nothing is left dangling
116 |       return ;
117 |       }
118 | 
119 |    ok = 1 ;       // Flag to user that all went well
120 |    rows = nr ;
121 |    cols = nc ;
122 | }
123 | 
124 | /*
125 | --------------------------------------------------------------------------------
126 | 
127 |    Destructor - Release everything the constructor allocated.
128 |                 When ok is zero the constructor kept nothing, so do nothing.
129 | --------------------------------------------------------------------------------
130 | */
131 | 
132 | SingularValueDecomp::~SingularValueDecomp ()
133 | {
134 |    if (ok) {
135 |       double *owned[5] = { a , w , v , b , work } ;   // Always allocated together
136 |       int k ;
137 | 
138 |       for (k=0 ; k<5 ; k++)
139 |          memfreeX ( owned[k] ) ;
140 | 
141 |       if (u)             // Present only if the copy of 'a' was requested
142 |          memfreeX ( u ) ;
143 |       }
144 | }
145 | 
146 | 
147 | /*
148 | --------------------------------------------------------------------------------
149 |    svdcmp - Singular value decomposition of 'a'
150 |             Operates on 'u' (a copy of 'a') when 'u' exists, else on 'a' itself.
151 |             Each singular value gets at most 50 passes; a converged w is made >= 0.
152 | --------------------------------------------------------------------------------
153 | */
154 | 
155 | void SingularValueDecomp::svdcmp ()
156 | {
157 |    int sv, cut, pass, row ;
158 |    double *target = a ;      // Decompose 'a' in place unless 'u' exists
159 | 
160 |    if (u != NULL) {          // 'a' must survive: copy it and use the copy
161 |       memcpy ( u , a , rows * cols * sizeof(double) ) ;
162 |       target = u ;
163 |       }
164 | 
165 |    bidiag ( target ) ;       // Reduce to bidiagonal
166 |    right ( target ) ;        // Accumulate right transforms
167 |    left ( target ) ;         // And left
168 | 
169 |    for (sv=cols-1 ; sv>=0 ; sv--) {       // Singular values in reverse order
170 |       for (pass=0 ; pass<50 ; pass++) {   // Avoid nearly endless loop (very rare!)
171 | 
172 |          // Step 'cut' down from sv until a term vanishes next to norm;
173 |          // cut reaches 0 if none does.
174 |          for (cut=sv ; cut>0 ; cut--) {
175 |             if (norm + fabs (work[cut]) == norm)
176 |                break ;
177 |             if (norm + fabs (w[cut-1]) == norm) {
178 |                cancel ( cut , sv , target ) ;
179 |                break ;
180 |                }
181 |             }
182 | 
183 |          if (cut == sv) {                 // Converged?
184 |             if (w[sv] < 0.0) {            // Keep them nonnegative
185 |                w[sv] = -w[sv] ;
186 |                for (row=0 ; row<cols ; row++)
187 |                   v[row*cols+sv] = -v[row*cols+sv] ;
188 |                }
189 |             break ;
190 |             }
191 | 
192 |          // Not yet: do another qr pass over cut..sv
193 |          qr ( cut , sv , target ) ;
194 |          }
195 |       }
196 | }
197 | 
198 | /*
199 | --------------------------------------------------------------------------------
200 | 
201 |    bidiag - Householder reduction to bidiagonal
202 | 
203 | --------------------------------------------------------------------------------
204 | */
205 | 
206 | void SingularValueDecomp::bidiag ( double *matrix )
207 | {
208 |    int col, k ;
209 |    double temp, testnorm, scale ;
210 | 
211 |    norm = temp = scale = 0.0 ;
212 | 
213 |    for (col=0 ; col<cols ; col++) {
214 | 
215 |       work[col] = scale * temp ;
216 | 
217 |       scale = 0.0 ;
218 |       for (k=col ; k<rows ; k++)
219 |          scale += fabs ( matrix[k*cols+col] ) ;
220 | 
221 |       if (scale > 0.0)
222 |          w[col] = scale * bid1 ( col , matrix , scale ) ;
223 |       else 
224 |          w[col] = 0.0 ;
225 | 
226 |       scale = 0.0 ;
227 |       for (k=col+1 ; k<cols ; k++)
228 |          scale += fabs ( matrix[col*cols+k] ) ;
229 | 
230 |       if (scale > 0.0)
231 |          temp = bid2 ( col , matrix , scale ) ;
232 |       else
233 |          temp = 0.0 ;
234 | 
235 |       testnorm = fabs (w[col]) + fabs (work[col]) ;
236 |       if (testnorm > norm)
237 |          norm = testnorm ;
238 |       }
239 | }
240 | 
241 | double SingularValueDecomp::bid1 ( int col , double *matrix , double scale )
242 | {
243 |    int i, j ;
244 |    double diag, rv, fac, sum ;
245 | 
246 |    sum = 0.0 ;
247 |    for (i=col ; i<rows ; i++) {
248 |       fac = (matrix[i*cols+col] /= scale) ;
249 |       sum += fac * fac ;
250 |       }
251 |    rv = sqrt ( sum ) ;
252 |    diag = matrix[col*cols+col] ;
253 |    if (diag > 0.0)
254 |       rv = -rv ;
255 |    fac = 1.0 / (diag * rv - sum) ;
256 |    matrix[col*cols+col] = diag - rv ;
257 | 
258 |    for (j=col+1 ; j<cols ; j++) {
259 |       sum = 0.0 ;
260 |       for (i=col ; i<rows ; i++)
261 |          sum += matrix[i*cols+col] * matrix[i*cols+j] ;
262 |       sum *= fac ;
263 |       for (i=col ; i<rows ; i++)
264 |          matrix[i*cols+j] += sum * matrix[i*cols+col] ;
265 |       }
266 | 
267 |    for (i=col ; i<rows ; i++)
268 |       matrix[i*cols+col] *= scale ;
269 | 
270 |    return rv ;
271 | }
272 | 
273 | double SingularValueDecomp::bid2 ( int col , double *matrix , double scale )
274 | {
275 |    int i, j ;
276 |    double fac, diag, rv, sum ;
277 | 
278 |    sum = 0.0 ;
279 |    for (i=col+1 ; i<cols ; i++) {
280 |       fac = (matrix[col*cols+i] /= scale) ;
281 |       sum += fac * fac ;
282 |       }
283 | 
284 |    rv = sqrt ( sum ) ;
285 |    diag = matrix[col*cols+col+1] ;
286 |    if (diag > 0.0)
287 |       rv = -rv ;
288 | 
289 |    matrix[col*cols+col+1] = diag - rv ;
290 |    fac = 1.0 / (diag * rv - sum) ;
291 |    for (i=col+1 ; i<cols ; i++)
292 |       work[i] = fac * matrix[col*cols+i] ;
293 | 
294 |    for (j=col+1 ; j<rows ; j++) {
295 |       sum = 0.0 ;
296 |       for (i=col+1 ; i<cols ; i++)
297 |          sum += matrix[j*cols+i] * matrix[col*cols+i] ;
298 |       for (i=col+1 ; i<cols ; i++)
299 |          matrix[j*cols+i] += sum * work[i] ;
300 |       }
301 |    for (i=col+1 ; i<cols ; i++)
302 |       matrix[col*cols+i] *= scale ;
303 |    return rv ;
304 | }
305 | 
306 | 
307 | /*
308 | --------------------------------------------------------------------------------
309 | 
310 |    Cumulate right and left transforms
311 | 
312 | --------------------------------------------------------------------------------
313 | */
314 | 
315 | void SingularValueDecomp::right ( double *matrix )
316 | {
317 |    int col, i, j ;
318 |    double temp, denom, sum ;
319 | 
320 |    denom = 0.0 ;
321 |    col = cols ;
322 |    while (col--) {
323 |       if (denom != 0.0) {
324 |          temp = 1.0 / matrix[col*cols+col+1] ;
325 |          for (i=col+1 ; i<cols ; i++)  // Double division avoids underflow
326 |             v[i*cols+col] = temp * matrix[col*cols+i] / denom ;
327 |          for (i=col+1 ; i<cols ; i++) {
328 |             sum = 0.0 ;
329 |             for (j=col+1 ; j<cols ; j++)
330 |                sum += v[j*cols+i] * matrix[col*cols+j] ;
331 |             for (j=col+1 ; j<cols ; j++)
332 |                v[j*cols+i] += sum * v[j*cols+col] ;
333 |             }
334 |          }
335 | 
336 |       denom = work[col] ;
337 | 
338 |       for (i=col+1 ; i<cols ; i++)
339 |          v[col*cols+i] = v[i*cols+col] = 0.0 ;
340 |       v[col*cols+col] = 1.0 ;
341 |       }
342 | }
343 | 
344 | void SingularValueDecomp::left ( double *matrix )
345 | {
346 |    int col, i, j ;
347 |    double temp, fac, sum ;
348 | 
349 |    col = cols ;
350 |    while (col--) {
351 | 
352 |       for (i=col+1 ; i<cols ; i++)
353 |          matrix[col*cols+i] = 0.0 ;
354 | 
355 |       if (w[col] == 0.0) {
356 |          for (i=col ; i<rows ; i++)
357 |             matrix[i*cols+col] = 0.0 ;
358 |          }
359 | 
360 |       else {
361 |          fac = 1.0 / w[col] ;
362 |          temp = fac / matrix[col*cols+col]  ;
363 | 
364 |          for (i=col+1 ; i<cols ; i++) {
365 |             sum = 0.0 ;
366 |             for (j=col+1 ; j<rows ; j++)
367 |                sum += matrix[j*cols+col] * matrix[j*cols+i] ;
368 |             sum *= temp ;
369 |             for (j=col ; j<rows ; j++)
370 |                matrix[j*cols+i] += sum * matrix[j*cols+col] ;
371 |             }
372 |          for (i=col ; i<rows ; i++)
373 |             matrix[i*cols+col] *= fac ;
374 |          }
375 | 
376 |       matrix[col*cols+col] += 1.0 ;
377 |       }
378 | }
379 | 
380 | 
381 | /*
382 | --------------------------------------------------------------------------------
383 | 
384 |    cancel
385 | 
386 | --------------------------------------------------------------------------------
387 | */
388 | 
389 | void SingularValueDecomp::cancel (
390 |    int low ,
391 |    int high ,
392 |    double *matrix
393 |    )
394 | {
395 |    int col, row, lm1 ;
396 |    double sine, cosine, leg1, leg2, svhypot, y, x, *mpt1, *mpt2 ;
397 | 
398 |    lm1 = low - 1 ;
399 |    sine = 1.0 ;
400 |    for (col=low ; col<=high ; col++) {
401 |       leg1 = sine * work[col] ;
402 |       if (fabs (leg1) + norm != norm) {
403 |          leg2 = w[col] ;
404 |          w[col] = svhypot = root_ss ( leg1 , leg2 ) ;
405 |          sine = -leg1 / svhypot ;
406 |          cosine =  leg2 / svhypot ;
407 |          for (row=0 ; row<rows ; row++) {
408 |             mpt1 = matrix + row * cols + col ;
409 |             mpt2 = matrix + row * cols + lm1 ;
410 |             x = *mpt1 ;
411 |             y = *mpt2 ;
412 |             *mpt1 = x * cosine  -  y * sine ;
413 |             *mpt2 = x * sine  +  y * cosine ;
414 |             }
415 |          }
416 |       }
417 | }
418 | 
419 | /*
420 | --------------------------------------------------------------------------------
421 | 
422 |    qr
423 | 
424 | --------------------------------------------------------------------------------
425 | */
426 | 
427 | void SingularValueDecomp::qr (
428 |    int low ,
429 |    int high ,
430 |    double *matrix )
431 | {
432 |    int col ;
433 |    double sine, cosine, wk, tx, ty, x, y, svhypot, temp, ww, wh, wkh, whm1, wkhm1;
434 | 
435 |    wh = w[high] ;
436 |    whm1 = w[high-1] ;
437 |    wkh = work[high] ;
438 |    wkhm1 = work[high-1] ;
439 |    temp = 2.0 * wkh * whm1 ;
440 |    if (temp != 0.0)
441 |       temp = ((whm1+wh) * (whm1-wh) + (wkhm1+wkh) * (wkhm1-wkh)) / temp ;
442 |    else
443 |       temp = 0.0 ;
444 | 
445 |    svhypot = root_ss ( temp , 1.0 ) ;
446 |    if (temp < 0.0)
447 |       svhypot = -svhypot ;
448 | 
449 |    ww = w[low] ;
450 |    wk = wkh * (whm1 / (temp + svhypot) - wkh)  +  (ww+wh) * (ww-wh) ;
451 |    if (ww != 0.0)
452 |       wk /= ww ;
453 |    else
454 |       wk = 0.0 ;
455 | 
456 |    sine = cosine = 1.0 ;
457 | 
458 |    for (col=low ; col<high ; col++) {
459 |       x = work[col+1] ;
460 |       ty = sine * x ;
461 |       x *= cosine ;
462 |       svhypot = root_ss ( wk , ty ) ;
463 |       work[col] = svhypot ;
464 |       cosine = wk / svhypot ;
465 |       sine = ty / svhypot ;
466 |       tx = ww * cosine  +  x * sine ;
467 |       x = x * cosine  -  ww * sine ;
468 |       y = w[col+1] ;
469 |       ty = y * sine ;
470 |       y *= cosine ;
471 |       qr_vrot ( col , sine , cosine ) ;
472 |       w[col] = svhypot = root_ss ( tx , ty ) ;
473 |       if (svhypot != 0.0) {
474 |          cosine = tx / svhypot ;
475 |          sine = ty / svhypot ;
476 |          }
477 |       qr_mrot ( col , sine , cosine , matrix ) ;
478 |       wk = cosine * x  +  sine * y ;
479 |       ww = cosine * y  -  sine * x ;
480 |       }
481 |    work[low] = 0.0 ;
482 |    work[high] = wk ;
483 |    w[high] = ww ;
484 | }
485 | 
486 | void SingularValueDecomp::qr_vrot ( int col , double sine , double cosine )
487 | {
488 |    int row ;
489 |    double x, y, *vptr ;
490 | 
491 |    for (row=0 ; row<cols ; row++) {
492 |       vptr = v + row * cols + col ;
493 |       x = *vptr ;
494 |       y = *(vptr+1) ;
495 |       *vptr = x * cosine  +  y * sine ;
496 |       *(vptr+1) = y * cosine  -  x * sine ;
497 |       }
498 | }
499 | 
500 | void SingularValueDecomp::qr_mrot ( int col , double sine , double cosine ,
501 |                                     double *matrix )
502 | {
503 |    int row ;
504 |    double x, y, *mptr ;
505 | 
506 |    for (row=0 ; row<rows ; row++) {
507 |       mptr = matrix + row * cols + col ;
508 |       x = *mptr ;
509 |       y = *(mptr+1) ;
510 |       *mptr = x * cosine  +  y * sine ;
511 |       *(mptr+1) = y * cosine  -  x * sine ;
512 |       }
513 | }
514 | 
515 | /*
516 | --------------------------------------------------------------------------------
517 | 
518 |    Backsubstitution algorithm for solving Ax=b where A generated u, w, v
519 |    Inputs are not destroyed, so it may be called with several b's.
520 |    The user must have filled in the public RHS 'b' before calling this.
521 | 
522 | --------------------------------------------------------------------------------
523 | */
524 | 
525 | void SingularValueDecomp::backsub (
526 |    double limit ,  // SV limit (about sqrt machine precision is good)
527 |    double *soln    // Output: solution
528 |    )
529 | {
530 |    int i, j ;
531 |    double sum, wmax, *matrix ;
532 | 
533 |    if (u != NULL)    // If we preserved 'a', use 'u'
534 |       matrix = u ;
535 |    else              // Else 'u' is in 'a'
536 |       matrix = a ;
537 | 
538 | /*
539 |    Scale the threshold to make it relative to the norm
540 | */
541 | 
542 |    for (i=0 ; i<cols ; i++) {
543 |       if ((i == 0)  ||  (w[i] > wmax))
544 |          wmax = w[i] ;
545 |       }
546 | 
547 |    limit = limit * wmax  +  1.e-60 ;
548 | 
549 | /*
550 |    Find U'b
551 | */
552 | 
553 |    for (i=0 ; i<cols ; i++) {
554 |       sum = 0.0 ;
555 |       if (w[i] > limit) {
556 |          for (j=0 ; j<rows ; j++)
557 |             sum += matrix[j*cols+i] * b[j] ;
558 |          sum /= w[i] ;
559 |          }
560 |       work[i] = sum ;
561 |       }
562 | 
563 | /*
564 |    Multiply by V to complete the solution
565 | */
566 | 
567 |    for (i=0 ; i<cols ; i++) {
568 |       sum = 0.0 ;
569 |       for (j=0 ; j<cols ; j++)
570 |          sum += v[i*cols+j] * work[j] ;
571 |       soln[i] = sum ;
572 |       }
573 | }
574 | 
575 | 
576 | #if 0
577 | /*
578 | --------------------------------------------------------------------------------
579 | 
580 |    Optional main to test it
581 | 
582 | --------------------------------------------------------------------------------
583 | */
584 | 
585 | #define RANDMAX 32767
586 | 
587 | void main ( int argc , char *argv[] )
588 | {
589 |    int rep, m, n, i, j, k, reps ;
590 |    double *x, *sa, sum, err, wmin, wmax ;
591 |    SingularValueDecomp *s ;
592 | 
593 |    if (argc != 4) {
594 |       printf ( "\nUSAGE: test rows cols reps" ) ;
595 |       exit ( 0 ) ;
596 |       }
597 | 
598 |    m = atoi ( argv[1] ) ;
599 |    n = atoi ( argv[2] ) ;
600 |    reps = atoi ( argv[3] ) ;
601 | 
602 |    if (m <= 0  ||  n <= 0  ||  reps <= 0)
603 |       exit ( 0 ) ;
604 | 
605 |    sa = (double *) malloc ( m * n * sizeof(double) ) ;
606 |    x = (double *) malloc ( n * sizeof(double) ) ;
607 |    s = new SingularValueDecomp ( m , n , 1 ) ;
608 | 
609 |    if (! s->ok) {
610 |       printf ( "\nError" ) ;
611 |       exit ( 1 ) ;
612 |       }
613 | 
614 |    for (rep=0 ; rep < reps ; rep++) {
615 | 
616 |       if (_kbhit()) {
617 |          if (_getch() == 27)
618 |             exit ( 0 ) ;
619 |          }
620 | 
621 |       if ((m == n)  &&  ! rep) {  // Ill cond
622 |          for (i=0 ; i<m ; i++) {
623 |             for (j=0 ; j<n ; j++)
624 |                sa[i*n+j] = s->a[i*n+j] = 1.0 / (i + j + 1.0) ;
625 |             s->b[i] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
626 |             }
627 |          }
628 |       else {
629 |          for (i=0 ; i<m ; i++) {
630 |             for (j=0 ; j<n ; j++) {
631 |                if (j > 100  &&  j % 10 == 0)
632 |                   s->a[i*n+j] = 0.0 ;
633 |                else if (j > 100  &&  j % 10 == 5)
634 |                   s->a[i*n+j] = s->a[i*n+j-1] + s->a[i*n+j-2] ;
635 |                else
636 |                   s->a[i*n+j] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
637 |                sa[i*n+j] = s->a[i*n+j] ;
638 |                }
639 |             s->b[i] = (double) (rand() - RANDMAX/2) / (double) RANDMAX ;
640 |             }
641 |          }
642 | 
643 |       s->svdcmp () ;
644 | 
645 |       wmin = 1.e30 ;
646 |       wmax = -1.e30 ;
647 |       for (i=0 ; i<n ; i++) {
648 |          if (s->w[i] < wmin)
649 |             wmin = s->w[i] ;
650 |          if (s->w[i] > wmax)
651 |             wmax = s->w[i] ;
652 |          }
653 | 
654 |       printf ( "\n%d %d (%.2le %.2le)", m, n, wmin, wmax ) ;
655 | 
656 |       err = 0.0 ;
657 |       for (i=0 ; i<m ; i++) {
658 |          for (j=0 ; j<n ; j++) {
659 |             sum = 0.0 ;
660 |             for (k=0 ; k<n ; k++)
661 |                sum += s->u[i*n+k] * s->w[k] * s->v[j*n+k] ;
662 |             err += fabs ( sum - sa[i*n+j] ) ;
663 |             }
664 |          }
665 | 
666 |       printf ( " Rep=%.8lf", err ) ;
667 | //      if (fabs(err) > 1.e-10) {
668 | //         printf ( "\a" ) ;
669 | //         _getch() ;
670 | //         }
671 | 
672 |       err = 0.0 ;
673 |       for (i=0 ; i<n ; i++) {
674 |          for (j=0 ; j<n ; j++) {
675 |             sum = 0.0 ;
676 |             for (k=0 ; k<m ; k++)
677 |                sum += s->u[k*n+i] * s->u[k*n+j] ;
678 |             if (i == j)
679 |                err += fabs ( sum - 1.0 ) ;
680 |             else 
681 |                err += fabs ( sum ) ;
682 |             }
683 |          for (j=0 ; j<n ; j++) {
684 |             sum = 0.0 ;
685 |             for (k=0 ; k<n ; k++)
686 |                sum += s->v[k*n+i] * s->v[k*n+j] ;
687 |             if (i == j)
688 |                err += fabs ( sum - 1.0 ) ;
689 |             else 
690 |                err += fabs ( sum ) ;
691 |             }
692 |          }
693 |       printf ( " Orthog=%.8lf", err ) ;
694 | //      if (fabs(err) > 1.e-10) {
695 | //         printf ( "\a" ) ;
696 | //         _getch() ;
697 | //         }
698 | 
699 |       if (m == n) {
700 |          s->backsub ( 1.e-8 , x ) ;
701 |          err = 0.0 ;
702 | 
703 |          for (i=0 ; i<m ; i++) {
704 |             sum = 0.0 ;
705 |             for (j=0 ; j<n ; j++)
706 |                sum += x[j] * sa[i*n+j] ;
707 |             err += fabs ( sum - s->b[i] ) ;
708 |             }
709 | 
710 |          printf ( " Back=%.8lf", err ) ;
711 | //         if (fabs(err) > 1.e-10) {
712 | //            printf ( "\a" ) ;
713 | //            _getch() ;
714 | //            }
715 |          }
716 | 
717 |       err = 0.0 ;
718 |       for (i=0 ; i<m ; i++) {
719 |          for (j=0 ; j<n ; j++)
720 |             err += fabs ( sa[i*n+j] - s->a[i*n+j] ) ;
721 |          }
722 |       printf ( " Save=%.8lf", err ) ;
723 | //      if (fabs(err) > 1.e-10) {
724 | //         printf ( "\a" ) ;
725 | //         _getch() ;
726 | //         }
727 |       }
728 | 
729 |    free ( sa ) ;
730 |    free ( x ) ;
731 |    delete s ;
732 | }
733 | #endif
734 | 


--------------------------------------------------------------------------------
/V2 Source/THREADED_GRAD.TXT:
--------------------------------------------------------------------------------
  1 | 
  2 | /*
  3 | --------------------------------------------------------------------------------
  4 | 
  5 |    Local routine to compute activation (real version with logistic response)
  6 | 
  7 | --------------------------------------------------------------------------------
  8 | */
  9 | 
 10 | 
 11 | static void activity (
 12 |    double *input ,   // This neuron's input vector, ninputs long
 13 |    double *coefs ,   // Weight vector, ninputs+1 long (bias is at end)
 14 |    double *output ,  // Achieved activation of this neuron
 15 |    int ninputs ,     // Number of inputs
 16 |    int outlin        // Activation function is identity if nonzero, else logistic
 17 |    )
 18 | {
 19 |    double sum ;
 20 | 
 21 |    sum = dotprod ( ninputs , input , coefs ) ;
 22 |    sum += coefs[ninputs] ;      // Bias term
 23 | 
 24 |    if (outlin)
 25 |       *output = sum ;
 26 |     else
 27 |       *output = 1.0 / (1.0 + exp(-sum)) ;
 28 | }
 29 | 
 30 | 
 31 | /*
 32 | -----------------------------------------------------------------------
 33 | 
 34 |    activity_cc() - Compute complex-valued activity of a neuron
 35 |                    with complex inputs and hyperbolic tangent response
 36 | 
 37 | -----------------------------------------------------------------------
 38 | */
 39 | 
 40 | static void activity_cc (
 41 |    double *input ,   // This neuron's input vector, 2 * ninputs long
 42 |    double *coefs ,   // Weight vector, 2 * (ninputs+1) long (bias is at end)
 43 |    double *output ,  // Achieved activation of this neuron (real, imag)
 44 |    double *d_rr ,    // If non-null, returns partial of real activation wrt real input
 45 |    double *d_ii ,    // Ditto, imag wrt imag
 46 |    double *d_ri ,    // Ditto, real wrt imag, which equals imag wrt real
 47 |    int ninputs ,     // Number of possibly complex inputs; actual is double this
 48 |    int outlin        // Activation function is identity if nonzero, else tanh
 49 |    )
 50 | {
 51 |    double rsum, isum, raw_length, squashed_length, ratio, deriv, len_sq, temp ;
 52 | 
 53 |    dotprodc ( ninputs , input , coefs , &rsum , &isum ) ;
 54 |    rsum += coefs[2*ninputs] ;      // Bias term
 55 |    isum += coefs[2*ninputs+1] ;
 56 | 
 57 |    if (outlin) {
 58 |       *output = rsum ;
 59 |       *(output+1) = isum ;
 60 |       return ;
 61 |       }
 62 | 
 63 |    len_sq = rsum * rsum + isum * isum + 1.e-60 ;
 64 |    raw_length = sqrt ( len_sq ) ;
 65 |    squashed_length = tanh ( 1.5 * raw_length ) ;
 66 |    ratio = squashed_length / raw_length ;
 67 | 
 68 |    *output = rsum * ratio ;
 69 |    *(output+1) = isum * ratio ;
 70 | 
 71 |    if (d_rr == NULL)
 72 |       return ;
 73 | 
 74 |    deriv = 1.5 * (1.0 - squashed_length * squashed_length) ;
 75 |    temp = (deriv - ratio) / len_sq ;
 76 | 
 77 |    *d_rr = ratio + rsum * rsum * temp ;
 78 |    *d_ii = ratio + isum * isum * temp ;
 79 |    *d_ri = rsum * isum * temp ;
 80 | }
 81 | 
 82 | 
 83 | 
 84 | /*
 85 | --------------------------------------------------------------------------------
 86 | 
 87 |    trial_thr - Compute the output for a given input by evaluating network
 88 |                It optionally also computes partial derivatives.
 89 |                This is a strictly local version for threading.
 90 | 
 91 |                When called for a complex net, nin and nout refer to complex
 92 |                numbers, so they must be doubled to reflect actual counts,
 93 |                and the output will always be complex.
 94 | 
 95 | --------------------------------------------------------------------------------
 96 | */
 97 | 
 98 | static void trial_thr (
 99 |    double *input ,                 // Input vector nin long
100 |    int n_layers ,                  // Number of layers, including output, not including input
101 |    int nin ,                       // Number of possibly complex inputs to the model (actual is double this if complex)
102 |    double *outputs ,               // Output vector of the model
103 |    int nout ,                      // Number of possibly complex outputs (actual is double this if complex)
104 |    int *nhid ,                     // nhid[i] is the number of hidden neurons in hidden layer i
105 |    double *weights[] ,             // weights[i] points to the weight vector for hidden layer i
106 |    double *hid_act[] ,             // hid_act[i] points to the vector of activations of hidden layer i
107 |    double *hid_rr[] ,              // Partial of real activation wrt real input
108 |    double *hid_ii[] ,              // Ditto, imaginary
109 |    double *hid_ri[] ,              // Ditto, real wrt imaginary = imaginary wrt real
110 |    double *last_layer_weights ,    // Weights of final layer
111 |    int complex ,                   // Is this a complex network?
112 |    int classifier                  // If nonzero use SoftMax output; else use linear output
113 |    )
114 | {
115 |    int i, ilayer ;
116 |    double sum ;
117 | 
118 |    for (ilayer=0 ; ilayer<n_layers ; ilayer++) {
119 | 
120 |       if (ilayer == 0  &&  n_layers == 1) {        // Direct input to output?
121 |          for (i=0 ; i<nout ; i++) {
122 |             if (complex)
123 |                activity_cc ( input , last_layer_weights+i*2*(nin+1) , outputs+2*i ,
124 |                              NULL , NULL , NULL , nin , 1 ) ;
125 |             else
126 |                activity ( input , last_layer_weights+i*(nin+1) , outputs+i , nin , 1 ) ;
127 |             }
128 |          }
129 | 
130 |       else if (ilayer == 0) {                   // First hidden layer?
131 |          for (i=0 ; i<nhid[ilayer] ; i++) {
132 |             if (complex) {
133 |                if (hid_rr != NULL)
134 |                   activity_cc ( input , weights[ilayer]+i*2*(nin+1) , hid_act[ilayer]+2*i ,
135 |                                 hid_rr[ilayer]+i , hid_ii[ilayer]+i , hid_ri[ilayer]+i , nin , 0 ) ;
136 |                else
137 |                   activity_cc ( input , weights[ilayer]+i*2*(nin+1) , hid_act[ilayer]+2*i ,
138 |                                 NULL , NULL , NULL , nin , 0 ) ;
139 |                }
140 |             else
141 |                activity ( input , weights[ilayer]+i*(nin+1) , hid_act[ilayer]+i , nin , 0 ) ;
142 |             }
143 |          }
144 | 
145 |       else if (ilayer < n_layers-1) {              // Subsequent hidden layer?
146 |          for (i=0 ; i<nhid[ilayer] ; i++) {
147 |             if (complex) {
148 |                if (hid_rr != NULL)
149 |                   activity_cc ( hid_act[ilayer-1] , weights[ilayer]+i*2*(nhid[ilayer-1]+1) , hid_act[ilayer]+2*i ,
150 |                                 hid_rr[ilayer]+i , hid_ii[ilayer]+i , hid_ri[ilayer]+i , nhid[ilayer-1] , 0 );
151 |                else
152 |                   activity_cc ( hid_act[ilayer-1] , weights[ilayer]+i*2*(nhid[ilayer-1]+1) , hid_act[ilayer]+2*i ,
153 |                                 NULL , NULL , NULL , nhid[ilayer-1] , 0 );
154 |                }
155 |             else
156 |                activity ( hid_act[ilayer-1] , weights[ilayer]+i*(nhid[ilayer-1]+1) , hid_act[ilayer]+i , nhid[ilayer-1] , 0 );
157 |             }
158 |          }
159 | 
160 |       else {                                    // Final layer
161 |          for (i=0 ; i<nout ; i++) {
162 |             if (complex)
163 |                activity_cc ( hid_act[ilayer-1] , last_layer_weights+i*2*(nhid[ilayer-1]+1) , outputs+2*i ,
164 |                              NULL , NULL , NULL , nhid[ilayer-1] , 1 );
165 |             else
166 |                activity ( hid_act[ilayer-1] , last_layer_weights+i*(nhid[ilayer-1]+1) , outputs+i , nhid[ilayer-1] , 1 );
167 |             }
168 |          }
169 |       }
170 | 
171 |    if (classifier) {  // Classifier is always SoftMax
172 |       if (complex) {
173 |          sum = 0.0 ;
174 |          for (i=0 ; i<nout ; i++) {  // For all outputs
175 |             if (outputs[2*i] < 300.0)
176 |                outputs[2*i] = exp ( outputs[2*i] ) ;
177 |             else
178 |                outputs[2*i] = exp ( 300.0 ) ;
179 |             sum += outputs[2*i] ;
180 |             }
181 |          for (i=0 ; i<nout ; i++)
182 |             outputs[2*i] /= sum ;
183 |          }
184 | 
185 |       else {         // Real domain
186 |          sum = 0.0 ;
187 |          for (i=0 ; i<nout ; i++) {  // For all outputs
188 |             if (outputs[i] < 300.0)
189 |                outputs[i] = exp ( outputs[i] ) ;
190 |             else
191 |                outputs[i] = exp ( 300.0 ) ;
192 |             sum += outputs[i] ;
193 |             }
194 |          for (i=0 ; i<nout ; i++)
195 |             outputs[i] /= sum ;
196 |          }
197 |       } // If classifier
198 | }
199 | 
200 | 
201 | /*
202 | --------------------------------------------------------------------------------
203 | 
204 |    batch_gradient - Cumulate the gradient for a given subset of inputs
205 | 
206 |    Note: grad is all gradients as a vector, and grad_ptr[ilayer] points to
207 |          the entry in grad that is for the first weight in a layer
208 | 
209 | --------------------------------------------------------------------------------
210 | */
211 | 
212 | 
213 | static double batch_gradient (
214 |    int istart ,                    // Index of starting case in input matrix
215 |    int istop ,                     // And one past last case
216 |    double *input ,                 // Input matrix; each case is max_neurons long
217 |    double *targets ,               // Target matrix; strictly real, so each case is nout long
218 |    int *class_ids ,                // Class id vector if classifier (ignored if not)
219 |    int n_layers ,                  // Number of layers, including output, not including input
220 |    int n_weights ,                 // Total number of weights, including final layer and all bias terms
221 |    int nin ,                       // Number of possibly complex inputs to the model; Input matrix may have more columns (actual is double this if complex)
222 |    double *outputs ,               // Output vector of the model; used as work vector here
223 |    int nout ,                      // Number of possibly complex outputs (actual is double this if complex)
224 |    int *nhid ,                     // nhid[i] is the number of hidden neurons in hidden layer i
225 |    double *weights[] ,             // weights[i] points to the weight vector for hidden layer i
226 |    double *hid_act[] ,             // hid_act[i] points to the vector of activations of hidden layer i
227 |    double *hid_rr[] ,              // Partial of real activation wrt real input
228 |    double *hid_ii[] ,              // Ditto, imaginary
229 |    double *hid_ri[] ,              // Ditto, real wrt imaginary = imaginary wrt real
230 |    int max_neurons ,               // Number of columns in input matrix; may exceed nin; this is actual if complex
231 |    double *this_delta ,            // Delta for the current layer
232 |    double *prior_delta ,           // And saved for use in the prior (next to be processed) layer
233 |    double **grad_ptr ,             // grad_ptr[i] points to gradient for layer i
234 |    double *last_layer_weights ,    // Weights of final layer
235 |    double *grad ,                  // All computed gradients, strung out as a single long vector
236 |    int complex ,                   // Is this a complex network?
237 |    int classifier                  // If nonzero use SoftMax output; else use linear output
238 |    )
239 | {
240 |    int i, j, icase, ilayer, nprev, nthis, nnext, mult, iclass ;
241 |    double diff, *dptr, error, *targ_ptr, *prevact, *gradptr ;
242 |    double rsum, isum, delta, rdelta, idelta, *nextcoefs, tval ;
243 |    double *rr_ptr, *ii_ptr, *ri_ptr ;
244 | 
245 |    mult = complex  ?  2 : 1 ; // Numbers per neuron
246 | 
247 |    for (i=0 ; i<n_weights ; i++)  // Zero gradient for summing
248 |       grad[i] = 0.0 ;             // All layers are strung together here
249 | 
250 |    error = 0.0 ;  // Will cumulate total error here
251 | 
252 |    for (icase=istart ; icase<istop ; icase++) {
253 | 
254 |       dptr = input + icase * max_neurons ; // Point to this sample; max_neurons is actual number of numbers
255 |       trial_thr ( dptr , n_layers , nin , outputs ,  nout , nhid ,
256 |                   weights , hid_act , hid_rr , hid_ii , hid_ri , 
257 |                   last_layer_weights , complex , classifier ) ;
258 | 
259 | 
260 |       if (classifier) {               // SoftMax
261 |          iclass = class_ids[icase] ;
262 |          for (i=0 ; i<nout ; i++) {
263 |             tval = (i == iclass)  ?  1.0 : 0.0 ;
264 |             this_delta[mult*i] = tval - outputs[mult*i] ; // Neg deriv of cross entropy wrt input (logit) i
265 |             if (complex)
266 |                this_delta[2*i+1] = 0.0 ;
267 |             }
268 |          error -= log ( outputs[mult*iclass] + 1.e-30 ) ;
269 |          }
270 | 
271 |       else if (targets != NULL) {              // Training final model
272 |          targ_ptr = targets + icase * nout ;   // Targets are strictly real
273 |          for (i=0 ; i<nout ; i++) {
274 |             diff = outputs[mult*i] - targ_ptr[i] ;   // Real part of prediction is compared to target
275 |             error += diff * diff ;
276 |             this_delta[mult*i] = -2.0 * diff ;       // Neg deriv of squared error wrt input to neuron i
277 |             if (complex)
278 |                this_delta[2*i+1] = 0.0 ;             // Target is real so ignore imaginary prediction
279 |             }
280 |          }
281 | 
282 |       else {                                  // Training an autoencoder
283 |          targ_ptr = input + icase * max_neurons ; // Point to this sample
284 |          for (i=0 ; i<mult*nout ; i++) {
285 |             diff = outputs[i] - targ_ptr[i] ;
286 |             error += diff * diff ;
287 |             this_delta[i] = -2.0 * diff ; // Neg deriv of squared error wrt input to neuron i
288 |             }
289 |          }
290 | 
291 | /*
292 |    Cumulate output gradient
293 |    If complex, actual is double nprev
294 | */
295 | 
296 |       if (n_layers == 1) {                        // No hidden layer
297 |          nprev = nin ;                            // Number of possibly complex inputs to the output layer
298 |          prevact = input + icase * max_neurons ;  // Point to this sample
299 |          }
300 |       else {
301 |          nprev = nhid[n_layers-2] ;               // n_layers-2 is the last hidden layer
302 |          prevact = hid_act[n_layers-2] ;          // Point to layer feeding the output layer
303 |          }
304 |       gradptr = grad_ptr[n_layers-1] ;            // Point to output gradient in grand gradient vector
305 |       for (i=0 ; i<nout ; i++) {                  // For all output neurons
306 |          if (complex) {
307 |             rdelta = this_delta[2*i] ;
308 |             idelta = this_delta[2*i+1] ;
309 |             for (j=0 ; j<nprev ; j++) {
310 |                *gradptr++ +=  rdelta * prevact[2*j]   + idelta * prevact[2*j+1] ;
311 |                *gradptr++ += -rdelta * prevact[2*j+1] + idelta * prevact[2*j] ;
312 |                }
313 |             *gradptr++ += rdelta ;                 // Bias activation is always 1
314 |             *gradptr++ += idelta ;
315 |             }
316 |          else {
317 |             delta = this_delta[i] ;               // Neg deriv of criterion wrt logit
318 |             for (j=0 ; j<nprev ; j++)
319 |                *gradptr++ += delta * prevact[j] ; // Cumulate for all training cases
320 |             *gradptr++ += delta ;                 // Bias activation is always 1
321 |             }
322 |          }
323 | 
324 |       nnext = nout ;                       // Prepare for moving back one layer
325 |       nextcoefs = last_layer_weights ;
326 | 
327 | /*
328 |    Cumulate hidden gradients
329 | */
330 | 
331 |       for (ilayer=n_layers-2 ; ilayer>=0 ; ilayer--) {   // For each hidden layer, working backwards
332 |          nthis = nhid[ilayer] ;        // Number of neurons in this hidden layer
333 |          gradptr = grad_ptr[ilayer] ;      // Point to gradient for this layer
334 | 
335 |          if (complex) {
336 |             rr_ptr = hid_rr[ilayer] ;
337 |             ii_ptr = hid_ii[ilayer] ;
338 |             ri_ptr = hid_ri[ilayer] ;
339 |             }
340 | 
341 |          for (i=0 ; i<nthis ; i++) {       // For each neuron in this layer
342 | 
343 |             if (complex) {
344 | 
345 |                rsum = isum = 0.0 ;
346 |                for (j=0 ; j<nnext ; j++) {
347 |                   rsum +=  this_delta[2*j]   * nextcoefs[j*2*(nthis+1)+2*i] +
348 |                            this_delta[2*j+1] * nextcoefs[j*2*(nthis+1)+2*i+1] ;
349 |                   isum += -this_delta[2*j]   * nextcoefs[j*2*(nthis+1)+2*i+1] +
350 |                            this_delta[2*j+1] * nextcoefs[j*2*(nthis+1)+2*i] ;
351 |                   }
352 | 
353 |                rdelta = rsum * rr_ptr[i] + isum * ri_ptr[i] ;
354 |                idelta = rsum * ri_ptr[i] + isum * ii_ptr[i] ;
355 |                prior_delta[2*i]   = rdelta ;                    // Save it for the next layer back
356 |                prior_delta[2*i+1] = idelta ;
357 | 
358 |                if (ilayer == 0) {                          // First hidden layer?
359 |                   prevact = input + icase * max_neurons ;  // Point to this sample
360 |                   for (j=0 ; j<nin ; j++) {
361 |                      *gradptr++ +=  rdelta * prevact[2*j]   + idelta * prevact[2*j+1] ;
362 |                      *gradptr++ += -rdelta * prevact[2*j+1] + idelta * prevact[2*j] ;
363 |                      }
364 |                   }
365 |                else {      // There is at least one more hidden layer prior to this one
366 |                   prevact = hid_act[ilayer-1] ;
367 |                   for (j=0 ; j<nhid[ilayer-1] ; j++) {
368 |                      *gradptr++ +=  rdelta * prevact[2*j]   + idelta * prevact[2*j+1] ;
369 |                      *gradptr++ += -rdelta * prevact[2*j+1] + idelta * prevact[2*j] ;
370 |                      }
371 |                   }
372 |                *gradptr++ += rdelta ;   // Bias activation is always 1
373 |                *gradptr++ += idelta ;
374 |                }    // Complex
375 | 
376 |             else {  // Real
377 |                delta = 0.0 ;
378 |                for (j=0 ; j<nnext ; j++)
379 |                   delta += this_delta[j] * nextcoefs[j*(nthis+1)+i] ;
380 |                delta *= hid_act[ilayer][i] * (1.0 - hid_act[ilayer][i]) ;  // Derivative
381 |                prior_delta[i] = delta ;                    // Save it for the next layer back
382 |                if (ilayer == 0) {                          // First hidden layer?
383 |                   prevact = input + icase * max_neurons ;  // Point to this sample
384 |                   for (j=0 ; j<nin ; j++)
385 |                      *gradptr++ += delta * prevact[j] ;
386 |                   }
387 |                else {      // There is at least one more hidden layer prior to this one
388 |                   prevact = hid_act[ilayer-1] ;
389 |                   for (j=0 ; j<nhid[ilayer-1] ; j++)
390 |                      *gradptr++ += delta * prevact[j] ;
391 |                   }
392 |                *gradptr++ += delta ;   // Bias activation is always 1
393 |                }
394 | 
395 |             }  // For all neurons in this hidden layer
396 | 
397 |          for (i=0 ; i<mult*nthis ; i++)        // These will be delta for the next layer back
398 |             this_delta[i] = prior_delta[i] ;
399 | 
400 |          nnext = nhid[ilayer] ;                // Prepare for the next layer back
401 |          nextcoefs = weights[ilayer] ;
402 |          }  // For all layers, working backwards
403 | 
404 |       } // for all cases
405 | 
406 |    return error ;  // MSE or negative log likelihood
407 | }
408 | 
409 | 
410 | 
411 | typedef struct {              // Per-thread argument block handed to batch_gradient_wrapper()
412 |    int istart ;               // First case of this thread's batch
413 |    int istop ;                // The batch stops just before this case index
414 |    int complex ;              // gradient_thr() copies is_complex here
415 |    int classifier ;           // gradient_thr() forces this to 0 when the target is NULL (autoencoding)
416 |    int n_layers ;             // Number of layers
417 |    int n_weights ;            // Total (actual) number of weights, including final layers and bias
418 |    int nin ;                  // Number of possibly complex inputs
419 |    int nout ;                 // Number of possibly complex outputs
420 |    int *nhid ;                // Number of hidden neurons in each layer
421 |    int max_neurons ;          // Row length of the input matrix; already sized for complex
422 |    double *input ;            // Nc by max_neurons input matrix
423 |    double *targets ;          // Target matrix, or NULL when autoencoding
424 |    int *class_ids ;           // Passed through to batch_gradient(), which uses class_ids[icase] as the true class
425 |    double *outputs ;          // This thread's private scratch area for model outputs
426 |    double **weights ;         // Weight matrices for layers
427 |    double **hid_act ;         // This thread's slice of each layer's activation vector
428 |    double **hid_rr ;          // hid_rr, hid_ii, hid_ri: per-thread slices, set only for complex models,
429 |    double **hid_ii ;          //   else NULL; batch_gradient() multiplies the backpropagated sums by them
430 |    double **hid_ri ;          //   to get the real and imaginary deltas of a hidden neuron
431 |    double *this_delta ;       // This thread's work vector: delta for the current layer
432 |    double *prior_delta ;      // This thread's work vector: delta saved for the next layer back
433 |    double **grad_ptr ;        // grad_ptr[i] points to layer i's gradient within this thread's grad
434 |    double *last_layer_weights ; // Output-layer weights (final_layer_weights, or weights[n_layers-1] if greedy)
435 |    double *grad ;             // This thread's n_weights-long gradient vector
436 |    double error ;             // batch_gradient()'s return value, stored by batch_gradient_wrapper()
437 | } GRAD_THR_PARAMS ;
438 | 
439 | static unsigned int __stdcall batch_gradient_wrapper ( LPVOID dp )
440 | {
441 |    // Thread entry point.  dp is really a GRAD_THR_PARAMS; run its batch and save the error in it.
442 |    GRAD_THR_PARAMS *p = (GRAD_THR_PARAMS *) dp ;
443 | 
444 |    p->error = batch_gradient (
445 |                  p->istart , p->istop ,             // Case range of this batch
446 |                  p->input ,
447 |                  p->targets ,
448 |                  p->class_ids ,
449 |                  p->n_layers ,
450 |                  p->n_weights ,
451 |                  p->nin ,
452 |                  p->outputs ,
453 |                  p->nout ,
454 |                  p->nhid ,
455 |                  p->weights ,
456 |                  p->hid_act ,
457 |                  p->hid_rr , p->hid_ii , p->hid_ri ,
458 |                  p->max_neurons ,
459 |                  p->this_delta ,
460 |                  p->prior_delta ,
461 |                  p->grad_ptr ,
462 |                  p->last_layer_weights ,
463 |                  p->grad ,
464 |                  p->complex ,
465 |                  p->classifier ) ;
466 |    return 0 ;     // The thread's exit code carries no information
467 | }
468 | 
469 | /*
470 | --------------------------------------------------------------------------------
471 | 
472 |    gradient_thr() - Gradient for entire model, cumulated by one thread per batch
473 |                     Returns error + penalty, or -1.e40 if threading failed
474 | --------------------------------------------------------------------------------
475 | */
476 | 
477 | double CpxAuto::gradient_thr (
478 |    int nc ,             // Number of cases
479 |    int nin ,            // Number of possibly complex inputs
480 |    double *input ,      // Nc by max_neurons input matrix
481 |    int nout ,           // Number of possibly complex outputs
482 |    double *target ,     // Nc by nout target matrix, or autoencoding if NULL
483 |    int n_layers ,       // Number of layers
484 |    int *nhid ,          // Number of hidden neurons in each layer
485 |    int n_weights ,      // Total (actual) number of weights, including final layers and bias
486 |    double *weights[] ,  // Weight matrices for layers
487 |    int use_final_layer_weights , // Use final_layer_weights (vs last weight layer)?
488 |    double *grad         // Concatenated gradient vector, which is computed here; n_weights * max_threads long
489 |    )
490 | {
491 |    int i, j, ilayer, ineuron, ivar, n, istart, istop, n_done, ithread, mult ;
492 |    int n_in_batch, n_threads, ret_val, nin_this_layer, n_last_layer_weights ;
493 |    double error, *wptr, *gptr, factor, *hid_act_ptr[MAX_THREADS][MAX_LAYERS], *grad_ptr_ptr[MAX_THREADS][MAX_LAYERS] ;
494 |    double *hid_rr_ptr[MAX_THREADS][MAX_LAYERS], *hid_ii_ptr[MAX_THREADS][MAX_LAYERS], *hid_ri_ptr[MAX_THREADS][MAX_LAYERS] ;
495 |    double wpen, *last_layer_weights ;
496 |    GRAD_THR_PARAMS params[MAX_THREADS] ;
497 |    HANDLE threads[MAX_THREADS] ;
498 | 
499 |    mult = is_complex  ?  2 : 1 ;
500 | 
501 |    if (use_final_layer_weights) {                      // Full CpxAuto model
502 |       last_layer_weights = final_layer_weights ;
503 |       n_last_layer_weights = n_final_layer_weights ;   // Per output, not total; If complex, this is actual
504 |       }
505 | 
506 |    else {                                              // Greedily training a single layer
507 |       last_layer_weights = weights[n_layers-1] ;
508 |       n_last_layer_weights = mult * ((n_layers > 1  ?  nhid[n_layers-2] : nin) + 1) ;  // No hidden layer: fed by inputs
509 |       }
510 | 
511 |    wpen = TrainParams.wpen / n_weights ;
512 | 
513 | /*
514 |    Compute length of grad vector and gradient positions in it.
515 | */
516 | 
517 |    gptr = grad ;  // CONJGRAD.CPP allocated this n_weights * max_threads long
518 | 
519 |    for (ilayer=0 ; ilayer<n_layers ; ilayer++) {
520 |       grad_ptr[ilayer] = gptr ;
521 | 
522 |       if (ilayer == 0  &&  n_layers == 1) {          // Direct input to output?
523 |          n = nout * mult * (nin+1) ;                 // This many inputs to each neuron in this layer
524 |          gptr += n ;                                 // Not needed, but it illustrates the process
525 |          }
526 | 
527 |       else if (ilayer == 0) {                        // First hidden layer?
528 |          n = nhid[ilayer] * mult * (nin+1) ;         // This many inputs to each neuron in this layer
529 |          gptr += n ;
530 |          }
531 | 
532 |       else if (ilayer < n_layers-1) {                   // Subsequent hidden layer?
533 |          n = nhid[ilayer] * mult * (nhid[ilayer-1]+1) ; // This many inputs to each neuron in this layer
534 |          gptr += n ;
535 |          }
536 | 
537 |       else
538 |          n = nout * mult * (nhid[ilayer-1]+1) ;         // This many inputs to each neuron in this layer
539 |       } // For all layers, including output
540 | 
541 | 
542 | 
543 |    for (i=0 ; i<max_threads ; i++) {
544 |       params[i].input = input ;
545 |       params[i].targets = target ;          // Will be NULL for autoencoding
546 |       params[i].class_ids = class_ids ;
547 |       params[i].n_layers = n_layers ;
548 |       params[i].n_weights = n_weights ;
549 |       params[i].nin = nin ;                 // If complex, double for actual
550 |       params[i].nout = nout ;               // Ditto
551 |       params[i].nhid = nhid ;
552 |       params[i].max_neurons = max_neurons ; // Already sized for complex
553 |       params[i].weights = weights ;
554 |       params[i].last_layer_weights = last_layer_weights ;
555 | 
556 |       // Outputs is used strictly for scratch in each thread, not for saving predictions
557 |       if (use_final_layer_weights)
558 |          params[i].outputs = outputs + i * mult * nout ;
559 |       else
560 |          params[i].outputs = autoencode_out + i * mult * nin ;  // Autoencoding layer
561 | 
562 |       params[i].this_delta = this_layer + i * max_neurons ;
563 |       params[i].prior_delta = prior_layer + i * max_neurons ;
564 |       params[i].grad = grad + i * n_weights ;
565 |       for (j=0 ; j<n_layers ; j++) {
566 |          hid_act_ptr[i][j] = hid_act[j] + i * max_neurons ;
567 |          grad_ptr_ptr[i][j] = grad_ptr[j] + i * n_weights ;
568 |          if (is_complex) {
569 |             hid_rr_ptr[i][j] = hid_rr[j] + i * max_neurons / 2 ;  // These are real
570 |             hid_ii_ptr[i][j] = hid_ii[j] + i * max_neurons / 2 ;
571 |             hid_ri_ptr[i][j] = hid_ri[j] + i * max_neurons / 2 ;
572 |             }
573 |          }
574 |       params[i].hid_act = hid_act_ptr[i] ;
575 |       params[i].grad_ptr = grad_ptr_ptr[i] ;
576 |       if (is_complex) {
577 |          params[i].hid_rr = hid_rr_ptr[i] ;
578 |          params[i].hid_ii = hid_ii_ptr[i] ;
579 |          params[i].hid_ri = hid_ri_ptr[i] ;
580 |          }
581 |       else
582 |          params[i].hid_rr = params[i].hid_ii = params[i].hid_ri = NULL ;
583 | 
584 |       params[i].complex = is_complex ;
585 | 
586 |       if (target == NULL)            // Autoencoding
587 |          params[i].classifier = 0 ;
588 |       else
589 |          params[i].classifier = classifier ;
590 |       }
591 | 
592 | /*
593 | ------------------------------------------------------------------------------------------------
594 | 
595 |    Batch loop uses a different thread for each batch
596 | 
597 | ------------------------------------------------------------------------------------------------
598 | */
599 | 
600 |    n_threads = max_threads ;    // Try to use as many as possible
601 |    if (nc / n_threads < 100)    // But because threads have overhead
602 |       n_threads = 1 ;           // Avoid using them if the batch is small
603 | 
604 |    istart = 0 ;         // Batch start = training data start
605 |    n_done = 0 ;         // Number of training cases done in this epoch so far
606 | 
607 |    for (ithread=0 ; ithread<n_threads ; ithread++) {
608 |       n_in_batch = (nc - n_done) / (n_threads - ithread) ;  // Cases left to do / batches left to do
609 |       istop = istart + n_in_batch ;                         // Stop just before this index
610 | 
611 |       // Set the pointers that vary with the batch
612 | 
613 |       params[ithread].istart = istart ;
614 |       params[ithread].istop = istop ;
615 | 
616 |       threads[ithread] = (HANDLE) _beginthreadex ( NULL , 0 , batch_gradient_wrapper , &params[ithread] , 0 , NULL ) ;
617 |       if (threads[ithread] == NULL) {
618 |          // Only threads[0..ithread-1] hold handles; the rest of the array is still uninitialized
619 |          if (ithread > 0)                                                  // Started threads use our stack arrays,
620 |             WaitForMultipleObjects ( ithread , threads , TRUE , INFINITE ) ;  // so let them finish before returning
621 |          for (i=0 ; i<ithread ; i++)
622 |             CloseHandle ( threads[i] ) ;
623 |          return -1.e40 ;
624 |          }
625 | 
626 |       n_done += n_in_batch ;
627 |       istart = istop ;
628 |       } // For all threads / batches
629 | 
630 | /*
631 |    Wait for threads to finish, release their handles, and then cumulate all results into [0]
632 | */
633 | 
634 |    ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
635 |    for (ithread=0 ; ithread<n_threads ; ithread++)   // Release the handles whether or not the wait succeeded
636 |       CloseHandle ( threads[ithread] ) ;
637 |    if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads)
638 |       return -1.e40 ;
639 | 
640 |    for (ithread=1 ; ithread<n_threads ; ithread++) {
641 |       params[0].error += params[ithread].error ;
642 |       for (i=0 ; i<n_weights ; i++)
643 |          params[0].grad[i] += params[ithread].grad[i] ;
644 |       }
645 | 
646 | 
647 | /*
648 |    Find the mean per presentation.  Also, compensate for nout if that was
649 |    not done implicitly in the error computation.
650 | */
651 | 
652 |    factor = 1.0 / (nc * mult * nout) ;
653 | 
654 |    error = factor * params[0].error ;
655 | 
656 |    for (i=0 ; i<n_weights ; i++)
657 |       grad[i] = factor * params[0].grad[i] ;   // Note that grad and params[0].grad are the same!
658 | 
659 | 
660 | /*
661 |    Deal with weight penalty
662 |    First block of code does hidden layers, second does output layer
663 | */
664 | 
665 |    penalty = 0.0 ;
666 | 
667 |    nin_this_layer = nin ;
668 |    for (ilayer=0 ; ilayer<n_layers-1 ; ilayer++) {  // Do all hidden layers
669 | 
670 |       for (ineuron=0 ; ineuron<nhid[ilayer] ; ineuron++) {
671 |          wptr =  weights[ilayer] + ineuron * mult * (nin_this_layer+1) ;  // Weights for this neuron in this layer
672 |          gptr = grad_ptr[ilayer] + ineuron * mult * (nin_this_layer+1) ;  // Ditto grad
673 |          for (ivar=0 ; ivar<mult*nin_this_layer ; ivar++) {               // Do not include bias
674 |             penalty += wptr[ivar] * wptr[ivar] ;
675 |             gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
676 |             }
677 |          }
678 |       nin_this_layer = nhid[ilayer] ;
679 |       }
680 | 
681 |    for (ineuron=0 ; ineuron<nout ; ineuron++) {
682 |       wptr = last_layer_weights + ineuron * n_last_layer_weights ;
683 |       gptr = grad_ptr[n_layers-1] + ineuron * n_last_layer_weights ;
684 |       for (ivar=0 ; ivar<mult*nin_this_layer ; ivar++) {             // Do not include bias
685 |          penalty += wptr[ivar] * wptr[ivar] ;
686 |          gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
687 |          }
688 |       }
689 | 
690 |    penalty *= wpen ;
691 |    return error + penalty ;
692 | }
693 | 


--------------------------------------------------------------------------------
/V1 Source/MLFN_THR.CPP:
--------------------------------------------------------------------------------
  1 | /******************************************************************************/
  2 | /*                                                                            */
  3 | /*  MLFN_THR - MLFN routines modified for threading                           */
  4 | /*                                                                            */
  5 | /******************************************************************************/
  6 | 
  7 | #define STRICT
  8 | #include <windows.h>
  9 | #include <commctrl.h>
 10 | #include <assert.h>
 11 | #include <stdlib.h>
 12 | #include <stdio.h>
 13 | #include <math.h>
 14 | #include <string.h>
 15 | #include <ctype.h>
 16 | #include <malloc.h>
 17 | #include <new.h>
 18 | #include <float.h>
 19 | #include <process.h>
 20 | 
 21 | #include "deep.rh"
 22 | #include "const.h"
 23 | #include "classes.h"
 24 | #include "extern.h"
 25 | #include "funcdefs.h"
 26 | 
 27 | 
 28 | /*
 29 | --------------------------------------------------------------------------------
 30 | 
 31 |    Local routine to compute activation
 32 | 
 33 | --------------------------------------------------------------------------------
 34 | */
 35 | 
 36 | 
 37 | void activity (
 38 |    double *input ,   // Inputs feeding this neuron (ninputs of them)
 39 |    double *coefs ,   // Its weights: ninputs of them followed by the bias
 40 |    double *output ,  // The neuron's activation is returned here
 41 |    int ninputs ,     // Length of the input vector
 42 |    int outlin        // Nonzero for identity (linear) activation; zero for logistic
 43 |    )
 44 | {
 45 |    // Net input is the weighted sum of the inputs plus the bias
 46 |    double net = dotprod ( ninputs , input , coefs ) + coefs[ninputs] ;
 47 | 
 48 |    if (outlin) {                                 // Linear neuron passes the net input through
 49 |       *output = net ;
 50 |       return ;
 51 |       }
 52 | 
 53 |    *output = 1.0 / (1.0 + exp(-net)) ;           // Logistic squashing
 54 | }
 55 | 
 56 | 
 57 | /*
 58 | --------------------------------------------------------------------------------
 59 | 
 60 |    trial - Compute the output for a given input by evaluating network
 61 |            This is the Model version, callable from outside here.
 62 | 
 63 | --------------------------------------------------------------------------------
 64 | */
 65 | 
 66 | void Model::trial ( double *input )
 67 | {
 68 |    // Hidden activations are left in hid_act[] and the model outputs in outputs[].
 69 |    int i, ilayer, nfeed ;
 70 |    double *feed, total ;
 71 | 
 72 |    // 'feed' is the vector entering the layer being evaluated and 'nfeed' is its
 73 |    // length.  It begins as the model input and then steps through the hidden layers.
 74 | 
 75 |    feed = input ;
 76 |    nfeed = n_model_inputs ;
 77 | 
 78 |    // All layers except the last are hidden layers of logistic neurons.
 79 |    // Each neuron has nfeed weights followed by a bias weight.
 80 | 
 81 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {
 82 |       for (i=0 ; i<nhid_all[ilayer] ; i++)
 83 |          activity ( feed , weights_opt[ilayer]+i*(nfeed+1) , hid_act[ilayer]+i , nfeed , 0 ) ;
 84 |       feed = hid_act[ilayer] ;
 85 |       nfeed = nhid_all[ilayer] ;
 86 |       }
 87 | 
 88 |    // The last layer is linear and takes its weights from final_layer_weights.
 89 |    // If it is the only layer (n_all == 1) the model input feeds it directly.
 90 | 
 91 |    if (n_all >= 1) {
 92 |       for (i=0 ; i<ntarg ; i++)
 93 |          activity ( feed , final_layer_weights+i*(nfeed+1) , outputs+i , nfeed , 1 ) ;
 94 |       }
 95 | 
 96 |    // A classifier converts its outputs to SoftMax probabilities.
 97 |    // Exponents are capped at 300 to keep exp() within range.
 98 | 
 99 |    if (classifier) {
100 |       total = 0.0 ;
101 |       for (i=0 ; i<ntarg ; i++) {
102 |          outputs[i] = exp ( (outputs[i] < 300.0)  ?  outputs[i] : 300.0 ) ;
103 |          total += outputs[i] ;
104 |          }
105 |       for (i=0 ; i<ntarg ; i++)
106 |          outputs[i] /= total ;
107 |       }
108 | }
109 | 
110 | 
111 | /*
112 | --------------------------------------------------------------------------------
113 | 
114 |    trial_thr - Compute the output for a given input by evaluating network
115 |                This is a strictly local version for threading.
116 | 
117 | --------------------------------------------------------------------------------
118 | */
119 | 
120 | static void trial_thr (
121 |    double *input ,                 // One case: n_model_inputs input values
122 |    int n_all ,                     // Layer count; output layer included, input layer not
123 |    int n_model_inputs ,            // How many inputs the model has
124 |    double *outputs ,               // Model outputs are placed here
125 |    int ntarg ,                     // How many outputs the model has
126 |    int *nhid_all ,                 // Neuron count of each hidden layer
127 |    double *weights_opt[] ,         // Weight vector of each hidden layer
128 |    double *hid_act[] ,             // Activation vector of each hidden layer (computed here)
129 |    double *final_layer_weights ,   // Weight vector of the output layer
130 |    int classifier                  // Nonzero: SoftMax outputs; zero: linear outputs
131 |    )
132 | {
133 |    // Strictly local twin of the model's trial routine; all state arrives as arguments.
134 |    int i, ilayer, nfeed ;
135 |    double *feed, total ;
136 | 
137 |    // 'feed' is the vector entering the layer being evaluated and 'nfeed' is its
138 |    // length.  It begins as the model input and then steps through the hidden layers.
139 | 
140 |    feed = input ;
141 |    nfeed = n_model_inputs ;
142 | 
143 |    // All layers except the last are hidden layers of logistic neurons.
144 |    // Each neuron has nfeed weights followed by a bias weight.
145 | 
146 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {
147 |       for (i=0 ; i<nhid_all[ilayer] ; i++)
148 |          activity ( feed , weights_opt[ilayer]+i*(nfeed+1) , hid_act[ilayer]+i , nfeed , 0 ) ;
149 |       feed = hid_act[ilayer] ;
150 |       nfeed = nhid_all[ilayer] ;
151 |       }
152 | 
153 |    // The last layer is linear and takes its weights from final_layer_weights.
154 |    // If it is the only layer (n_all == 1) the model input feeds it directly.
155 | 
156 |    if (n_all >= 1) {
157 |       for (i=0 ; i<ntarg ; i++)
158 |          activity ( feed , final_layer_weights+i*(nfeed+1) , outputs+i , nfeed , 1 ) ;
159 |       }
160 | 
161 |    // A classifier converts its outputs to SoftMax probabilities.
162 |    // Exponents are capped at 300 to keep exp() within range.
163 | 
164 |    if (classifier) {
165 |       total = 0.0 ;
166 |       for (i=0 ; i<ntarg ; i++) {
167 |          outputs[i] = exp ( (outputs[i] < 300.0)  ?  outputs[i] : 300.0 ) ;
168 |          total += outputs[i] ;
169 |          }
170 |       for (i=0 ; i<ntarg ; i++)
171 |          outputs[i] /= total ;
172 |       }
173 | }
174 | 
175 | 
176 | /*
177 | ------------------------------------------------------------------------------------------------
178 | 
179 |    Threaded routine that cumulates error for a batch
180 | 
181 | ------------------------------------------------------------------------------------------------
182 | */
183 | 
184 | static double batch_error (
185 |    int istart ,                    // Index of starting case in input matrix
186 |    int istop ,                     // And one past last case
187 |    int max_neurons ,               // Number of columns in input matrix; may exceed n_model_inputs
188 |    double *input ,                 // Input matrix; each case is max_neurons long
189 |    int n_all ,                     // Number of layers, including output, not including input
190 |    int n_model_inputs ,            // Number of inputs to the model; Input matrix may have more columns
191 |    double *outputs ,               // Output vector of the model; used as work vector here
192 |    int ntarg ,                     // Number of outputs
193 |    int *nhid_all ,                 // nhid_all[i] is the number of hidden neurons in hidden layer i
194 |    double *weights_opt[] ,         // weights_opt[i] points to the weight vector for hidden layer i
195 |    double *hid_act[] ,             // hid_act[i] points to the vector of activations of hidden layer i
196 |    double *final_layer_weights ,   // Weights of final layer
197 |    double *targets ,               // Target matrix; each case is ntarg long
198 |    int classifier                  // If nonzero use SoftMax output; else use linear output
199 |    )
200 | {
201 |    int i, icase, imax ;
202 |    double err, tot_err, *dptr, diff, tmax ;
203 | 
204 |    tot_err = 0.0 ;  // Total error will be cumulated here
205 | 
206 |    for (icase=istart ; icase<istop ; icase++) {  // Do all samples
207 | 
208 |       dptr = input + icase * max_neurons ; // Point to this sample
209 |       trial_thr ( dptr , n_all , n_model_inputs , outputs ,  ntarg , nhid_all ,
210 |                   weights_opt , hid_act , final_layer_weights , classifier ) ;
211 |       err = 0.0 ;
212 |       dptr = targets + icase * ntarg ;     // Point to this sample's targets
213 | 
214 |       if (classifier) {               // SoftMax
215 |          imax = 0 ;                   // Keeps imax defined if no target exceeds tmax (e.g. NaN targets)
216 |          tmax = -1.e30 ;
217 |          for (i=0 ; i<ntarg ; i++) {  // Find the true class as that having max target
218 |             if (*dptr > tmax) {
219 |                imax = i ;
220 |                tmax = *dptr ;
221 |                }
222 |             ++dptr ;
223 |             }
224 |          err = -log ( outputs[imax] + 1.e-30 ) ;   // Negative log likelihood of the true class
225 |          }
226 | 
227 |       else {
228 |          for (i=0 ; i<ntarg ; i++) {
229 |             diff = *dptr++ - outputs[i] ;
230 |             err += diff * diff ;     // Squared error summed over the outputs
231 |             }
232 |          }
233 | 
234 |       tot_err += err ;
235 |       } // for all cases
236 | 
237 |    return tot_err ;  // Sum over the batch (not a mean) of squared error or negative log likelihood
238 | }
239 | 
240 | 
241 | /*
242 | --------------------------------------------------------------------------------
243 | 
244 |    batch_gradient - Cumulate the gradient for a given subset of inputs
245 | 
246 |    Note: grad is all gradients as a vector, and grad_ptr[ilayer] points to
247 |          the entry in grad that is for the first weight in a layer
248 | 
249 | --------------------------------------------------------------------------------
250 | */
251 | 
252 | 
253 | static double batch_gradient (
254 |    int istart ,                    // Index of starting case in input matrix
255 |    int istop ,                     // And one past last case
256 |    double *input ,                 // Input matrix; each case is max_neurons long
257 |    double *targets ,               // Target matrix; each case is ntarg long
258 |    int n_all ,                     // Number of layers, including output, not including input
259 |    int n_all_weights ,             // Total number of weights, including final layer and all bias terms
260 |    int n_model_inputs ,            // Number of inputs to the model; Input matrix may have more columns
261 |    double *outputs ,               // Output vector of the model; used as work vector here
262 |    int ntarg ,                     // Number of outputs
263 |    int *nhid_all ,                 // nhid_all[i] is the number of hidden neurons in hidden layer i
264 |    double *weights_opt[] ,         // weights_opt[i] points to the weight vector for hidden layer i
265 |    double *hid_act[] ,             // hid_act[i] points to the vector of activations of hidden layer i
266 |    int max_neurons ,               // Number of columns in input matrix; may exceed n_model_inputs
267 |    double *this_delta ,            // Delta for the current layer
268 |    double *prior_delta ,           // And saved for use in the prior (next to be processed) layer
269 |    double **grad_ptr ,             // grad_ptr[i] points to gradient for layer i
270 |    double *final_layer_weights ,   // Weights of final layer
271 |    double *grad ,                  // All computed gradients, strung out as a single long vector
272 |    int classifier                  // If nonzero use SoftMax output; else use linear output
273 |    )
274 | {
275 |    int i, j, icase, ilayer, nprev, nthis, nnext, imax ;
276 |    double diff, *dptr, error, *targ_ptr, *prevact, *gradptr, delta, *nextcoefs, tmax ;
277 | 
278 |    for (i=0 ; i<n_all_weights ; i++)  // Zero gradient for summing
279 |       grad[i] = 0.0 ;                 // All layers are strung together here
280 | 
281 |    error = 0.0 ;  // Will cumulate total error here
282 | 
283 |    for (icase=istart ; icase<istop ; icase++) {
284 | 
285 |       dptr = input + icase * max_neurons ; // Point to this sample
286 |       trial_thr ( dptr , n_all , n_model_inputs , outputs ,  ntarg , nhid_all ,
287 |                   weights_opt , hid_act , final_layer_weights , classifier ) ;
288 | 
289 |       targ_ptr = targets + icase * ntarg ;
290 | 
291 |       if (classifier) {               // SoftMax
292 |          tmax = -1.e30 ;
293 |          for (i=0 ; i<ntarg ; i++) {  // Find the true class as that having max target
294 |             if (targ_ptr[i] > tmax) { // To save a small amount of time we could precompute this
295 |                imax = i ;
296 |                tmax = targ_ptr[i] ;
297 |                }
298 |             this_delta[i] = targ_ptr[i] - outputs[i] ; // Neg deriv of cross entropy wrt input (logit) i
299 |             }
300 |          error -= log ( outputs[imax] + 1.e-30 ) ;
301 |          }
302 | 
303 |       else {
304 |          for (i=0 ; i<ntarg ; i++) {
305 |             diff = outputs[i] - targ_ptr[i] ;
306 |             error += diff * diff ;
307 |             this_delta[i] = -2.0 * diff ; // Neg deriv of squared error wrt input to neuron i
308 |             }
309 |          }
310 | 
311 | /*
312 |    Cumulate output gradient
313 | */
314 | 
315 |       if (n_all == 1) {                           // No hidden layer
316 |          nprev = n_model_inputs ;                 // Number of inputs to the output layer
317 |          prevact = input + icase * max_neurons ;  // Point to this sample
318 |          }
319 |       else {
320 |          nprev = nhid_all[n_all-2] ;        // n_all-2 is the last hidden layer
321 |          prevact = hid_act[n_all-2] ;       // Point to layer feeding the output layer
322 |          }
323 |       gradptr = grad_ptr[n_all-1] ;         // Point to output gradient in grand gradient vector
324 |       for (i=0 ; i<ntarg ; i++) {           // For all output neurons
325 |          delta = this_delta[i] ;            // Neg deriv of criterion wrt logit
326 |          for (j=0 ; j<nprev ; j++)
327 |             *gradptr++ += delta * prevact[j] ; // Cumulate for all training cases
328 |          *gradptr++ += delta ;              // Bias activation is always 1
329 |          }
330 | 
331 |       nnext = ntarg ;                       // Prepare for moving back one layer
332 |       nextcoefs = final_layer_weights ;
333 | 
334 | /*
335 |    Cumulate hidden gradients
336 | */
337 | 
338 |       for (ilayer=n_all-2 ; ilayer>=0 ; ilayer--) {   // For each hidden layer, working backwards
339 |          nthis = nhid_all[ilayer] ;        // Number of neurons in this hidden layer
340 |          gradptr = grad_ptr[ilayer] ;      // Point to gradient for this layer
341 |          for (i=0 ; i<nthis ; i++) {       // For each neuron in this layer
342 |             delta = 0.0 ;
343 |             for (j=0 ; j<nnext ; j++)
344 |                delta += this_delta[j] * nextcoefs[j*(nthis+1)+i] ;
345 |             delta *= hid_act[ilayer][i] * (1.0 - hid_act[ilayer][i]) ;  // Derivative
346 |             prior_delta[i] = delta ;                    // Save it for the next layer back
347 |             if (ilayer == 0) {                          // First hidden layer?
348 |                prevact = input + icase * max_neurons ;  // Point to this sample
349 |                for (j=0 ; j<n_model_inputs ; j++)
350 |                   *gradptr++ += delta * prevact[j] ;
351 |                }
352 |             else {      // There is at least one more hidden layer prior to this one
353 |                prevact = hid_act[ilayer-1] ;
354 |                for (j=0 ; j<nhid_all[ilayer-1] ; j++)
355 |                   *gradptr++ += delta * prevact[j] ;
356 |                }
357 |             *gradptr++ += delta ;   // Bias activation is always 1
358 |             }  // For all neurons in this hidden layer
359 | 
360 |          for (i=0 ; i<nthis ; i++)           // These will be delta for the next layer back
361 |             this_delta[i] = prior_delta[i] ;
362 | 
363 |          nnext = nhid_all[ilayer] ;          // Prepare for the next layer back
364 |          nextcoefs = weights_opt[ilayer] ;
365 |          }  // For all layers, working backwards
366 | 
367 |       } // for all cases
368 | 
369 |    return error ;  // MSE or negative log likelihood
370 | }
371 | 
372 | 
373 | /*
374 | --------------------------------------------------------------------------------
375 | 
376 |    Thread stuff...
377 |       Structure for passing information to/from threaded code
378 |       Threaded code is called by the main subroutine
379 | 
380 | --------------------------------------------------------------------------------
381 | */
382 | 
/*
   Parameter block handed to batch_error_wrapper() through the thread API.
   One block per worker thread; the wrapper forwards every field except 'error'
   to batch_error() and stores that routine's return value in 'error'.
*/

typedef struct ERR_THR_PARAMS {
   int istart ;                  // Index of the first case in this thread's batch
   int istop ;                   // Index one past the last case in the batch
   int classifier ;              // Nonzero selects SoftMax output; zero selects linear output
   int max_neurons ;             // Number of columns in the input matrix
   int n_all ;                   // Number of layers, including the output layer
   int n_model_inputs ;          // Number of input columns actually used by the model
   int ntarg ;                   // Number of targets (output neurons)
   int *nhid_all ;               // nhid_all[i] is the number of neurons in hidden layer i
   double *input ;               // Input matrix, one row per case
   double *outputs ;             // This thread's private output work vector
   double **weights_opt ;        // weights_opt[i] points to the weights of hidden layer i
   double **hid_act ;            // hid_act[i] points to this thread's activations for hidden layer i
   double *final_layer_weights ; // Weights of the final (output) layer
   double *target ;              // Target matrix, ntarg columns per case
   double error ;                // Result: value returned by batch_error() for this batch
} ERR_THR_PARAMS ;
400 | 
401 | static unsigned int __stdcall batch_error_wrapper ( LPVOID dp )
402 | {
403 | ((ERR_THR_PARAMS *) dp)->error = batch_error (
404 |                           ((ERR_THR_PARAMS *) dp)->istart ,
405 |                           ((ERR_THR_PARAMS *) dp)->istop ,
406 |                           ((ERR_THR_PARAMS *) dp)->max_neurons ,
407 |                           ((ERR_THR_PARAMS *) dp)->input ,
408 |                           ((ERR_THR_PARAMS *) dp)->n_all ,
409 |                           ((ERR_THR_PARAMS *) dp)->n_model_inputs ,
410 |                           ((ERR_THR_PARAMS *) dp)->outputs ,
411 |                           ((ERR_THR_PARAMS *) dp)->ntarg ,
412 |                           ((ERR_THR_PARAMS *) dp)->nhid_all ,
413 |                           ((ERR_THR_PARAMS *) dp)->weights_opt ,
414 |                           ((ERR_THR_PARAMS *) dp)->hid_act ,
415 |                           ((ERR_THR_PARAMS *) dp)->final_layer_weights ,
416 |                           ((ERR_THR_PARAMS *) dp)->target ,
417 |                           ((ERR_THR_PARAMS *) dp)->classifier ) ;
418 |    return 0 ;
419 | }
420 | 
421 | 
/*
   Parameter block handed to batch_gradient_wrapper() through the thread API.
   One block per worker thread; the wrapper forwards every field except 'error'
   to batch_gradient() and stores that routine's return value in 'error'.
*/

typedef struct GRAD_THR_PARAMS {
   int istart ;                  // Index of the first case in this thread's batch
   int istop ;                   // Index one past the last case in the batch
   int classifier ;              // Nonzero selects SoftMax output; zero selects linear output
   int n_all ;                   // Number of layers, including the output layer
   int n_all_weights ;           // Length of one complete gradient vector (all layers strung together)
   int n_model_inputs ;          // Number of input columns actually used by the model
   int ntarg ;                   // Number of targets (output neurons)
   int *nhid_all ;               // nhid_all[i] is the number of neurons in hidden layer i
   int max_neurons ;             // Number of columns in the input matrix; may exceed n_model_inputs
   double *input ;               // Input matrix, one row per case
   double *targets ;             // Target matrix, ntarg columns per case
   double *outputs ;             // This thread's private output work vector
   double **weights_opt ;        // weights_opt[i] points to the weights of hidden layer i
   double **hid_act ;            // hid_act[i] points to this thread's activations for hidden layer i
   double *this_delta ;          // Work vector: delta for the layer currently being processed
   double *prior_delta ;         // Work vector: delta saved for the next layer back
   double **grad_ptr ;           // grad_ptr[i] points to layer i's section of this thread's gradient
   double *final_layer_weights ; // Weights of the final (output) layer
   double *grad ;                // This thread's gradient, all layers as a single long vector
   double error ;                // Result: value returned by batch_gradient() for this batch
} GRAD_THR_PARAMS ;
444 | 
445 | static unsigned int __stdcall batch_gradient_wrapper ( LPVOID dp )
446 | {
447 | ((GRAD_THR_PARAMS *) dp)->error = batch_gradient (
448 |                           ((GRAD_THR_PARAMS *) dp)->istart ,
449 |                           ((GRAD_THR_PARAMS *) dp)->istop ,
450 |                           ((GRAD_THR_PARAMS *) dp)->input ,
451 |                           ((GRAD_THR_PARAMS *) dp)->targets ,
452 |                           ((GRAD_THR_PARAMS *) dp)->n_all ,
453 |                           ((GRAD_THR_PARAMS *) dp)->n_all_weights ,
454 |                           ((GRAD_THR_PARAMS *) dp)->n_model_inputs ,
455 |                           ((GRAD_THR_PARAMS *) dp)->outputs ,
456 |                           ((GRAD_THR_PARAMS *) dp)->ntarg ,
457 |                           ((GRAD_THR_PARAMS *) dp)->nhid_all ,
458 |                           ((GRAD_THR_PARAMS *) dp)->weights_opt ,
459 |                           ((GRAD_THR_PARAMS *) dp)->hid_act ,
460 |                           ((GRAD_THR_PARAMS *) dp)->max_neurons ,
461 |                           ((GRAD_THR_PARAMS *) dp)->this_delta ,
462 |                           ((GRAD_THR_PARAMS *) dp)->prior_delta ,
463 |                           ((GRAD_THR_PARAMS *) dp)->grad_ptr ,
464 |                           ((GRAD_THR_PARAMS *) dp)->final_layer_weights ,
465 |                           ((GRAD_THR_PARAMS *) dp)->grad ,
466 |                           ((GRAD_THR_PARAMS *) dp)->classifier ) ;
467 |    return 0 ;
468 | }
469 | 
470 | /*
471 | --------------------------------------------------------------------------------
472 | 
473 |    gradient() - Gradient for entire model
474 | 
475 | --------------------------------------------------------------------------------
476 | */
477 | 
478 | double Model::gradient_thr (
479 |    int nc ,              // Number of cases
480 |    double *input ,       // Inputs, nc rows and max_neurons columns, of which the first n_model_inputs are used
481 |    double *target ,      // Targets, nc rows and ntarg columns
482 |    double *grad          // Concatenated gradient vector, which is computed here
483 |    )
484 | {
485 |    int i, j, ilayer, ineuron, ivar, n, istart, istop, n_done, ithread ;
486 |    int n_in_batch, n_threads, ret_val, nin_this_layer ;
487 |    int k=0 ;   // Can remove this when final assert is assured
488 |    double error, *wptr, *gptr, factor, *hid_act_ptr[MAX_THREADS][MAX_LAYERS], *grad_ptr_ptr[MAX_THREADS][MAX_LAYERS] ;
489 |    double wpen ;
490 |    char msg[256] ;
491 |    GRAD_THR_PARAMS params[MAX_THREADS] ;
492 |    HANDLE threads[MAX_THREADS] ;
493 | 
494 |    wpen = TrainParams.wpen / n_all_weights ;
495 | 
496 | /*
497 |    Compute length of grad vector and gradient positions in it.
498 |    If I ever make grad a permanent member of the model, rather than a temporary
499 |    allocated in CONJGRAD.CPP, I can do this in the constructor.  But it's very fast.
500 | */
501 | 
502 |    gptr = grad ;  // CONJGRAD.CPP allocated this n_all_weights * max_threads long
503 | 
504 |    for (ilayer=0 ; ilayer<n_all ; ilayer++) {
505 |       grad_ptr[ilayer] = gptr ;
506 | 
507 |       if (ilayer == 0  &&  n_all == 1) {             // Direct input to output?
508 |          n = ntarg * (n_model_inputs+1) ;            // This many inputs to each neuron in this layer
509 |          gptr += n ;                                 // Not needed, but it illustrates the process
510 |          k += n ;   // Can remove this when final assert is assured
511 |          }
512 | 
513 |       else if (ilayer == 0) {                        // First hidden layer?
514 |          n = nhid_all[ilayer] * (n_model_inputs+1) ; // This many inputs to each neuron in this layer
515 |          gptr += n ;
516 |          k += n ;   // Can remove this when final assert is assured
517 |          }
518 | 
519 |       else if (ilayer < n_all-1) {                       // Subsequent hidden layer?
520 |          n = nhid_all[ilayer] * (nhid_all[ilayer-1]+1) ; // This many inputs to each neuron in this layer
521 |          gptr += n ;
522 |          k += n ;   // Can remove this when final assert is assured
523 |          }
524 | 
525 |       else {
526 |          assert ( (nhid_all[ilayer-1]+1) == n_final_layer_weights ) ;
527 |          n = ntarg * (nhid_all[ilayer-1]+1) ; // This many inputs to each neuron in this layer
528 |          k += n ;   // Can remove this when final assert is assured
529 |          }
530 |       } // For all layers, including output
531 | 
532 |    assert ( k == n_all_weights ) ;
533 | 
534 |    for (i=0 ; i<max_threads ; i++) {
535 |       params[i].input = input ;
536 |       params[i].targets = targets ;
537 |       params[i].n_all = n_all ;
538 |       params[i].n_all_weights = n_all_weights ;
539 |       params[i].n_model_inputs = n_model_inputs ;
540 |       params[i].ntarg = ntarg ;
541 |       params[i].nhid_all = nhid_all ;
542 |       params[i].max_neurons = max_neurons ;
543 |       params[i].weights_opt = weights_opt ;
544 |       params[i].final_layer_weights = final_layer_weights ;
545 | 
546 |       params[i].this_delta = this_layer + i * max_neurons ;
547 |       params[i].prior_delta = prior_layer + i * max_neurons ;
548 |       params[i].outputs = outputs + i * ntarg ;
549 |       params[i].grad = grad + i * n_all_weights ;
550 |       for (j=0 ; j<n_all ; j++) {
551 |          hid_act_ptr[i][j] = hid_act[j] + i * max_neurons ;
552 |          grad_ptr_ptr[i][j] = grad_ptr[j] + i * n_all_weights ;
553 |          }
554 |       params[i].hid_act = hid_act_ptr[i] ;
555 |       params[i].grad_ptr = grad_ptr_ptr[i] ;
556 |       params[i].classifier = classifier ;
557 |       }
558 | 
559 | /*
560 | ------------------------------------------------------------------------------------------------
561 | 
562 |    Batch loop uses a different thread for each batch
563 | 
564 | ------------------------------------------------------------------------------------------------
565 | */
566 | 
567 |    n_threads = max_threads ;    // Try to use as many as possible
568 |    if (nc / n_threads < 100)    // But because threads have overhead
569 |       n_threads = 1 ;           // Avoid using them if the batch is small
570 | 
571 |    istart = 0 ;         // Batch start = training data start
572 |    n_done = 0 ;         // Number of training cases done in this epoch so far
573 | 
574 |    for (ithread=0 ; ithread<n_threads ; ithread++) {
575 |       n_in_batch = (nc - n_done) / (n_threads - ithread) ;  // Cases left to do / batches left to do
576 |       istop = istart + n_in_batch ;                         // Stop just before this index
577 | 
578 |       // Set the pointers that vary with the batch
579 | 
580 |       params[ithread].istart = istart ;
581 |       params[ithread].istop = istop ;
582 | 
583 |       threads[ithread] = (HANDLE) _beginthreadex ( NULL , 0 , batch_gradient_wrapper , &params[ithread] , 0 , NULL ) ;
584 |       if (threads[ithread] == NULL) {
585 |          audit ( "Internal ERROR: bad thread creation in MLFN_THR" ) ;
586 |          for (i=0 ; i<n_threads ; i++) {
587 |             if (threads[i] != NULL)
588 |                CloseHandle ( threads[i] ) ;
589 |             }
590 |          return -1.e40 ;
591 |          }
592 | 
593 |       n_done += n_in_batch ;
594 |       istart = istop ;
595 |       } // For all threads / batches
596 | 
597 | /*
598 |    Wait for threads to finish, and then cumulate all results into [0]
599 | */
600 | 
601 |    ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
602 |    if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
603 |       sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 1 failed (%d) in MLFN_THR.CPP", ret_val ) ;
604 |       audit ( msg ) ;
605 |       MEMTEXT ( msg ) ;
606 |       if (ret_val == WAIT_TIMEOUT)
607 |          audit ( "Timeout waiting for computation to finish; problem too large" ) ;
608 |       return -1.e40 ;
609 |       }
610 | 
611 |    CloseHandle ( threads[0] ) ;
612 |    for (ithread=1 ; ithread<n_threads ; ithread++) {
613 |       params[0].error += params[ithread].error ;
614 |       for (i=0 ; i<n_all_weights ; i++)
615 |          params[0].grad[i] += params[ithread].grad[i] ;
616 |       CloseHandle ( threads[ithread] ) ;
617 |       }
618 | 
619 | 
620 | /*
621 |    Find the mean per presentation.  Also, compensate for nout if that was
622 |    not done implicitly in the error computation.
623 | */
624 | 
625 |    factor = 1.0 / (nc * ntarg) ;
626 | 
627 |    error = factor * params[0].error ;
628 | 
629 |    for (i=0 ; i<n_all_weights ; i++)
630 |       grad[i] = factor * params[0].grad[i] ;   // Note that grad and params[0].grad are the same!
631 | 
632 | 
633 | /*
634 |    Deal with weight penalty
635 |    First block of code does hidden layers, second does output layer
636 | */
637 | 
638 |    penalty = 0.0 ;
639 |    nin_this_layer = n_model_inputs ;
640 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {  // Do all hidden layers
641 |       for (ineuron=0 ; ineuron<nhid_all[ilayer] ; ineuron++) {
642 |          wptr = weights_opt[ilayer] + ineuron*(nin_this_layer+1) ;  // Weights for this neuron in this layer
643 |          gptr = grad_ptr[ilayer] + ineuron*(nin_this_layer+1) ;     // Ditto grad
644 |          for (ivar=0 ; ivar<nin_this_layer ; ivar++) {              // Do not include bias
645 |             penalty += wptr[ivar] * wptr[ivar] ;
646 |             gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
647 |             }
648 |          }
649 |       nin_this_layer = nhid_all[ilayer] ;
650 |       }
651 | 
652 |    for (ineuron=0 ; ineuron<ntarg ; ineuron++) {
653 |       wptr = final_layer_weights + ineuron * n_final_layer_weights ;
654 |       gptr = grad_ptr[n_all-1] + ineuron * n_final_layer_weights ;
655 |       for (ivar=0 ; ivar<nin_this_layer ; ivar++) {                 // Do not include bias
656 |          penalty += wptr[ivar] * wptr[ivar] ;
657 |          gptr[ivar] -= 2.0 * wpen * wptr[ivar] ;
658 |          }
659 |       }
660 | 
661 |    penalty *= wpen ;
662 |    return error + penalty ;
663 | }
664 | 
665 | 
666 | /*
667 | --------------------------------------------------------------------------------
668 | 
669 |    trial_error_thr - Compute the mean square error for the entire training set
670 | 
671 | --------------------------------------------------------------------------------
672 | */
673 | 
674 | double Model::trial_error_thr (
675 |    int nc ,
676 |    double *input ,
677 |    double *target
678 |    )
679 | {
680 |    int i, j, ineuron, ivar, ithread, n_threads, n_in_batch, n_done, istart, istop, ret_val ;
681 |    int ilayer, nin_this_layer ;
682 |    double error, *wptr, *hid_act_ptr[MAX_THREADS][MAX_LAYERS], wpen ;
683 |    char msg[256] ;
684 |    ERR_THR_PARAMS params[MAX_THREADS] ;
685 |    HANDLE threads[MAX_THREADS] ;
686 | 
687 |    wpen = TrainParams.wpen / n_all_weights ;
688 | 
689 | /*
690 |    Initialize parameters that will not change for threads.
691 | */
692 | 
693 |    for (i=0 ; i<max_threads ; i++) {
694 |       params[i].ntarg = ntarg ;
695 |       params[i].nhid_all = nhid_all ;
696 |       params[i].max_neurons = max_neurons ;
697 |       params[i].n_all = n_all ;
698 |       params[i].n_model_inputs = n_model_inputs ;
699 |       params[i].input = input ;
700 |       params[i].weights_opt = weights_opt ;
701 |       params[i].final_layer_weights = final_layer_weights ;
702 |       params[i].target = target ;
703 |       params[i].outputs = outputs + i * ntarg ;
704 |       for (j=0 ; j<n_all ; j++)
705 |          hid_act_ptr[i][j] = hid_act[j] + i * max_neurons ;
706 |       params[i].hid_act = hid_act_ptr[i] ;
707 |       params[i].classifier = classifier ;
708 |       }
709 | 
710 | 
711 | /*
712 | ------------------------------------------------------------------------------------------------
713 | 
714 |    Batch loop uses a different thread for each batch
715 | 
716 | ------------------------------------------------------------------------------------------------
717 | */
718 | 
719 |    n_threads = max_threads ;    // Try to use as many as possible
720 |    if (nc / n_threads < 100)    // But because threads have overhead
721 |       n_threads = 1 ;           // Avoid using them if the batch is small
722 | 
723 |    istart = 0 ;         // Batch start = training data start
724 |    n_done = 0 ;         // Number of training cases done in this epoch so far
725 | 
726 |    for (ithread=0 ; ithread<n_threads ; ithread++) {
727 |       n_in_batch = (nc - n_done) / (n_threads - ithread) ;  // Cases left to do / batches left to do
728 |       istop = istart + n_in_batch ;                         // Stop just before this index
729 | 
730 |       // Set the pointers that vary with the batch
731 | 
732 |       params[ithread].istart = istart ;
733 |       params[ithread].istop = istop ;
734 | 
735 |       threads[ithread] = (HANDLE) _beginthreadex ( NULL , 0 , batch_error_wrapper , &params[ithread] , 0 , NULL ) ;
736 |       if (threads[ithread] == NULL) {
737 |          audit ( "Internal ERROR: bad thread creation in MLFN_THR" ) ;
738 |          for (i=0 ; i<n_threads ; i++) {
739 |             if (threads[i] != NULL)
740 |                CloseHandle ( threads[i] ) ;
741 |             }
742 |          return -1.e40 ;
743 |          }
744 | 
745 |       n_done += n_in_batch ;
746 |       istart = istop ;
747 |       } // For all threads / batches
748 | 
749 | /*
750 |    Wait for threads to finish
751 | */
752 | 
753 |    ret_val = WaitForMultipleObjects ( n_threads , threads , TRUE , 1200000 ) ;
754 |    if (ret_val == WAIT_TIMEOUT  ||  ret_val == WAIT_FAILED  ||  ret_val < 0  ||  ret_val >= n_threads) {
755 |       sprintf ( msg, "INTERNAL ERROR!!!  Thread wait 2 failed (%d) in MLFN_THR.CPP", ret_val ) ;
756 |       audit ( msg ) ;
757 |       MEMTEXT ( msg ) ;
758 |       if (ret_val == WAIT_TIMEOUT)
759 |          audit ( "Timeout waiting for computation to finish; problem too large" ) ;
760 |       return -1.e40 ;
761 |       }
762 | 
763 |    error = 0.0 ;        // Cumulates squared reproduction error or negative log likelihood (for classifier)
764 |    for (ithread=0 ; ithread<n_threads ; ithread++) {
765 |       error += params[ithread].error ;
766 |       CloseHandle ( threads[ithread] ) ;
767 |       }
768 | 
769 | 
770 |    error /= nc * ntarg ;
771 | 
772 | 
773 | /*
774 |    Deal with weight penalty
775 | */
776 | 
777 |    penalty = 0.0 ;
778 |    nin_this_layer = n_model_inputs ;
779 |    for (ilayer=0 ; ilayer<n_all-1 ; ilayer++) {  // Do all hidden layers
780 |       for (ineuron=0 ; ineuron<nhid_all[ilayer] ; ineuron++) {
781 |          wptr = weights_opt[ilayer]+ineuron*(nin_this_layer+1) ;  // Weights for this neuron in this layer
782 |          for (ivar=0 ; ivar<nin_this_layer ; ivar++)
783 |             penalty += wptr[ivar] * wptr[ivar] ;
784 |          }
785 |       nin_this_layer = nhid_all[ilayer] ;
786 |       }
787 | 
788 |    for (ineuron=0 ; ineuron<ntarg ; ineuron++) {
789 |       wptr = final_layer_weights + ineuron * n_final_layer_weights ;
790 |       for (ivar=0 ; ivar<nin_this_layer ; ivar++)
791 |          penalty += wptr[ivar] * wptr[ivar] ;
792 |       }
793 | 
794 |    penalty *= wpen ;
795 |    return error + penalty ;
796 | }


--------------------------------------------------------------------------------