├── .gitignore
├── DataSequence.py
├── README.md
├── SemanticModel.py
├── SpeechModelTutorial - Pre-run.html
├── SpeechModelTutorial.ipynb
├── dsutils.py
├── english1000.py
├── features.py
├── interpdata.py
├── npp.py
├── ridge.py
├── stimulus_utils.py
├── textgrid.py
├── util.py
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/DataSequence.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import itertools as itools
  3 | from interpdata import sincinterp2D, gabor_xfm2D, lanczosinterp2D
  4 | 
  5 | class DataSequence(object):
  6 |     """DataSequence class provides a nice interface for handling data that is both continuous
  7 |     and discretely chunked. For example, semantic projections of speech stimuli must be
  8 |     considered both at the level of single words (which are continuous throughout the stimulus)
  9 |     and at the level of TRs (which contain discrete chunks of words).
 10 |     """
 11 |     def __init__(self, data, split_inds, data_times=None, tr_times=None):
 12 |         """Initializes the DataSequence with the given [data] object (which can be any iterable)
 13 |         and a collection of [split_inds], which should be the indices where the data is split into
 14 |         separate TR chunks.
 15 |         """
 16 |         self.data = data
 17 |         self.split_inds = split_inds
 18 |         self.data_times = data_times
 19 |         self.tr_times = tr_times
 20 | 
 21 |     def mapdata(self, fun):
 22 |         """Creates a new DataSequence where each element of [data] is produced by mapping the
 23 |         function [fun] onto this DataSequence's [data].
 24 | 
 25 |         The [split_inds] are preserved exactly.
 26 |         """
 27 |         return DataSequence(self, map(fun, self.data), self.split_inds)
 28 | 
 29 |     def chunks(self):
 30 |         """Splits the stored [data] into the discrete chunks and returns them.
 31 |         """
 32 |         return np.split(self.data, self.split_inds)
 33 | 
 34 |     def data_to_chunk_ind(self, dataind):
 35 |         """Returns the index of the chunk containing the data with the given index.
 36 |         """
 37 |         zc = np.zeros((len(self.data),))
 38 |         zc[dataind] = 1.0
 39 |         ch = np.array([ch.sum() for ch in np.split(zc, self.split_inds)])
 40 |         return np.nonzero(ch)[0][0]
 41 | 
 42 |     def chunk_to_data_ind(self, chunkind):
 43 |         """Returns the indexes of the data contained in the chunk with the given index.
 44 |         """
 45 |         return list(np.split(np.arange(len(self.data)), self.split_inds)[chunkind])
 46 | 
 47 |     def chunkmeans(self):
 48 |         """Splits the stored [data] into the discrete chunks, then takes the mean of each chunk
 49 |         (this is assuming that [data] is a numpy array) and returns the resulting matrix with
 50 |         one row per chunk.
 51 |         """
 52 |         dsize = self.data.shape[1]
 53 |         outmat = np.zeros((len(self.split_inds)+1, dsize))
 54 |         for ci, c in enumerate(self.chunks()):
 55 |             if len(c):
 56 |                 outmat[ci] = np.vstack(c).mean(0)
 57 | 
 58 |         return outmat
 59 | 
 60 |     def chunksums(self, interp="rect", **kwargs):
 61 |         """Splits the stored [data] into the discrete chunks, then takes the sum of each chunk
 62 |         (this is assuming that [data] is a numpy array) and returns the resulting matrix with
 63 |         one row per chunk.
 64 | 
 65 |         If [interp] is "sinc", the signal will be downsampled using a truncated sinc filter
 66 |         instead of a rectangular filter.
 67 | 
 68 |         if [interp] is "lanczos", the signal will be downsampled using a Lanczos filter.
 69 | 
 70 |         [kwargs] are passed to the interpolation function.
 71 |         """
 72 |         if interp=="sinc":
 73 |             ## downsample using sinc filter
 74 |             return sincinterp2D(self.data, self.data_times, self.tr_times, **kwargs)
 75 |         elif interp=="lanczos":
 76 |             ## downsample using Lanczos filter
 77 |             return lanczosinterp2D(self.data, self.data_times, self.tr_times, **kwargs)
 78 |         elif interp=="gabor":
 79 |             ## downsample using Gabor filter
 80 |             return np.abs(gabor_xfm2D(self.data.T, self.data_times, self.tr_times, **kwargs)).T
 81 |         else:
 82 |             dsize = self.data.shape[1]
 83 |             outmat = np.zeros((len(self.split_inds)+1, dsize))
 84 |             for ci, c in enumerate(self.chunks()):
 85 |                 if len(c):
 86 |                     outmat[ci] = np.vstack(c).sum(0)
 87 |                     
 88 |             return outmat
 89 | 
 90 |     def copy(self):
 91 |         """Returns a copy of this DataSequence.
 92 |         """
 93 |         return DataSequence(list(self.data), self.split_inds.copy(), self.data_times, self.tr_times)
 94 |     
 95 |     @classmethod
 96 |     def from_grid(cls, grid_transcript, trfile):
 97 |         """Creates a new DataSequence from a [grid_transript] and a [trfile].
 98 |         grid_transcript should be the product of the 'make_simple_transcript' method of TextGrid.
 99 |         """
100 |         data_entries = list(zip(*grid_transcript))[2]
101 |         if isinstance(data_entries[0], str):
102 |             data = list(map(str.lower, list(zip(*grid_transcript))[2]))
103 |         else:
104 |             data = data_entries
105 |         word_starts = np.array(list(map(float, list(zip(*grid_transcript))[0])))
106 |         word_ends = np.array(list(map(float, list(zip(*grid_transcript))[1])))
107 |         word_avgtimes = (word_starts + word_ends)/2.0
108 |         
109 |         tr = trfile.avgtr
110 |         trtimes = trfile.get_reltriggertimes()
111 |         
112 |         split_inds = [(word_starts<(t+tr)).sum() for t in trtimes][:-1]
113 |         return cls(data, split_inds, word_avgtimes, trtimes+tr/2.0)
114 | 
115 |     @classmethod
116 |     def from_chunks(cls, chunks):
117 |         """The inverse operation of DataSequence.chunks(), this function concatenates
118 |         the [chunks] and infers split_inds.
119 |         """
120 |         lens = map(len, chunks)
121 |         split_inds = np.cumsum(lens)[:-1]
122 |         #data = reduce(list.__add__, map(list, chunks)) ## 2.26s for 10k 6-w chunks
123 |         data = list(itools.chain(*map(list, chunks))) ## 19.6ms for 10k 6-w chunks
124 |         return cls(data, split_inds)
125 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # speechmodeltutorial
 2 | 
 3 | Originally given as a tutorial at EACL 2014 by Alex Huth.
 4 | 
 5 | In this tutorial you will step through a voxel-wise modeling analysis. You will use computational models to extract semantic features from a natural speech stimulus. Then these features will be used to build linear models of fMRI data, and model weights and prediction performance will be visualized.
 6 | 
 7 | If you so desire, you can step through this entire tutorial without modifying any code. But there are a few points where you will be able to make simple modifications and then see what effect those modifications have on the results. Additionally, at the end you can re-run the model using phoneme features instead of semantic features.
 8 | 
 9 | #### Acknowledgements
10 | This fMRI data used in this tutorial was collected by Alex Huth and Wendy de Heer at the University of California, Berkeley. All work was supervised by professors Jack Gallant and Frederic Theunissen of the UC Berkeley Psychology Department. Please do not redistribute the code or data used here. Visualization is done using [pycortex](https://pycortex.github.io/).
11 | 
12 | #### Citation
13 | The analysis demonstrated in this tutorial forms the basis of this paper:
14 | [Huth, A. G. et al., "Natural speech reveals the semantic maps that tile human cerebral cortex" (2016) _Nature_.](https://www.nature.com/articles/nature17637)
15 | 
16 | Installation
17 | ------------
18 | 1. Download the [data files](https://utexas.box.com/shared/static/4n3lemyec0wlj5rcr80991nxwflsbks9.zip) and unzip in this directory. Should create a directory called `data`.
19 | 2. (If not using Anaconda) install dependencies:
20 | `sudo apt-get update`
21 | `sudo apt-get install -y ipython ipython-notebook python-numpy python-scipy python-matplotlib cython python-pip python-pip python-dev python-h5py python-nibabel python-lxml python-shapely python-html5lib mayavi2 python-tables git`
22 | 
23 |     (If using Conda): `conda install python 'cython=0.29.36' pytables h5py jupyter matplotlib numpy scipy` (NOTE: some packages may be missing from this list)
24 | 
25 |     (The cython requirement is from this issue: https://github.com/gallantlab/pycortex/issues/490#issuecomment-1644641810 )
26 |   
27 | 4. Fetch and install pycortex:
28 | `git clone https://github.com/gallantlab/pycortex.git`
29 | `cd pycortex; python setup.py install`
30 | 5. Start a Jupyter notebook server in this directory (if you don't have one):
31 | `jupyter notebook`
32 | 


--------------------------------------------------------------------------------
/SemanticModel.py:
--------------------------------------------------------------------------------
  1 | import tables
  2 | import pickle
  3 | import numpy as np
  4 | 
  5 | import logging
  6 | logger = logging.getLogger("SemanticModel")
  7 | 
  8 | class SemanticModel(object):
  9 |     """This class defines a semantic vector-space model based on HAL or LSA with some
 10 |     prescribed preprocessing pipeline.
 11 |     
 12 |     It contains two important variables: vocab and data.
 13 |     vocab is a 1D list (or array) of words.
 14 |     data is a 2D array (features by words) of word-feature values.
 15 |     """
 16 |     def __init__(self, data, vocab):
 17 |         """Initializes a SemanticModel with the given [data] and [vocab].
 18 |         """
 19 |         self.data = data
 20 |         self.vocab = vocab
 21 |     
 22 |     def get_ndim(self):
 23 |         """Returns the number of dimensions in this model.
 24 |         """
 25 |         return self.data.shape[0]
 26 |     ndim = property(get_ndim)
 27 | 
 28 |     def get_vindex(self):
 29 |         """Return {vocab: index} dictionary.
 30 |         """
 31 |         if "_vindex" not in dir(self):
 32 |             self._vindex = dict([(v,i) for (i,v) in enumerate(self.vocab)])
 33 |         return self._vindex
 34 |     vindex = property(get_vindex)
 35 | 
 36 |     def __getitem__(self, word):
 37 |         """Returns the vector corresponding to the given [word].
 38 |         """
 39 |         return self.data[:,self.vindex[word]]
 40 |     
 41 |     def load_root(self, rootfile, vocab):
 42 |         """Load the SVD-generated semantic vector space from [rootfile], assumed to be
 43 |         an HDF5 file.
 44 |         """
 45 |         roothf = tables.openFile(rootfile)
 46 |         self.data = roothf.getNode("/R").read()
 47 |         self.vocab = vocab
 48 |         roothf.close()
 49 | 
 50 |     def load_ascii_root(self, rootfile, vocab):
 51 |         """Loads the SVD-generated semantic vector space from [rootfile], assumed to be
 52 |         an ASCII dense matrix output from SDVLIBC.
 53 |         """
 54 |         vtfile = open(rootfile)
 55 |         nrows, ncols = map(int, vtfile.readline().split())
 56 |         Vt = np.zeros((nrows,ncols))
 57 |         nrows_done = 0
 58 |         for row in vtfile:
 59 |             Vt[nrows_done,:] = map(float, row.split())
 60 |             nrows_done += 1
 61 | 
 62 |         self.data = Vt
 63 |         self.vocab = vocab
 64 |     
 65 |     def restrict_by_occurrence(self, min_rank=60, max_rank=60000):
 66 |         """Restricts the data to words that have an occurrence rank lower than
 67 |         [min_rank] and higher than [max_rank].
 68 |         """
 69 |         logger.debug("Restricting words by occurrence..")
 70 |         nwords = self.data.shape[1]
 71 |         wordranks = np.argsort(np.argsort(self.data[0,:]))
 72 |         goodwords = np.nonzero(np.logical_and((nwords-wordranks)>min_rank,
 73 |                                               (nwords-wordranks)<max_rank))[0]
 74 | 
 75 |         self.data = self.data[:,goodwords]
 76 |         self.vocab = [self.vocab[i] for i in goodwords]
 77 |         logger.debug("Done restricting words..")
 78 | 
 79 |     def pca_reduce(self, ndims):
 80 |         """Reduces the dimensionality of the vector-space using PCA.
 81 |         """
 82 |         logger.debug("Reducing with PCA to %d dimensions"%ndims)
 83 |         U,S,Vh = np.linalg.svd(self.data, full_matrices=False)
 84 |         self.data = np.dot(Vh[:ndims].T, np.diag(S[:ndims])).T
 85 |         logger.debug("Done with PCA..")
 86 | 
 87 |     def pca_reduce_multi(self, ndimlist):
 88 |         """Reduces the dimensionality of the vector-space using PCA for many
 89 |         different numbers of dimensions.  More efficient than running
 90 |         pca_reduce many times.
 91 |         
 92 |         Instead of modifying this object, this function returns a list of new
 93 |         SemanticModels with the specified numbers of dimensions.
 94 |         """
 95 |         logger.debug("Reducing with PCA to fewer dimensions..")
 96 |         U,S,Vh = np.linalg.svd(self.data, full_matrices=False)
 97 |         newmodels = []
 98 |         for nd in ndimlist:
 99 |             newmodel = SemanticModel()
100 |             newmodel.vocab = list(self.vocab)
101 |             newmodel.data = np.dot(Vh[:nd].T, np.diag(S[:nd])).T
102 |             newmodels.append(newmodel)
103 |         return newmodels
104 |     
105 |     def save(self, filename):
106 |         """Saves this semantic model at the given filename.
107 |         """
108 |         logger.debug("Saving file: %s"%filename)
109 |         shf = tables.openFile(filename, mode="w", title="SemanticModel")
110 |         shf.createArray("/", "data", self.data)
111 |         shf.createArray("/", "vocab", self.vocab)
112 |         shf.close()
113 |         logger.debug("Done saving file..")
114 | 
115 |     @classmethod
116 |     def load(cls, filename):
117 |         """Loads a semantic model from the given filename.
118 |         """
119 |         logger.debug("Loading file: %s"%filename)
120 |         shf = tables.open_file(filename)
121 | 
122 |         newsm = cls(None, None)
123 |         newsm.data = shf.get_node("/data").read()
124 |         newsm.vocab = [s.decode('utf-8') for s in shf.get_node("/vocab").read()]
125 |         
126 |         shf.close()
127 |         logger.debug("Done loading file..")
128 |         return newsm
129 |     
130 |     def copy(self):
131 |         """Returns a copy of this model.
132 |         """
133 |         logger.debug("Copying model..")
134 |         cp = SemanticModel(self.data.copy(), list(self.vocab))
135 |         logger.debug("Done copying model..")
136 |         return cp
137 | 
138 |     def project_stims(self, stimwords):
139 |         """Projects the stimuli given in [stimwords], which should be a list of lists
140 |         of words, into this feature space. Returns the average feature vector across
141 |         all the words in each stimulus.
142 |         """
143 |         logger.debug("Projecting stimuli..")
144 |         stimlen = len(stimwords)
145 |         ndim = self.data.shape[0]
146 |         pstim = np.zeros((stimlen, ndim))
147 |         vset = set(self.vocab)
148 |         for t in range(stimlen):
149 |             dropped = 0
150 |             for w in stimwords[t]:
151 |                 dropped = 0
152 |                 if w in vset:
153 |                     pstim[t] += self[w]
154 |                 else:
155 |                     dropped += 1
156 |             
157 |             pstim[t] /= (len(stimwords[t])-dropped)
158 | 
159 |         return pstim
160 | 
161 |     def uniformize(self):
162 |         """Uniformizes each feature.
163 |         """
164 |         logger.debug("Uniformizing features..")
165 |         R = np.zeros_like(self.data).astype(np.uint32)
166 |         for ri in range(self.data.shape[0]):
167 |             R[ri] = np.argsort(np.argsort(self.data[ri]))
168 |         
169 |         self.data = R.astype(np.float64)
170 |         logger.debug("Done uniformizing...")
171 | 
172 |     def gaussianize(self):
173 |         """Gaussianizes each feature.
174 |         """
175 |         logger.debug("Gaussianizing features..")
176 |         self.data = gaussianize_mat(self.data.T).T
177 |         logger.debug("Done gaussianizing..")
178 | 
179 |     def zscore(self, axis=0):
180 |         """Z-scores either each feature (if axis is 0) or each word (if axis is 1).
181 |         If axis is None nothing will be Z-scored.
182 |         """
183 |         if axis is None:
184 |             logger.debug("Not Z-scoring..")
185 |             return
186 |         
187 |         logger.debug("Z-scoring on axis %d"%axis)
188 |         if axis==1:
189 |             self.data = zscore(self.data.T).T
190 |         elif axis==0:
191 |             self.data = zscore(self.data)
192 |     
193 |     def rectify(self):
194 |         """Rectifies the features.
195 |         """
196 |         self.data = np.vstack([-np.clip(self.data, -np.inf, 0), np.clip(self.data, 0, np.inf)])
197 |     
198 |     def clip(self, sds):
199 |         """Clips feature values more than [sds] standard deviations away from the mean
200 |         to that value.  Another method for dealing with outliers.
201 |         """
202 |         logger.debug("Truncating features to %d SDs.."%sds)
203 |         fsds = self.data.std(1)
204 |         fms = self.data.mean(1)
205 |         newdata = np.zeros(self.data.shape)
206 |         for fi in range(self.data.shape[0]):
207 |             newdata[fi] = np.clip(self.data[fi],
208 |                                   fms[fi]-sds*fsds[fi],
209 |                                   fms[fi]+sds*fsds[fi])
210 | 
211 |         self.data = newdata
212 |         logger.debug("Done truncating..")
213 | 
214 |     def find_words_like_word(self, word, n=10):
215 |         """Finds the [n] words most like the given [word].
216 |         """
217 |         return self.find_words_like_vec(self.data[:,self.vocab.index(word)], n)
218 | 
219 |     def find_words_like_vec(self, vec, n=10, corr=True):
220 |         """Finds the [n] words most like the given [vector].
221 |         """
222 |         nwords = len(self.vocab)
223 |         if corr:
224 |             corrs = np.nan_to_num([np.corrcoef(vec, self.data[:,wi])[1,0] for wi in range(nwords)])
225 |             scorrs = np.argsort(corrs)
226 |             words = list(reversed([(corrs[i], self.vocab[i]) for i in scorrs[-n:]]))
227 |         else:
228 |             proj = np.nan_to_num(np.dot(vec, self.data))
229 |             sproj = np.argsort(proj)
230 |             words = list(reversed([(proj[i], self.vocab[i]) for i in sproj[-n:]]))
231 |         return words
232 | 
233 |     def find_words_like_vecs(self, vecs, n=10, corr=True, distance_cull=None):
234 |         """Find the `n` words most like each vector in `vecs`.
235 |         """
236 |         if corr:
237 |             from text.npp import xcorr
238 |             vproj = xcorr(vecs, self.data.T)
239 |         else:
240 |             vproj = np.dot(vecs, self.data)
241 | 
242 |         return np.vstack([self._get_best_words(vp, n, distance_cull) for vp in vproj])
243 | 
244 |     def _get_best_words(self, proj, n=10, distance_cull=None):
245 |         """Find the `n` words corresponding to the highest values in the vector `proj`.
246 |         If `distance_cull` is an int, greedily find words with the following algorithm:
247 |         1. Initialize the possible set of words with all words.
248 |         2. Add the best possible word, w*. Remove w* from the possible set.
249 |         3. Remove the `distance_cull` closest neighbors of w* from the possible set.
250 |         4. Goto 2.
251 |         """
252 |         vocarr = np.array(self.vocab)
253 |         if distance_cull is None:
254 |             return vocarr[np.argsort(proj)[-n:][::-1]]
255 |         elif not isinstance(distance_cull, int):
256 |             raise TypeError("distance_cull should be an integer value, not %s" % str(distance_cull))
257 | 
258 |         poss_set = set(self.vocab)
259 |         poss_set = np.arange(len(self.vocab))
260 |         best_words = []
261 |         while len(best_words) < n:
262 |             # Find best word in poss_set
263 |             best_poss = poss_set[proj[poss_set].argmax()]
264 |             # Add word to best_words
265 |             best_words.append(self.vocab[best_poss])
266 |             # Remove nearby words (by L2-norm..?)
267 |             bwdists = ((self.data.T - self.data[:,best_poss])**2).sum(1)
268 |             nearest_inds = np.argsort(bwdists)[:distance_cull+1]
269 |             poss_set = np.setdiff1d(poss_set, nearest_inds)
270 | 
271 |         return np.array(best_words)
272 |     
273 |     def similarity(self, word1, word2):
274 |         """Returns the correlation between the vectors for [word1] and [word2].
275 |         """
276 |         return np.corrcoef(self.data[:,self.vocab.index(word1)], self.data[:,self.vocab.index(word2)])[0,1]
277 | 
278 |     def print_best_worst(self, ii, n=10):
279 |         vector = self.data[ii]
280 |         sv = np.argsort(self.data[ii])
281 |         print ("Best:")
282 |         print ("-------------")
283 |         for ni in range(1,n+1):
284 |             print ("%s: %0.08f"%(np.array(self.vocab)[sv[-ni]], vector[sv[-ni]]))
285 |             
286 |         print ("\nWorst:")
287 |         print ("-------------")
288 |         for ni in range(n):
289 |             print ("%s: %0.08f"%(np.array(self.vocab)[sv[ni]], vector[sv[ni]]))
290 |             
291 |         print ("\n")
292 | 
293 | 
294 | def gaussianize(vec):
295 |     """Uses a look-up table to force the values in [vec] to be gaussian."""
296 |     import scipy.stats
297 |     ranks = np.argsort(np.argsort(vec))
298 |     cranks = (ranks+1).astype(float)/(ranks.max()+2)
299 |     vals = scipy.stats.norm.isf(1-cranks)
300 |     zvals = vals/vals.std()
301 |     return zvals
302 | 
303 | def gaussianize_mat(mat):
304 |     """Gaussianizes each column of [mat]."""
305 |     gmat = np.empty(mat.shape)
306 |     for ri in range(mat.shape[1]):
307 |         gmat[:,ri] = gaussianize(mat[:,ri])
308 |     return gmat
309 | 
310 | def zscore(mat, return_unzvals=False):
311 |     """Z-scores the rows of [mat] by subtracting off the mean and dividing
312 |     by the standard deviation.
313 |     If [return_unzvals] is True, a matrix will be returned that can be used
314 |     to return the z-scored values to their original state.
315 |     """
316 |     zmat = np.empty(mat.shape)
317 |     unzvals = np.zeros((zmat.shape[0], 2))
318 |     for ri in range(mat.shape[0]):
319 |         unzvals[ri,0] = np.std(mat[ri,:])
320 |         unzvals[ri,1] = np.mean(mat[ri,:])
321 |         zmat[ri,:] = (mat[ri,:]-unzvals[ri,1]) / (1e-10+unzvals[ri,0])
322 |     
323 |     if return_unzvals:
324 |         return zmat, unzvals
325 |     
326 |     return zmat
327 | 


--------------------------------------------------------------------------------
/SpeechModelTutorial.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Modeling the Representation of Speech in the Brain\n",
   8 |     "Originally given as a tutorial at EACL 2014 by Alex Huth.\n",
   9 |     "\n",
  10 |     "In this tutorial you will step through a voxel-wise modeling analysis. You will use computational models to extract semantic features from a natural speech stimulus. Then these features will be used to build linear models of fMRI data, and model weights and prediction performance will be visualized.\n",
  11 |     "\n",
  12 |     "If you so desire, you can step through this entire tutorial without modifying any code. But there are a few points where you will be able to make simply modifications and then see what effect those modifications have on the results. Additionally at the end you can re-run the model using phoneme features instead of semantic features.\n",
  13 |     "\n",
  14 |     "#### Acknowledgements\n",
  15 |     "This fMRI data used in this tutorial was collected by Alex Huth and Wendy de Heer at the University of California, Berkeley. All work was supervised by professors Jack Gallant and Frederic Theunissen of the UC Berkeley Psychology Department. Please do not redistribute the code or data used here. Visualization is done using [pycortex](https://pycortex.github.io/).\n",
  16 |     "\n",
  17 |     "#### Citation\n",
  18 |     "The analysis demonstrated in this tutorial forms the basis of this paper:\n",
  19 |     "[Huth, A. G. et al., \"Natural speech reveals the semantic maps that tile human cerebral cortex\" (2016) _Nature_.](https://www.nature.com/articles/nature17637)\n",
  20 |     "\n",
  21 |     "## The experiment\n",
  22 |     "In this experiment a subject underwent fMRI scanning while they listened to roughly 2 hours of natural narrative speech stimuli. These stimuli were 10-15 minute complete stories drawn from *The Moth Radio Hour*, a radio program where storytellers tell true, autobiographical stories in front of a live audience.\n",
  23 |     "\n",
  24 |     "## This notebook\n",
  25 |     "This notebook is written in python, and is presented using the iPython notebook. **To evaluate a block of code, click in it to select it, and then press shift-Enter.** You can go through the entire tutorial (until the optional section at the end) by just running each block in turn. But you'll learn more if you stop and mess around with each block as you go.\n",
  26 |     "\n",
  27 |     "* If you're totally new to python, you might check out this collection of [simple python programs](https://wiki.python.org/moin/SimplePrograms), which is a very quick intro to python's syntax and data structures. You can also find dozens of python beginner's guides on the internet.\n",
  28 |     "* If you're new to python but know MATLAB, check out [this reference](http://wiki.scipy.org/NumPy_for_Matlab_Users). It has python/numpy equivalents for most common MATLAB functions.\n",
  29 |     "* Plotting is accomplished here using matplotlib, a MATLAB-like plotting library for python. If you want to learn more about the basics of using matplotlib, check out [this notebook](http://nbviewer.ipython.org/github/jrjohansson/scientific-python-lectures/blob/master/Lecture-4-Matplotlib.ipynb).\n",
  30 |     "* Finally, there are a number of points where the notebook imports libraries that I (Alex) have written. If you want to see the code for this, create a new cell (use the menu item `Insert > Insert cell below`), enter `%load libraryname.py`, and evaluate."
  31 |    ]
  32 |   },
  33 |   {
  34 |    "cell_type": "code",
  35 |    "execution_count": null,
  36 |    "metadata": {},
  37 |    "outputs": [],
  38 |    "source": [
  39 |     "# Run this cell if your computer has a 'retina' or high DPI display. It will make the figures look much nicer.\n",
  40 |     "%config InlineBackend.figure_format = 'retina'"
  41 |    ]
  42 |   },
  43 |   {
  44 |    "cell_type": "code",
  45 |    "execution_count": null,
  46 |    "metadata": {},
  47 |    "outputs": [],
  48 |    "source": [
  49 |     "# This cell imports libraries that you will need\n",
  50 |     "# Run this.\n",
  51 |     "from matplotlib.pyplot import figure, cm\n",
  52 |     "import numpy as np\n",
  53 |     "import logging\n",
  54 |     "logging.basicConfig(level=logging.DEBUG)"
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "markdown",
  59 |    "metadata": {},
  60 |    "source": [
  61 |     "## The semantic model: English1000\n",
  62 |     "Here you will load a precomputed vector-space semantic model. This semantic model will transform any word (well, any word it knows about) into a 985-dimensional vector. This 985-dimensional semantic space has the nice property that words that are close together tend to have similar meanings. Although it would have been fun to have tried reconstructing this semantic model in this tutorial, it takes a really long time and it doesn't seem like the parameters matter that much. So today you're just going to work with the preconstructed semantic model.\n",
  63 |     "\n",
  64 |     "The semantic model was constructed using a decently large corpus of text (a couple billion words, comprising the stories used as stimuli here, 604 popular books, 2,405,569 wikipedia pages, and 36,333,459 user comments scraped from reddit.com) and a lexicon of roughly 10,000 words. We selected 985 \"basis words\" from the Wikipedia \"List of 1000 basic words\" (contrary to the title, this lost does not actually contain 1000 words, but this is where the title of the model comes from). These are common words that span many topics.\n",
  65 |     "\n",
  66 |     "We constructed a word co-occurrence matrix, $M$, with 985 rows and 10,470 columns. Iterating through the training corpus, we added 1 to $M_{ij}$ each time word $j$ appeared within 15 words of basis word $i$. The window size of 15 was selected to be large enough to suppress syntactic effects (word order) but no larger. Once the co-occurrence matrix was complete, we log transformed the counts, replacing $M_{ij}$ with $\\log(1 + M_{ij})$. Then each row of M was z-scored to correct for differences in basis word frequency, and finally each column of $M$ was z-scored to correct for word frequency. The resulting matrix is the one you're loading here.\n",
  67 |     "\n",
  68 |     "(As an aside, while I constructed this model in a totally ad hoc and unplanned way, it has properties that are very similar to Mikolov's [word2vec model](https://code.google.com/p/word2vec/) that's recently gained a lot of popularity.)\n",
  69 |     "\n",
  70 |     "Anyway, here you are going to load the model and then play with it a bit to see how it works."
  71 |    ]
  72 |   },
  73 |   {
  74 |    "cell_type": "code",
  75 |    "execution_count": null,
  76 |    "metadata": {},
  77 |    "outputs": [],
  78 |    "source": [
  79 |     "# Load semantic model\n",
  80 |     "# The SemanticModel class is something I wrote to make it easy to deal with vector-space semantic models.\n",
  81 |     "from SemanticModel import SemanticModel\n",
  82 |     "eng1000 = SemanticModel.load(\"data/english1000sm.hf5\")"
  83 |    ]
  84 |   },
  85 |   {
  86 |    "cell_type": "code",
  87 |    "execution_count": null,
  88 |    "metadata": {},
  89 |    "outputs": [],
  90 |    "source": [
  91 |     "# You can get the vector for a word by indexing the model with that word\n",
  92 |     "# For example, the vector for \"finger\":\n",
  93 |     "print(eng1000[\"finger\"])"
  94 |    ]
  95 |   },
  96 |   {
  97 |    "cell_type": "markdown",
  98 |    "metadata": {},
  99 |    "source": [
 100 |     "### Visualizing a word\n",
 101 |     "First let's plot the length 985 vector for one word to see what it looks like."
 102 |    ]
 103 |   },
 104 |   {
 105 |    "cell_type": "code",
 106 |    "execution_count": null,
 107 |    "metadata": {},
 108 |    "outputs": [],
 109 |    "source": [
 110 |     "plot_word = \"finger\"\n",
 111 |     "\n",
 112 |     "f = figure(figsize=(15,5))\n",
 113 |     "ax = f.add_subplot(1,1,1)\n",
 114 |     "ax.plot(eng1000[plot_word], 'k')\n",
 115 |     "ax.axis(\"tight\")\n",
 116 |     "ax.set_title(\"English1000 representation for %s\" % plot_word)\n",
 117 |     "ax.set_xlabel(\"Feature number\")\n",
 118 |     "ax.set_ylabel(\"Feature value\")"
 119 |    ]
 120 |   },
 121 |   {
 122 |    "cell_type": "markdown",
 123 |    "metadata": {},
 124 |    "source": [
 125 |     "### Visualizing more than one word\n",
 126 |     "Next let's plot the vectors for three words: \"finger\", \"fingers\", and \"grief\". Here you will see that \"finger\" (in black) and \"fingers\" (in red) look very similar, but \"grief\" (in blue) looks very different. Neat."
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "code",
 131 |    "execution_count": null,
 132 |    "metadata": {},
 133 |    "outputs": [],
 134 |    "source": [
 135 |     "plot_words = [\"finger\", \"fingers\", \"language\"]\n",
 136 |     "colors = [\"k\", \"r\", \"b\"]\n",
 137 |     "\n",
 138 |     "f = figure(figsize=(15,5))\n",
 139 |     "ax = f.add_subplot(1,1,1)\n",
 140 |     "wordlines = []\n",
 141 |     "\n",
 142 |     "for word, color in zip(plot_words, colors):\n",
 143 |     "    wordlines.append(ax.plot(eng1000[word], color)[0])\n",
 144 |     "\n",
 145 |     "ax.axis(\"tight\")\n",
 146 |     "ax.set_title(\"English1000 representations for some words\")\n",
 147 |     "ax.set_xlabel(\"Feature number\")\n",
 148 |     "ax.set_ylabel(\"Feature value\")\n",
 149 |     "ax.legend(wordlines, plot_words)"
 150 |    ]
 151 |   },
 152 |   {
 153 |    "cell_type": "markdown",
 154 |    "metadata": {},
 155 |    "source": [
 156 |     "### Semantic smoothness\n",
 157 |     "One nice test of a vector-space semantic model is whether it results in a \"semantically smooth\" representation of the words. That is, do nearby words in the space have intuitively similar meanings? Here you can test that using the method `find_words_like_word`. \n",
 158 |     "\n",
 159 |     "Give any word (that the model knows about), and it will print out the 10 closest words (that it knows about) and their cosine similarities (or correlations, same thing in this case). This includes the word you supplied.\n",
 160 |     "\n",
 161 |     "In this next example it prints the closest words to \"finger\". All of the 10 closest words are semantically related: 9 are nouns, and 1 is a verb (\"stick\"; of course this is also a noun, I'm just assuming that the sense of \"stick\" that's close to \"finger\" is probably the verb sense, but this brings up an important point: this model does nothing to disambiguate between different word senses!).\n",
 162 |     "\n",
 163 |     "You can put different words in here and see what the model comes up with. \n",
 164 |     "\n",
 165 |     "*(Be warned: the model knows some dirty words. It was trained using the internet, after all.)*"
 166 |    ]
 167 |   },
 168 |   {
 169 |    "cell_type": "code",
 170 |    "execution_count": null,
 171 |    "metadata": {},
 172 |    "outputs": [],
 173 |    "source": [
 174 |     "# Test semantic model\n",
 175 |     "eng1000.find_words_like_word(\"finger\")"
 176 |    ]
 177 |   },
 178 |   {
 179 |    "cell_type": "markdown",
 180 |    "metadata": {},
 181 |    "source": [
 182 |     "Here is just another example, but this one an abstract noun, \"language\". Again the model does a pretty good job at finding related words."
 183 |    ]
 184 |   },
 185 |   {
 186 |    "cell_type": "code",
 187 |    "execution_count": null,
 188 |    "metadata": {},
 189 |    "outputs": [],
 190 |    "source": [
 191 |     "eng1000.find_words_like_word(\"language\")"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "code",
 196 |    "execution_count": null,
 197 |    "metadata": {},
 198 |    "outputs": [],
 199 |    "source": [
 200 |     "eng1000.find_words_like_vec(eng1000[\"king\"] - eng1000[\"man\"] + eng1000[\"woman\"])"
 201 |    ]
 202 |   },
 203 |   {
 204 |    "cell_type": "markdown",
 205 |    "metadata": {},
 206 |    "source": [
 207 |     "## The stimuli: Moth stories\n",
 208 |     "Next we're going to load up the stimuli. We're not going to be dealing with the actual audio of the stories that were presented, but instead with aligned transcripts. These were generated using the UPenn forced aligner (P2FA), which figures out when each word was spoken given the transcript and the audio. The transcripts are stored in TextGrid format (native to Praat), which can be loaded directly into Python using some code from the natural language toolkit (NLTK).\n",
 209 |     "\n",
 210 |     "Here you will load the TextGrids for the stories, as well as 'TRfiles', which specify the time points relative to story onset when the fMRI data was collected (roughly every 2 seconds).\n",
 211 |     "\n",
 212 |     "Finally the TextGrids and TRfiles will be combined together into a representation I call a DataSequence. There is nothing interesting going on here scientifically, this is just something to make subsequent steps more manageable."
 213 |    ]
 214 |   },
 215 |   {
 216 |    "cell_type": "code",
 217 |    "execution_count": null,
 218 |    "metadata": {},
 219 |    "outputs": [],
 220 |    "source": [
 221 |     "# These are lists of the stories\n",
 222 |     "# Rstories are the names of the training (or Regression) stories, which we will use to fit our models\n",
 223 |     "Rstories = ['alternateithicatom', 'avatar', 'howtodraw', 'legacy', \n",
 224 |     "            'life', 'myfirstdaywiththeyankees', 'naked', \n",
 225 |     "            'odetostepfather', 'souls', 'undertheinfluence']\n",
 226 |     "\n",
 227 |     "# Pstories are the test (or Prediction) stories (well, story), which we will use to test our models\n",
 228 |     "Pstories = ['wheretheressmoke']\n",
 229 |     "\n",
 230 |     "allstories = Rstories + Pstories\n",
 231 |     "\n",
 232 |     "# Load TextGrids\n",
 233 |     "from stimulus_utils import load_grids_for_stories\n",
 234 |     "grids = load_grids_for_stories(allstories)\n",
 235 |     "\n",
 236 |     "# Load TRfiles\n",
 237 |     "from stimulus_utils import load_generic_trfiles\n",
 238 |     "trfiles = load_generic_trfiles(allstories)\n",
 239 |     "\n",
 240 |     "# Make word and phoneme datasequences\n",
 241 |     "from dsutils import make_word_ds, make_phoneme_ds\n",
 242 |     "wordseqs = make_word_ds(grids, trfiles) # dictionary of {storyname : word DataSequence}\n",
 243 |     "phonseqs = make_phoneme_ds(grids, trfiles) # dictionary of {storyname : phoneme DataSequence}"
 244 |    ]
 245 |   },
 246 |   {
 247 |    "cell_type": "markdown",
 248 |    "metadata": {},
 249 |    "source": [
 250 |     "Before going on, let's play with the DataSequences a bit, both so you can see what the data structure looks like, and also so you can see what the stimuli look like."
 251 |    ]
 252 |   },
 253 |   {
 254 |    "cell_type": "code",
 255 |    "execution_count": null,
 256 |    "metadata": {},
 257 |    "outputs": [],
 258 |    "source": [
 259 |     "naked = wordseqs[\"naked\"]\n",
 260 |     "# The DataSequence stores a lot of information\n",
 261 |     "# naked.data is a list of all the words in the story\n",
 262 |     "print (\"There are %d words in the story called 'naked'\" % len(list(naked.data)))"
 263 |    ]
 264 |   },
 265 |   {
 266 |    "cell_type": "code",
 267 |    "execution_count": null,
 268 |    "metadata": {},
 269 |    "outputs": [],
 270 |    "source": [
 271 |     "# We can print out the first 100 words like this\n",
 272 |     "print (list(naked.data)[:100])"
 273 |    ]
 274 |   },
 275 |   {
 276 |    "cell_type": "code",
 277 |    "execution_count": null,
 278 |    "metadata": {},
 279 |    "outputs": [],
 280 |    "source": [
 281 |     "# or, if you want it to be more readable, like this\n",
 282 |     "print (\" \".join(list(naked.data)[:100]))"
 283 |    ]
 284 |   },
 285 |   {
 286 |    "cell_type": "code",
 287 |    "execution_count": null,
 288 |    "metadata": {},
 289 |    "outputs": [],
 290 |    "source": [
 291 |     "# the datasequence also stores when exactly each word was spoken (this time corresponds to the middle of each word)\n",
 292 |     "print (naked.data_times[:10])"
 293 |    ]
 294 |   },
 295 |   {
 296 |    "cell_type": "code",
 297 |    "execution_count": null,
 298 |    "metadata": {},
 299 |    "outputs": [],
 300 |    "source": [
 301 |     "# and it also stores the time of the middle of each fMRI acquisition (each acqusition takes 2.0045 seconds)\n",
 302 |     "# these times are relative to story start, so the fMRI scan started 10 seconds before the story\n",
 303 |     "print (naked.tr_times[:10])"
 304 |    ]
 305 |   },
 306 |   {
 307 |    "cell_type": "code",
 308 |    "execution_count": null,
 309 |    "metadata": {},
 310 |    "outputs": [],
 311 |    "source": [
 312 |     "# and it also makes it easy to, for example, find the words that were spoken during each fMRI acquisition\n",
 313 |     "# (the first few are empty because they came before the story started)\n",
 314 |     "print (naked.chunks()[:10])"
 315 |    ]
 316 |   },
 317 |   {
 318 |    "cell_type": "markdown",
 319 |    "metadata": {},
 320 |    "source": [
 321 |     "## Projecting the stimuli into the semantic space\n",
 322 |     "The next step in this analysis is that you need to project each word in the stimulus into the English1000 semantic feature space that you loaded above. I wrote a nice function to do this called `make_semantic_model` that simply takes the word DataSequence and the semantic model, and spits out a new DataSequence where each word is replaced by a 985-dimensional vector."
 323 |    ]
 324 |   },
 325 |   {
 326 |    "cell_type": "code",
 327 |    "execution_count": null,
 328 |    "metadata": {},
 329 |    "outputs": [],
 330 |    "source": [
 331 |     "# Project stimuli\n",
 332 |     "from dsutils import make_semantic_model\n",
 333 |     "semanticseqs = dict() # dictionary to hold projected stimuli {story name : projected DataSequence}\n",
 334 |     "for story in allstories:\n",
 335 |     "    semanticseqs[story] = make_semantic_model(wordseqs[story], eng1000)"
 336 |    ]
 337 |   },
 338 |   {
 339 |    "cell_type": "code",
 340 |    "execution_count": null,
 341 |    "metadata": {},
 342 |    "outputs": [],
 343 |    "source": [
 344 |     "# take a look at the projected stimuli\n",
 345 |     "naked_proj = semanticseqs[\"naked\"]\n",
 346 |     "\n",
 347 |     "print (naked_proj.data.shape) # prints the shape of 'data' as (rows, columns)\n",
 348 |     "print (naked_proj.data[:10]) # print the first 10 rows (this will be truncated)"
 349 |    ]
 350 |   },
 351 |   {
 352 |    "cell_type": "markdown",
 353 |    "metadata": {},
 354 |    "source": [
 355 |     "### Downsample the projected stimuli\n",
 356 |     "In order to build a model, you need to downsample the semantic representations of the stimuli to the same temporal scale as the fMRI responses that you will be modeling. The DataSequence provides a method that does this, called `chunksums`.\n",
 357 |     "\n",
 358 |     "For those of you who are interested, downsampling is accomplished here using a 3-lobe Lanczos filter (see [here](http://en.wikipedia.org/wiki/Lanczos_window) for details about the math). You can try changing the number of lobes, it shouldn't affect the results much."
 359 |    ]
 360 |   },
 361 |   {
 362 |    "cell_type": "code",
 363 |    "execution_count": null,
 364 |    "metadata": {},
 365 |    "outputs": [],
 366 |    "source": [
 367 |     "# Downsample stimuli\n",
 368 |     "interptype = \"lanczos\" # filter type\n",
 369 |     "window = 3 # number of lobes in Lanczos filter\n",
 370 |     "\n",
 371 |     "downsampled_semanticseqs = dict() # dictionary to hold downsampled stimuli\n",
 372 |     "for story in allstories:\n",
 373 |     "    downsampled_semanticseqs[story] = semanticseqs[story].chunksums(interptype, window=window)\n"
 374 |    ]
 375 |   },
 376 |   {
 377 |    "cell_type": "markdown",
 378 |    "metadata": {},
 379 |    "source": [
 380 |     "### Visualizing the downsampling\n",
 381 |     "Next you're going to visualize what the downsampling did. Here you're going to plot the value of one semantic feature (feature 2, which is actually the third feature: zero-based indexing) for each word, and also the downsampled vector."
 382 |    ]
 383 |   },
 384 |   {
 385 |    "cell_type": "code",
 386 |    "execution_count": null,
 387 |    "metadata": {},
 388 |    "outputs": [],
 389 |    "source": [
 390 |     "# Plot the result\n",
 391 |     "s_words = wordseqs[\"naked\"]\n",
 392 |     "s_sem = semanticseqs[\"naked\"]\n",
 393 |     "s_semdown = downsampled_semanticseqs[\"naked\"]\n",
 394 |     "\n",
 395 |     "f = figure(figsize=(15,5))\n",
 396 |     "f.clf()\n",
 397 |     "schan = 2\n",
 398 |     "ax = f.add_subplot(1,1,1)\n",
 399 |     "wordstems = ax.stem(s_sem.data_times, \n",
 400 |     "                    s_sem.data[:,schan] / np.abs(s_sem.data[:,schan]).max(), \n",
 401 |     "                    linefmt=\"k-\", markerfmt=\"k.\", basefmt=\"k-\")\n",
 402 |     "interps = ax.plot(s_sem.tr_times, \n",
 403 |     "                  s_semdown[:,schan] / np.abs(s_semdown[:,schan]).max(), 'r.-')\n",
 404 |     "ax.set_xlim(-6, 60)\n",
 405 |     "ax.set_ylim(-1, 1)\n",
 406 |     "ax.set_xlabel(\"Time (seconds since story start)\")\n",
 407 |     "ax.set_ylabel(\"Semantic feature value\")\n",
 408 |     "ax.legend((wordstems, interps[0]), (\"Individual words\", \"Downsampled feature\"));"
 409 |    ]
 410 |   },
 411 |   {
 412 |    "cell_type": "markdown",
 413 |    "metadata": {},
 414 |    "source": [
 415 |     "### Concatenating features across stories\n",
 416 |     "Next you're going to combine together all the features from all the stories into one big matrix. Within this operation, you're also going to [z-score](http://en.wikipedia.org/wiki/Z-score) each feature within each story. This operation subtracts off the mean and then divides by the standard deviation. This might seem like a weird or incomprehensible thing to do, but I do it because the responses to each story are z-scored individually. Anyway not a big deal.\n",
 417 |     "\n",
 418 |     "The features for each story are also trimmed a bit (the variable `trim` determines how many time points are removed from the beginning and end of each story). The fMRI responses at the beginnings and ends of the stories are often noisier than at other times because of transients and problems with detrending (an fMRI preprocessing step that you don't need to worry about here aside from this point).\n",
 419 |     "\n",
 420 |     "The combined features are stored in big matrices called `Rstim` (with the training, or Regression stimuli) and `Pstim` (with the test, or Prediction stimuli)."
 421 |    ]
 422 |   },
 423 |   {
 424 |    "cell_type": "code",
 425 |    "execution_count": null,
 426 |    "metadata": {},
 427 |    "outputs": [],
 428 |    "source": [
 429 |     "# Combine stimuli\n",
 430 |     "from npp import zscore\n",
 431 |     "trim = 5\n",
 432 |     "Rstim = np.vstack([zscore(downsampled_semanticseqs[story][5+trim:-trim]) for story in Rstories])\n",
 433 |     "Pstim = np.vstack([zscore(downsampled_semanticseqs[story][5+trim:-trim]) for story in Pstories])\n"
 434 |    ]
 435 |   },
 436 |   {
 437 |    "cell_type": "code",
 438 |    "execution_count": null,
 439 |    "metadata": {},
 440 |    "outputs": [],
 441 |    "source": [
 442 |     "storylens = [len(downsampled_semanticseqs[story][5+trim:-trim]) for story in Rstories]\n",
 443 |     "print(storylens)\n",
 444 |     "\n",
 445 |     "print(np.cumsum(storylens))"
 446 |    ]
 447 |   },
 448 |   {
 449 |    "cell_type": "code",
 450 |    "execution_count": null,
 451 |    "metadata": {},
 452 |    "outputs": [],
 453 |    "source": [
 454 |     "# Print the sizes of these matrices\n",
 455 |     "print (\"Rstim shape: \", Rstim.shape)\n",
 456 |     "print (\"Pstim shape: \", Pstim.shape)"
 457 |    ]
 458 |   },
 459 |   {
 460 |    "cell_type": "markdown",
 461 |    "metadata": {},
 462 |    "source": [
 463 |     "### Visualizing the combined stimuli\n",
 464 |     "Next you're going to plot some of the feature channels. This is just to see what the feature look like that are going to go into the regression model."
 465 |    ]
 466 |   },
 467 |   {
 468 |    "cell_type": "code",
 469 |    "execution_count": null,
 470 |    "metadata": {},
 471 |    "outputs": [],
 472 |    "source": [
 473 |     "# Plot some of the combined stimuli\n",
 474 |     "f = figure(figsize=(20, 5))\n",
 475 |     "ax = f.add_subplot(1,1,1)\n",
 476 |     "\n",
 477 |     "for ii in range(10):\n",
 478 |     "    # Plot each feature, offset by 5 vertically so they are easier to see\n",
 479 |     "    ax.plot(Rstim[:750,ii] - 5 * ii)\n",
 480 |     "\n",
 481 |     "ax.set_xlim(0, 750)\n",
 482 |     "ax.set_yticks([])\n",
 483 |     "ax.set_xticks(range(0, 750, 50))\n",
 484 |     "ax.set_xlabel(\"Time (fMRI volumes)\")\n",
 485 |     "ax.set_ylabel(\"Features 1-10\")\n",
 486 |     "ax.grid()"
 487 |    ]
 488 |   },
 489 |   {
 490 |    "cell_type": "markdown",
 491 |    "metadata": {},
 492 |    "source": [
 493 |     "### Concatenate delayed stimuli for FIR model\n",
 494 |     "Next you are going to concatenate multiple delayed versions of the stimuli, in order to create a linear [finite impulse response (FIR) model](http://en.wikipedia.org/wiki/Fir_filter). This is a vitally important step, and is conceptually a bit difficult, so take a few minutes to make sure you understand what is going on here.\n",
 495 |     "\n",
 496 |     "#### Background: the hemodynamic response\n",
 497 |     "First you need to understand the problem that the FIR model is solving. fMRI measures the blood-oxygen level dependent (BOLD) signal, which is a complicated and nonlinear combination of blood oxygenation and blood volume. When neurons in an area of the brain become active, they start using up lots of energy. To compensate, nearby blood vessels dilate so that more oxygen and glucose become available to the neurons. The resulting changes in blood oxygenation (which increases) and volume (which also increases) create the magnetic signature that is recorded by fMRI. \n",
 498 |     "\n",
 499 |     "But this process is **slow**. It takes seconds after the neural activity begins for the blood vessels to dilate and for the BOLD response to become apparent. And then it takes more seconds for the response to go away. So although a neural response might only last milliseconds, the associated BOLD response will rise and fall over a span of maybe 10 seconds, orders of magnitude slower. The shape of this rise and fall is called the [hemodynamic response function (HRF)](http://en.wikipedia.org/wiki/Haemodynamic_response).\n",
 500 |     "\n",
 501 |     "Here is a pretty standard looking example of an HRF:\n",
 502 |     "\n",
 503 |     "<img src='http://www.brainmatters.nl/wp-content/uploads/bold.png' width=350px></img>\n",
 504 |     "\n",
 505 |     "#### FIR model\n",
 506 |     "To accurately model how the brain responds to these stimuli we must also model the HRF. There are many ways to do this. The most common is to assume that the HRF follows a canonical shape. But this approach turns out to not work very well: different parts of the brain have very different vasculature (blood vessels), so the HRF shape can vary a lot. \n",
 507 |     "\n",
 508 |     "Instead, what you are going to here is estimate a separate HRF for each semantic feature in each voxel that is being modeled. This estimate is going to take the form of a linear finite impulse response (FIR) model. The linear FIR form is particularly nice to use because it's very simple to estimate and powerful (if anything, it might be too powerful.. more on that later). To build a linear FIR model all you have to do is concatenate together multiple delayed copies of the stimulus. I usually use four delays: 1, 2, 3, and 4 time points. The resulting delayed features can be thought of as representing the stimulus 1, 2, 3, and 4 time points ago. So the regression weights for those features will represent how a particular voxel responds to a feature 1, 2, 3, or 4 time points in the past, and these regression weights are a 4-point estimate of the HRF for that feature in that voxel.\n",
 509 |     "\n",
 510 |     "The potential downside of the FIR model is that it may be too expressive. Each feature in each voxel is allowed to have any HRF, but this comes at the cost of multiplying the total number of regression weights that we must fit by the number of delays. In all likelihood the true HRFs vary, but they don't vary that much, so we probably don't need this many independent features. This cost becomes apparent if you increase the number of delays. This will slow down model fitting and likely decrease the stability of the regression weights, leading to decreased model performance. \n",
 511 |     "\n",
 512 |     "Feel free to play around with the number of delays and see how it affects the model results!"
 513 |    ]
 514 |   },
 515 |   {
 516 |    "cell_type": "code",
 517 |    "execution_count": null,
 518 |    "metadata": {},
 519 |    "outputs": [],
 520 |    "source": [
 521 |     "# Delay stimuli\n",
 522 |     "from util import make_delayed\n",
 523 |     "ndelays = 4\n",
 524 |     "delays = range(1, ndelays+1)\n",
 525 |     "\n",
 526 |     "print (\"FIR model delays: \", delays)\n",
 527 |     "\n",
 528 |     "delRstim = make_delayed(Rstim, delays)\n",
 529 |     "delPstim = make_delayed(Pstim, delays)\n"
 530 |    ]
 531 |   },
 532 |   {
 533 |    "cell_type": "code",
 534 |    "execution_count": null,
 535 |    "metadata": {},
 536 |    "outputs": [],
 537 |    "source": [
 538 |     "# Print the sizes of these matrices\n",
 539 |     "print (\"delRstim shape: \", delRstim.shape)\n",
 540 |     "print (\"delPstim shape: \", delPstim.shape)"
 541 |    ]
 542 |   },
 543 |   {
 544 |    "cell_type": "markdown",
 545 |    "metadata": {},
 546 |    "source": [
 547 |     "### Visualizing FIR features\n",
 548 |     "Here you will visualize the first semantic feature at each of the delays."
 549 |    ]
 550 |   },
 551 |   {
 552 |    "cell_type": "code",
 553 |    "execution_count": null,
 554 |    "metadata": {},
 555 |    "outputs": [],
 556 |    "source": [
 557 |     "# Plot the same feature at different delays\n",
 558 |     "f = figure(figsize=(15, 4))\n",
 559 |     "ax = f.add_subplot(1,1,1)\n",
 560 |     "for ii in range(ndelays):\n",
 561 |     "    ax.plot(delRstim[:500, ii * Rstim.shape[1]] - 5 * ii)\n",
 562 |     "ax.set_xlim(0, 500)\n",
 563 |     "ax.set_yticks([])\n",
 564 |     "ax.set_xticks(range(0, 500, 50))\n",
 565 |     "ax.set_xlabel(\"Time (fMRI volumes)\")\n",
 566 |     "ax.set_ylabel(\"Feature 1 across delays\")\n",
 567 |     "ax.grid()"
 568 |    ]
 569 |   },
 570 |   {
 571 |    "cell_type": "markdown",
 572 |    "metadata": {},
 573 |    "source": [
 574 |     "## Response data\n",
 575 |     "Next you will load the fMRI data. This is totally the most exciting part! These responses have already been preprocessed (the 3D images were motion corrected and aligned to each other, detrended, and then z-scored within each stimulus) so you don't have to worry about that.\n",
 576 |     "\n",
 577 |     "You will load three different variables: `zRresp`, the responses to the regression dataset; `zPresp`, the responses to the prediction dataset; and `mask`, which is a 3D mask showing which voxels have been selected (we are not modeling every voxel in the scan, that would take forever, we are only modeling the voxels that overlap with the cerebral cortex)."
 578 |    ]
 579 |   },
 580 |   {
 581 |    "cell_type": "code",
 582 |    "execution_count": null,
 583 |    "metadata": {},
 584 |    "outputs": [],
 585 |    "source": [
 586 |     "# Load responses\n",
 587 |     "import tables\n",
 588 |     "resptf = tables.open_file(\"data/fmri-responses.hf5\")\n",
 589 |     "zRresp = resptf.root.zRresp.read()\n",
 590 |     "zPresp = resptf.root.zPresp.read()\n",
 591 |     "mask = resptf.root.mask.read()"
 592 |    ]
 593 |   },
 594 |   {
 595 |    "cell_type": "code",
 596 |    "execution_count": null,
 597 |    "metadata": {},
 598 |    "outputs": [],
 599 |    "source": [
 600 |     "# Print matrix shapes\n",
 601 |     "print (\"zRresp shape (num time points, num voxels): \", zRresp.shape)\n",
 602 |     "print (\"zPresp shape (num time points, num voxels): \", zPresp.shape)\n",
 603 |     "print (\"mask shape (Z, Y, X): \", mask.shape)"
 604 |    ]
 605 |   },
 606 |   {
 607 |    "cell_type": "markdown",
 608 |    "metadata": {},
 609 |    "source": [
 610 |     "### Visualize where the voxels are coming from (mask)\n",
 611 |     "Next you will visualize where the voxels are coming from in the brain. This will give you an idea of where the data come from.\n",
 612 |     "\n",
 613 |     "First you will plot a single slice through the mask in the Z dimension. This is called an \"axial\" slice. The top of the image is the front of the brain, the bottom is the back. The left side of the image is the right side of the brain, and the right side of the image is the left side of the brain (as if you are looking up at the brain from under the subject's chin; this left-right reversal is often referred to as \"radiological coordinates\", as opposed to \"neurological coordinates\" where you are looking down from the top).\n",
 614 |     "\n",
 615 |     "Then you will plot a mosaic of all the slices. This is done using the function `mosaic` from James Gao's pyCortex package."
 616 |    ]
 617 |   },
 618 |   {
 619 |    "cell_type": "code",
 620 |    "execution_count": null,
 621 |    "metadata": {},
 622 |    "outputs": [],
 623 |    "source": [
 624 |     "# Plot one slice of the mask that was used to select the voxels\n",
 625 |     "f = figure()\n",
 626 |     "ax = f.add_subplot(1,1,1)\n",
 627 |     "ax.matshow(mask[16], interpolation=\"nearest\", cmap=cm.gray) # show the 17th slice of the mask"
 628 |    ]
 629 |   },
 630 |   {
 631 |    "cell_type": "code",
 632 |    "execution_count": null,
 633 |    "metadata": {},
 634 |    "outputs": [],
 635 |    "source": [
 636 |     "# Plot mask mosaic\n",
 637 |     "import cortex\n",
 638 |     "f = figure(figsize=(10,10))\n",
 639 |     "cortex.mosaic(mask, cmap=cm.gray, interpolation=\"nearest\");"
 640 |    ]
 641 |   },
 642 |   {
 643 |    "cell_type": "markdown",
 644 |    "metadata": {},
 645 |    "source": [
 646 |     "### Visualizing the responses of a few voxels over time\n",
 647 |     "Next you will visualize the responses of a few selected voxels over time. I selected these particular voxels because they are reasonably well explained by the semantic model, but have some differences in their responses across time."
 648 |    ]
 649 |   },
 650 |   {
 651 |    "cell_type": "code",
 652 |    "execution_count": null,
 653 |    "metadata": {},
 654 |    "outputs": [],
 655 |    "source": [
 656 |     "# Plot the response of a few voxels over time\n",
 657 |     "selvoxels = [20710, 27627, 24344, 34808, 22423, 25397]\n",
 658 |     "\n",
 659 |     "f = figure(figsize=(15, 5))\n",
 660 |     "ax = f.add_subplot(1,1,1)\n",
 661 |     "for ii,vi in enumerate(selvoxels):\n",
 662 |     "    ax.plot(zRresp[:500, vi] - 5 * ii)\n",
 663 |     "ax.set_xlim(0, 500)\n",
 664 |     "ax.set_yticks([])\n",
 665 |     "ax.set_xticks(range(0, 500, 50))\n",
 666 |     "ax.set_xlabel(\"Time (fMRI volumes)\")\n",
 667 |     "ax.set_ylabel(\"Voxel responses\")\n",
 668 |     "ax.grid()"
 669 |    ]
 670 |   },
 671 |   {
 672 |    "cell_type": "markdown",
 673 |    "metadata": {},
 674 |    "source": [
 675 |     "## Regression model\n",
 676 |     "Finally, the core of the analysis: you will fit a regression model that predicts the responses of each voxel as a weighted sum of the semantic features. This model will then be tested using a held out dataset (the Prediction dataset). And if the model proves to be reasonably predictive, then the weights of the regression model will tell us what semantic features each voxel responds to.\n",
 677 |     "\n",
 678 |     "This is a linear regression model, so if the response time course for voxel $j$ is $R_j$, the stimulus time course for semantic feature $i$ is $S_i$, and the regression weight for feature $i$ in voxel $j$ is $\\beta_{ij}$, then the model can be written as:\n",
 679 |     "\n",
 680 |     "$$\\hat{R}_j = \\beta_{0j} S_0 + \\beta_{1j} S_1 + \\cdots$$\n",
 681 |     "\n",
 682 |     "or:\n",
 683 |     "\n",
 684 |     "$$\\hat{R}_j = \\sum_i \\beta_{ij} S_i$$\n",
 685 |     "\n",
 686 |     "The trick, of course, is accurately estimating the $\\beta_j$ values. This is commonly done by minimizing the sum of the squared error (here across time, $t$):\n",
 687 |     "\n",
 688 |     "$$E_j(\\beta) = \\sum_t (R_{jt} - \\hat{R}_{jt})^2 = \\sum_t (R_{jt} - \\sum_i \\beta_{i} S_{it})^2$$\n",
 689 |     "\n",
 690 |     "$$\\beta_j = \\underset{\\beta}{\\operatorname{argmin}} E_j(\\beta)$$\n",
 691 |     "\n",
 692 |     "Computing $\\beta$ this way is called ordinary least squares (OLS), and this will not work in our case because the total number of features (3940) is smaller than the number of time points (3737). (It would be possible if the number of delays was smaller than 4, but it would give terrible results.. feel free to try it! OLS can be performed using the function `np.linalg.lstsq`.)\n",
 693 |     "\n",
 694 |     "In almost every case, linear regression can be improved by making some prior assumptions about the weights (or, equivalently, about the covariance structure of the stimuli). This is called **regularization**, or **regularized linear regression**. One way to do this is to penalize the error function by the sum of the squared weights. This is commonly known as **ridge regression**, and is a special case of [Tikhonov regularization](http://en.wikipedia.org/wiki/Ridge_regression). It finds the $\\beta$ that minimizes the following error function:\n",
 695 |     "\n",
 696 |     "$$E_j(\\beta) = \\sum_t (R_{jt} - \\sum_i \\beta_{i} S_{it})^2 + \\alpha \\sum_i \\beta_i^2$$\n",
 697 |     "\n",
 698 |     "(In practice we will use a different formulation that involves re-weighting the singular values of the matrix $S$ before computing its pseudoinverse. This method achieves the same results but is extremely efficient because it uses all the linear algebra machinery that computers are so good at to build many models in parallel.)\n",
 699 |     "\n",
 700 |     "### The hyperparameter: $\\alpha$\n",
 701 |     "You may have noticed in the equation above that we have introduced a new parameter, $\\alpha$, which controls the strength of the regularization. If $\\alpha$ is set to zero, then we get back to exactly the OLS formulation (above). As $\\alpha$ goes to infinity, the regularization forces all the weights to go to zero (in practice this also has the slightly weirder effect of making all the weights independent, as if each feature was regressed separately on the responses).\n",
 702 |     "\n",
 703 |     "So how do we choose $\\alpha$? We're going to do it here using cross-validation. First, we split the Regression dataset up into two parts. Then we estimate the weights for a given $\\alpha$ on the first part, and test how well we can predict responses on the second part. This is repeated for each possible $\\alpha$ that we want to test, and for a couple different splits of the Regression dataset. Then we find the $\\alpha^*$ that gave us the best predictions within the split Regression dataset. Finally we estimate the weights using the entire Regression dataset and the selected $\\alpha^*$.\n",
 704 |     "\n",
 705 |     "Because this is an annoying and laborious process, I've encapsulated it within the function `bootstrap_ridge`. You simply give this function your datasets, the possible $\\alpha$ values, and a few parameters for the cross-validation, and it does all the rest. The parameter `nboots` determines the number of cross-validation tests that will be run. \n",
 706 |     "\n",
 707 |     "To do cross-validation, `bootstrap_ridge` divides the Regression dataset into many small chunks, and then splits those chunks into the two groups that will be used to estimate weights and test $\\alpha$ values. This is better than just choosing individual time points because both the fMRI data and stimuli are autocorrelated (i.e. correlated across time). The parameter `chunklen` determines the length of the chunks, and the parameter `nchunks` determines the number of chunks in the $\\alpha$-testing dataset. By default I set `chunklen` to 40 time points (80-second chunks), and set `nchunks` to 20 (40 * 20 = 800 time points for testing $\\alpha$ values, 3737-800 = 2937 time points for estimating weights). These values should not matter too much.\n",
 708 |     "\n",
 709 |     "Running the regression will take a few minutes."
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "code",
 714 |    "execution_count": null,
 715 |    "metadata": {},
 716 |    "outputs": [],
 717 |    "source": [
 718 |     "# Run regression\n",
 719 |     "from ridge import bootstrap_ridge\n",
 720 |     "alphas = np.logspace(1, 3, 10) # Equally log-spaced alphas between 10 and 1000. The third number is the number of alphas to test.\n",
 721 |     "nboots = 1 # Number of cross-validation runs.\n",
 722 |     "chunklen = 40 # \n",
 723 |     "nchunks = 20\n",
 724 |     "\n",
 725 |     "wt, corr, alphas, bscorrs, valinds = bootstrap_ridge(delRstim, zRresp, delPstim, zPresp,\n",
 726 |     "                                                     alphas, nboots, chunklen, nchunks,\n",
 727 |     "                                                     singcutoff=1e-10, single_alpha=True)\n"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "code",
 732 |    "execution_count": null,
 733 |    "metadata": {},
 734 |    "outputs": [],
 735 |    "source": [
 736 |     "f = figure()\n",
 737 |     "ax = f.add_subplot(1,1,1)\n",
 738 |     "ax.semilogx( np.logspace(1, 3, 10), bscorrs.mean(2).mean(1), 'o-')"
 739 |    ]
 740 |   },
 741 |   {
 742 |    "cell_type": "markdown",
 743 |    "metadata": {},
 744 |    "source": [
 745 |     "### Variables returned by the regression\n",
 746 |     "Next let's have a look at the variables returned by the regression function."
 747 |    ]
 748 |   },
 749 |   {
 750 |    "cell_type": "code",
 751 |    "execution_count": null,
 752 |    "metadata": {},
 753 |    "outputs": [],
 754 |    "source": [
 755 |     "# wt is the regression weights\n",
 756 |     "print (\"wt has shape: \", wt.shape)\n",
 757 |     "\n",
 758 |     "# corr is the correlation between predicted and actual voxel responses in the Prediction dataset\n",
 759 |     "print (\"corr has shape: \", corr.shape)\n",
 760 |     "\n",
 761 |     "# alphas is the selected alpha value for each voxel, here it should be the same across voxels\n",
 762 |     "print (\"alphas has shape: \", alphas.shape)\n",
 763 |     "\n",
 764 |     "# bscorrs is the correlation between predicted and actual voxel responses for each round of cross-validation\n",
 765 |     "# within the Regression dataset\n",
 766 |     "print (\"bscorrs has shape (num alphas, num voxels, nboots): \", bscorrs.shape)\n",
 767 |     "\n",
 768 |     "# valinds is the indices of the time points in the Regression dataset that were used for each\n",
 769 |     "# round of cross-validation\n",
 770 |     "print (\"valinds has shape: \", np.array(valinds).shape)"
 771 |    ]
 772 |   },
 773 |   {
 774 |    "cell_type": "markdown",
 775 |    "metadata": {},
 776 |    "source": [
 777 |     "### Testing the regression models by predicting responses\n",
 778 |     "The `bootstrap_ridge` function already computed predictions and correlations for the Prediction dataset, but this is important so let's reproduce that step more explicitly.\n",
 779 |     "\n",
 780 |     "Remember that according to the linear model, the predicted responses for each voxel are a weighted sum of the semantic features. An easy way to compute that is by taking the dot product between the weights and semantic features: $$\\hat{R} = S \\beta$$"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": null,
 786 |    "metadata": {},
 787 |    "outputs": [],
 788 |    "source": [
 789 |     "# Predict responses in the Prediction dataset\n",
 790 |     "\n",
 791 |     "# First let's refresh ourselves on the shapes of these matrices\n",
 792 |     "print (\"zPresp has shape: \", zPresp.shape)\n",
 793 |     "print (\"wt has shape: \", wt.shape)\n",
 794 |     "print (\"delPstim has shape: \", delPstim.shape)"
 795 |    ]
 796 |   },
 797 |   {
 798 |    "cell_type": "code",
 799 |    "execution_count": null,
 800 |    "metadata": {},
 801 |    "outputs": [],
 802 |    "source": [
 803 |     "# Then let's predict responses by taking the dot product of the weights and stim\n",
 804 |     "pred = np.dot(delPstim, wt)\n",
 805 |     "\n",
 806 |     "print (\"pred has shape: \", pred.shape)"
 807 |    ]
 808 |   },
 809 |   {
 810 |    "cell_type": "markdown",
 811 |    "metadata": {},
 812 |    "source": [
 813 |     "#### Visualizing predicted and actual responses\n",
 814 |     "Next let's plot some predicted and actual responses side by side."
 815 |    ]
 816 |   },
 817 |   {
 818 |    "cell_type": "code",
 819 |    "execution_count": null,
 820 |    "metadata": {},
 821 |    "outputs": [],
 822 |    "source": [
 823 |     "f = figure(figsize=(15,5))\n",
 824 |     "ax = f.add_subplot(1,1,1)\n",
 825 |     "\n",
 826 |     "selvox = 20710 # a decent voxel\n",
 827 |     "\n",
 828 |     "realresp = ax.plot(zPresp[:,selvox], 'k')[0]\n",
 829 |     "predresp = ax.plot(pred[:,selvox], 'r')[0]\n",
 830 |     "\n",
 831 |     "ax.set_xlim(0, 291)\n",
 832 |     "ax.set_xlabel(\"Time (fMRI time points)\")\n",
 833 |     "\n",
 834 |     "ax.legend((realresp, predresp), (\"Actual response\", \"Predicted response\"));"
 835 |    ]
 836 |   },
 837 |   {
 838 |    "cell_type": "markdown",
 839 |    "metadata": {},
 840 |    "source": [
 841 |     "#### Visualizing predicted and actual responses cont'd\n",
 842 |     "You might notice above that the predicted and actual responses look pretty different scale-wise, although the patterns of ups and downs are vaguely similar. But we don't really care about the scale -- for fMRI it's relatively arbitrary anyway, so let's rescale them both to have unit standard deviation and re-plot."
 843 |    ]
 844 |   },
 845 |   {
 846 |    "cell_type": "code",
 847 |    "execution_count": null,
 848 |    "metadata": {},
 849 |    "outputs": [],
 850 |    "source": [
 851 |     "f = figure(figsize=(15,5))\n",
 852 |     "ax = f.add_subplot(1,1,1)\n",
 853 |     "\n",
 854 |     "selvox = 20710 # a good voxel\n",
 855 |     "\n",
 856 |     "realresp = ax.plot(zPresp[:,selvox], 'k')[0]\n",
 857 |     "predresp = ax.plot(zscore(pred[:,selvox]), 'r')[0]\n",
 858 |     "\n",
 859 |     "ax.set_xlim(0, 291)\n",
 860 |     "ax.set_xlabel(\"Time (fMRI time points)\")\n",
 861 |     "\n",
 862 |     "ax.legend((realresp, predresp), (\"Actual response\", \"Predicted response (scaled)\"));"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "markdown",
 867 |    "metadata": {},
 868 |    "source": [
 869 |     "Now you see that the actual and scaled predicted responses look very similar. We can quantify this similarity by computing the correlation between the two (correlation is scale-free, so it effectively automatically does the re-scaling that we did here). This voxel has high correlation."
 870 |    ]
 871 |   },
 872 |   {
 873 |    "cell_type": "code",
 874 |    "execution_count": null,
 875 |    "metadata": {},
 876 |    "outputs": [],
 877 |    "source": [
 878 |     "# Compute correlation between single predicted and actual response\n",
 879 |     "# (np.corrcoef returns a correlation matrix; pull out the element [0,1] to get \n",
 880 |     "# correlation between the two vectors)\n",
 881 |     "voxcorr = np.corrcoef(zPresp[:,selvox], pred[:,selvox])[0,1]\n",
 882 |     "print (\"Correlation between predicted and actual responses for voxel %d: %f\" % (selvox, voxcorr))"
 883 |    ]
 884 |   },
 885 |   {
 886 |    "cell_type": "markdown",
 887 |    "metadata": {},
 888 |    "source": [
 889 |     "#### Computing correlations for all voxels\n",
 890 |     "Next let's compute this correlation for every voxel in the dataset. There are some very efficient ways to do this, but here I've written a for loop so that it's very explicit what's happening. (This should give exactly the same values as the variable `corr`, which was returned by `bootstrap_ridge`.)"
 891 |    ]
 892 |   },
 893 |   {
 894 |    "cell_type": "code",
 895 |    "execution_count": null,
 896 |    "metadata": {},
 897 |    "outputs": [],
 898 |    "source": [
 899 |     "voxcorrs = np.zeros((zPresp.shape[1],)) # create zero-filled array to hold correlations\n",
 900 |     "for vi in range(zPresp.shape[1]):\n",
 901 |     "    voxcorrs[vi] = np.corrcoef(zPresp[:,vi], pred[:,vi])[0,1]\n",
 902 |     "print (voxcorrs)"
 903 |    ]
 904 |   },
 905 |   {
 906 |    "cell_type": "markdown",
 907 |    "metadata": {},
 908 |    "source": [
 909 |     "### Visualizing correlations across the brain\n",
 910 |     "Let's start with a supposition: the correlation should not be high everywhere, even if this is a good model of how the brain represents the semantic content of speech. There are parts of the brain that just don't respond to speech, so the correlation should be low in those areas. There are other parts of the brain that respond to speech, but maybe don't represent semantic information, so the correlation should be low in those areas as well.\n",
 911 |     "\n",
 912 |     "But let's begin by plotting a histogram of the correlations across the entire brain. This will show generally whether the model is working well or not."
 913 |    ]
 914 |   },
 915 |   {
 916 |    "cell_type": "code",
 917 |    "execution_count": null,
 918 |    "metadata": {},
 919 |    "outputs": [],
 920 |    "source": [
 921 |     "# Plot histogram of correlations\n",
 922 |     "f = figure(figsize=(8,8))\n",
 923 |     "ax = f.add_subplot(1,1,1)\n",
 924 |     "ax.hist(voxcorrs, 100) # histogram correlations with 100 bins\n",
 925 |     "ax.set_xlabel(\"Correlation\")\n",
 926 |     "ax.set_ylabel(\"Num. voxels\");"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "markdown",
 931 |    "metadata": {},
 932 |    "source": [
 933 |     "If the semantic features didn't capture anything about brain activity, then we would expect the histogram to be symmetric and centered around zero. But here we see that it's highly skewed, with lots of positive values. This looks good! This model is working!\n",
 934 |     "\n",
 935 |     "Next, let's plot a mosaic of the correlations across the brain, as we plotted the mask earlier."
 936 |    ]
 937 |   },
 938 |   {
 939 |    "cell_type": "code",
 940 |    "execution_count": null,
 941 |    "metadata": {},
 942 |    "outputs": [],
 943 |    "source": [
 944 |     "# Plot mosaic of correlations\n",
 945 |     "corrvolume = np.zeros(mask.shape)\n",
 946 |     "corrvolume[mask>0] = voxcorrs\n",
 947 |     "\n",
 948 |     "f = figure(figsize=(10,10))\n",
 949 |     "cortex.mosaic(corrvolume, vmin=0, vmax=0.5, cmap=cm.hot);"
 950 |    ]
 951 |   },
 952 |   {
 953 |    "cell_type": "markdown",
 954 |    "metadata": {},
 955 |    "source": [
 956 |     "#### 3D visualization of correlations\n",
 957 |     "In the mosaic we can see that there seem to be some concentrated areas of high correlation. But it's hard to say where in the brain those areas are based on the mosaic. So next you're going to create a fancy 3D visualization of the correlations using pyCortex.\n",
 958 |     "\n",
 959 |     "Once you've opened the viewer you'll be presented with a 3D view of the brain with colors showing the correlations. White outlines and labels show the locations of known brain areas (motor, somatosensory, visual, and some language areas). Drag around with your left mouse button to rotate the view, and the right mouse button to zoom in or out. \n",
 960 |     "\n",
 961 |     "By default you'll see a view of the cortex as it looks in reality: folded and convoluted. To better see parts of the brain that are hidden down in the folds, you can press \"i\" to see an inflated view (or drag the \"Mix\" slider at the bottom of the screen to the middle). This helps to see the data, but you will still need to rotate the brain to see all of it. To make the entire cortex visible at once, you can press \"f\" to see a flattened view. To create this view we cut the cortical surface at a few locations, and then flattened it out so that it can all be seen at once (but this introduces some distortions)."
 962 |    ]
 963 |   },
 964 |   {
 965 |    "cell_type": "code",
 966 |    "execution_count": null,
 967 |    "metadata": {},
 968 |    "outputs": [],
 969 |    "source": [
 970 |     "# Plot correlations on cortex\n",
 971 |     "import cortex\n",
 972 |     "corrvol = cortex.Volume(corr, \"S1\", \"fullhead\", mask=mask, vmin=0, vmax=0.5, cmap='hot')\n",
 973 |     "cortex.webshow(corrvol, port=8889, open_browser=False)\n"
 974 |    ]
 975 |   },
 976 |   {
 977 |    "cell_type": "code",
 978 |    "execution_count": null,
 979 |    "metadata": {},
 980 |    "outputs": [],
 981 |    "source": [
 982 |     "# View 3D model\n",
 983 |     "# You will need to change where it says SERVERIP below to the IP you are connected to\n",
 984 |     "from IPython.display import HTML\n",
 985 |     "HTML(\"<a target='_blank' href='http://127.0.0.1/:8889'>Click here for viewer</a>\")"
 986 |    ]
 987 |   },
 988 |   {
 989 |    "cell_type": "markdown",
 990 |    "metadata": {},
 991 |    "source": [
 992 |     "### Simpler view of the correlations\n",
 993 |     "pyCortex also offers a simpler way to view the correlations. This method only shows the flat view, but can be embedded right here in the ipython notebook. This should look like the flat view in the 3D viewer."
 994 |    ]
 995 |   },
 996 |   {
 997 |    "cell_type": "code",
 998 |    "execution_count": null,
 999 |    "metadata": {},
1000 |    "outputs": [],
1001 |    "source": [
1002 |     "# Plot correlation flatmap\n",
1003 |     "cortex.quickshow(corrvol, with_rois=False, with_labels=False);"
1004 |    ]
1005 |   },
1006 |   {
1007 |    "cell_type": "markdown",
1008 |    "metadata": {},
1009 |    "source": [
1010 |     "## What semantic features are the voxels responding to?\n",
1011 |     "Now that we have a working model, let's try to figure out what semantic features are making each voxel respond. One way to do this is to simulate how the voxel will respond to individual words, and then find the most preferred words for that voxel.\n",
1012 |     "\n",
1013 |     "But first we have an issue to contend with: we have separate weights for each delay. We could look at the weights for each delay, but instead here you will average the weights across delays to get a single set of weights for the voxel."
1014 |    ]
1015 |   },
1016 |   {
1017 |    "cell_type": "code",
1018 |    "execution_count": null,
1019 |    "metadata": {},
1020 |    "outputs": [],
1021 |    "source": [
1022 |     "# Undelay voxel weights (average across delays)\n",
1023 |     "import operator\n",
1024 |     "from functools import reduce\n",
1025 |     "udwt = reduce(operator.add, np.split(wt/ndelays, ndelays))"
1026 |    ]
1027 |   },
1028 |   {
1029 |    "cell_type": "code",
1030 |    "execution_count": null,
1031 |    "metadata": {},
1032 |    "outputs": [],
1033 |    "source": [
1034 |     "udwt.shape"
1035 |    ]
1036 |   },
1037 |   {
1038 |    "cell_type": "markdown",
1039 |    "metadata": {},
1040 |    "source": [
1041 |     "Next you will pick which voxel to visualize. Since many voxels are modeled poorly, we will pick from among the best modeled voxels."
1042 |    ]
1043 |   },
1044 |   {
1045 |    "cell_type": "code",
1046 |    "execution_count": null,
1047 |    "metadata": {},
1048 |    "outputs": [],
1049 |    "source": [
1050 |     "# Sort voxels by correlation so that we can pick a good voxel\n",
1051 |     "# This will sort voxels in decreasing order of correlation\n",
1052 |     "corrsort = np.argsort(corr)[::-1]"
1053 |    ]
1054 |   },
1055 |   {
1056 |    "cell_type": "code",
1057 |    "execution_count": null,
1058 |    "metadata": {},
1059 |    "outputs": [],
1060 |    "source": [
1061 |     "# Define function that will print best words for a voxel\n",
1062 |     "import pprint\n",
1063 |     "\n",
1064 |     "def print_voxel_words(voxnum):\n",
1065 |     "    # find_words_like_vec returns 10 words most correlated with the given vector, and the correlations\n",
1066 |     "    voxwords = eng1000.find_words_like_vec(udwt[:,voxnum])\n",
1067 |     "    print (\"Best words for voxel %d (correlation %0.3f):\" % (voxnum, voxcorrs[voxnum]))\n",
1068 |     "    pprint.pprint(voxwords)"
1069 |    ]
1070 |   },
1071 |   {
1072 |    "cell_type": "code",
1073 |    "execution_count": null,
1074 |    "metadata": {},
1075 |    "outputs": [],
1076 |    "source": [
1077 |     "# Print best words for some voxels\n",
1078 |     "print_voxel_words(corrsort[0]) # best voxel\n",
1079 |     "print_voxel_words(corrsort[14]) # 15th best voxel"
1080 |    ]
1081 |   },
1082 |   {
1083 |    "cell_type": "markdown",
1084 |    "metadata": {},
1085 |    "source": [
1086 |     "### That's it!\n",
1087 |     "That's the semantic model! Since you made it this far, well done. If you're interested, you can go back and try changing some of the parameters and see how it affects the model. One easy parameter to change is the number of delays (or the delays themselves). Try using just one delay. Or try using 10 delays (that might be slow). You could also try pruning off some of the semantic features. How does the model work if you only use the first 100 semantic features?\n",
1088 |     "\n",
1089 |     "Alternatively, you can try using a different type of feature to model the fMRI responses: phonemes. Below are some blocks of code that will create stimulus vectors representing the number of times each different phoneme is spoken. The phoneme model will predict some voxels much better than the semantic model, and some voxels worse.\n"
1090 |    ]
1091 |   },
1092 |   {
1093 |    "cell_type": "markdown",
1094 |    "metadata": {},
1095 |    "source": [
1096 |     "## Optional: phoneme model\n",
1097 |     "Another type of information that the brain extracts from speech is the phonemic content. The following blocks of code will extract phonemic features from the stimuli. Run this, and then go back and run the \"Regression Model\" block, above. You can compare the correlations of the semantic model and phoneme model to see which works best in each voxel.\n",
1098 |     "\n",
1099 |     "You can also visualize the phoneme stimuli and model as you go, building on the code blocks used above."
1100 |    ]
1101 |   },
1102 |   {
1103 |    "cell_type": "code",
1104 |    "execution_count": null,
1105 |    "metadata": {},
1106 |    "outputs": [],
1107 |    "source": [
1108 |     "# Create phoneme histogram DataSequences\n",
1109 |     "from dsutils import histogram_phonemes2, phonemes\n",
1110 |     "phonemehistseqs = dict() # dictionary to hold phoneme histograms {story name : DataSequence}\n",
1111 |     "for story in allstories:\n",
1112 |     "    phonemehistseqs[story] = histogram_phonemes2(phonseqs[story])"
1113 |    ]
1114 |   },
1115 |   {
1116 |    "cell_type": "code",
1117 |    "execution_count": null,
1118 |    "metadata": {},
1119 |    "outputs": [],
1120 |    "source": [
1121 |     "# Phonemes were labeled using the ARPABET. The labeled phonemes are listed here.\n",
1122 |     "print (phonemes)"
1123 |    ]
1124 |   },
1125 |   {
1126 |    "cell_type": "code",
1127 |    "execution_count": null,
1128 |    "metadata": {},
1129 |    "outputs": [],
1130 |    "source": [
1131 |     "# Downsample phoneme histograms\n",
1132 |     "interptype = \"lanczos\"\n",
1133 |     "window = 3\n",
1134 |     "\n",
1135 |     "downsampled_phonemehistseqs = dict()\n",
1136 |     "for story in allstories:\n",
1137 |     "    downsampled_phonemehistseqs[story] = phonemehistseqs[story].chunksums(interptype, window=window)\n"
1138 |    ]
1139 |   },
1140 |   {
1141 |    "cell_type": "code",
1142 |    "execution_count": null,
1143 |    "metadata": {},
1144 |    "outputs": [],
1145 |    "source": [
1146 |     "# Combine phoneme stimuli\n",
1147 |     "trim = 5\n",
1148 |     "phRstim = np.vstack([np.nan_to_num(zscore(downsampled_phonemehistseqs[story][5+trim:-trim])) for story in Rstories])\n",
1149 |     "phPstim = np.vstack([np.nan_to_num(zscore(downsampled_phonemehistseqs[story][5+trim:-trim])) for story in Pstories])\n"
1150 |    ]
1151 |   },
1152 |   {
1153 |    "cell_type": "code",
1154 |    "execution_count": null,
1155 |    "metadata": {},
1156 |    "outputs": [],
1157 |    "source": [
1158 |     "# Delay stimuli\n",
1159 |     "ndelays = 4\n",
1160 |     "delays = range(1, ndelays+1)\n",
1161 |     "\n",
1162 |     "delRstim = make_delayed(phRstim, delays)\n",
1163 |     "delPstim = make_delayed(phPstim, delays)"
1164 |    ]
1165 |   },
1166 |   {
1167 |    "cell_type": "code",
1168 |    "execution_count": null,
1169 |    "metadata": {},
1170 |    "outputs": [],
1171 |    "source": [
1172 |     "# Now go back to \"Regression Model\" and run that to fit a phoneme-based model!"
1173 |    ]
1174 |   },
1175 |   {
1176 |    "cell_type": "markdown",
1177 |    "metadata": {},
1178 |    "source": [
1179 |     "### "
1180 |    ]
1181 |   }
1182 |  ],
1183 |  "metadata": {
1184 |   "kernelspec": {
1185 |    "display_name": "Python 3",
1186 |    "language": "python",
1187 |    "name": "python3"
1188 |   },
1189 |   "language_info": {
1190 |    "codemirror_mode": {
1191 |     "name": "ipython",
1192 |     "version": 3
1193 |    },
1194 |    "file_extension": ".py",
1195 |    "mimetype": "text/x-python",
1196 |    "name": "python",
1197 |    "nbconvert_exporter": "python",
1198 |    "pygments_lexer": "ipython3",
1199 |    "version": "3.6.5"
1200 |   }
1201 |  },
1202 |  "nbformat": 4,
1203 |  "nbformat_minor": 1
1204 | }
1205 | 


--------------------------------------------------------------------------------
/dsutils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import itertools as itools
  3 | from DataSequence import DataSequence
  4 | 
  5 | DEFAULT_BAD_WORDS = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])
  6 | 
  7 | def make_word_ds(grids, trfiles, bad_words=DEFAULT_BAD_WORDS):
  8 |     """Creates DataSequence objects containing the words from each grid, with any words appearing
  9 |     in the [bad_words] set removed.
 10 |     """
 11 |     ds = dict()
 12 |     stories = grids.keys()
 13 |     for st in stories:
 14 |         grtranscript = grids[st].tiers[1].make_simple_transcript()
 15 |         ## Filter out bad words
 16 |         goodtranscript = [x for x in grtranscript
 17 |                           if x[2].lower().strip("{}").strip() not in bad_words]
 18 |         d = DataSequence.from_grid(goodtranscript, trfiles[st][0])
 19 |         ds[st] = d
 20 | 
 21 |     return ds
 22 | 
 23 | def make_phoneme_ds(grids, trfiles):
 24 |     """Creates DataSequence objects containing the phonemes from each grid.
 25 |     """
 26 |     ds = dict()
 27 |     stories = grids.keys()
 28 |     for st in stories:
 29 |         grtranscript = grids[st].tiers[0].make_simple_transcript()
 30 |         d = DataSequence.from_grid(grtranscript, trfiles[st][0])
 31 |         ds[st] = d
 32 | 
 33 |     return ds
 34 | 
 35 | phonemes = ['AA', 'AE','AH','AO','AW','AY','B','CH','D',
 36 |     'DH', 'EH', 'ER',   'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH',
 37 |     'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
 38 |     'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']
 39 | 
 40 | def make_character_ds(grids, trfiles):
 41 |     ds = dict()
 42 |     stories = grids.keys()
 43 |     for st in stories:
 44 |         grtranscript = grids[st].tiers[2].make_simple_transcript()
 45 |         fixed_grtranscript = [(s,e,map(int, c.split(","))) for s,e,c in grtranscript if c]
 46 |         d = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
 47 |         ds[st] = d
 48 |     return ds
 49 | 
 50 | def make_dialogue_ds(grids, trfiles):
 51 |     ds = dict()
 52 |     for st, gr in grids.iteritems():
 53 |         grtranscript = gr.tiers[3].make_simple_transcript()
 54 |         fixed_grtranscript = [(s,e,c) for s,e,c in grtranscript if c]
 55 |         ds[st] = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
 56 |     return ds
 57 | 
 58 | def histogram_phonemes(ds, phonemeset=phonemes):
 59 |     """Histograms the phonemes in the DataSequence [ds].
 60 |     """
 61 |     olddata = ds.data
 62 |     N = len(ds.data)
 63 |     newdata = np.zeros((N, len(phonemeset)))
 64 |     phind = dict(enumerate(phonemeset))
 65 |     for ii,ph in enumerate(olddata):
 66 |         try:
 67 |             #ind = phonemeset.index(ph.upper().strip("0123456789"))
 68 |             ind = phind[ph.upper().strip("0123456789")]
 69 |             newdata[ii][ind] = 1
 70 |         except Exception as e:
 71 |             pass
 72 | 
 73 |     return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times)
 74 | 
 75 | def histogram_phonemes2(ds, phonemeset=phonemes):
 76 |     """Histograms the phonemes in the DataSequence [ds].
 77 |     """
 78 |     olddata = np.array([ph.upper().strip("0123456789") for ph in ds.data])
 79 |     newdata = np.vstack([olddata==ph for ph in phonemeset]).T
 80 |     return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times)
 81 | 
 82 | def make_semantic_model(ds, lsasm):
 83 |     newdata = []
 84 |     for w in ds.data:
 85 |         try:
 86 |             v = lsasm[w]
 87 |         except KeyError as e:
 88 |             v = np.zeros((lsasm.data.shape[0],))
 89 |         newdata.append(v)
 90 |     return DataSequence(np.array(newdata), ds.split_inds, ds.data_times, ds.tr_times)
 91 | 
 92 | def make_character_model(dss):
 93 |     """Make character indicator model for a dict of datasequences.
 94 |     """
 95 |     stories = dss.keys()
 96 |     storychars = dict([(st,np.unique(np.hstack(ds.data))) for st,ds in dss.iteritems()])
 97 |     total_chars = sum(map(len, storychars.values()))
 98 |     char_inds = dict()
 99 |     ncharsdone = 0
100 |     for st in stories:
101 |         char_inds[st] = dict(zip(storychars[st], range(ncharsdone, ncharsdone+len(storychars[st]))))
102 |         ncharsdone += len(storychars[st])
103 | 
104 |     charmodels = dict()
105 |     for st,ds in dss.iteritems():
106 |         charmat = np.zeros((len(ds.data), total_chars))
107 |         for ti,charlist in enumerate(ds.data):
108 |             for char in charlist:
109 |                 charmat[ti, char_inds[st][char]] = 1
110 |         charmodels[st] = DataSequence(charmat, ds.split_inds, ds.data_times, ds.tr_times)
111 | 
112 |     return charmodels, char_inds
113 | 
114 | def make_dialogue_model(ds):
115 |     return DataSequence(np.ones((len(ds.data),1)), ds.split_inds, ds.data_times, ds.tr_times)
116 | 
117 | def modulate(ds, vec):
118 |     """Multiplies each row (each word/phoneme) by the corresponding value in [vec].
119 |     """
120 |     return DataSequence((ds.data.T*vec).T, ds.split_inds, ds.data_times, ds.tr_times)
121 | 
122 | def catmats(*seqs):
123 |     keys = seqs[0].keys()
124 |     return dict([(k, DataSequence(np.hstack([s[k].data for s in seqs]), seqs[0][k].split_inds)) for k in keys])
125 | 


--------------------------------------------------------------------------------
/english1000.py:
--------------------------------------------------------------------------------
1 | english1000words = sorted(set([w.lower() for w in """a, about, above, across, act, active, activity, add, afraid, after, again, age, ago, agree, air, all, alone, along, already, always, am, amount, an, and, angry, another, answer, any, anyone, anything, appear, apple, are, area, arm, army, around, arrive, art, as, ask, at, attack, aunt, autumn, away, baby, base, back, bad, bag, ball, bank, basket, bath, be, bear, beautiful, beer, bed, bedroom, behave, before, begin, behind, bell, below, besides, best, better, between, big, bird, birth, birthday, bit, bite, black, block, blood, blow, blue, board, boat, body, boil, bone, book, border, born, borrow, both, bottle, bottom, bowl, box, boy, branch, brave, bread, break, breakfast, breathe, bridge, bright, bring, brother, brown, brush, build, burn, business, bus, busy, but, buy, by, cake, call, can, candle, cap, car, card, care, careful, careless, carry, case, cat, catch, central, century, certain, chair, chance, change, chase, cheap, cheese, chicken, child, children, chocolate, choice, choose, circle, city, class, clever, clean, clear, climb, clock, cloth, clothes, cloud, cloudy, close, coffee, coat, coin, cold, collect, colour, comb, comfortable, common, compare, come, complete, computer, condition, continue, control, cook, cool, copper, corn, corner, correct, cost, contain, count, country, course, cover, crash, cross, cry, cup, cupboard, cut, dance, dangerous, dark, daughter, day, dead, decide, deep, deer, depend, desk, destroy, develop, die, different, difficult, dinner, direction, dirty, discover, dish, do, dog, door, double, down, draw, dream, dress, drink, drive, drop, dry, duck, dust, duty, each, ear, early, earn, earth, east, easy, eat, education, effect, egg, eight, either, electric, elephant, else, empty, end, enemy, enjoy, enough, enter, equal, entrance, escape, even, evening, event, ever, every, everyone, exact, everybody, examination, example, except, excited, exercise, expect, expensive, explain, extremely, eye, face, fact, fail, fall, false, family, famous, far, farm, father, fast, fat, fault, fear, feed, feel, female, fever, few, fight, fill, film, find, fine, finger, finish, fire, first, fit, five, fix, flag, flat, float, floor, flower, fly, fold, food, fool, foot, football, for, force, foreign, forest, forget, forgive, fork, form, fox, four, free, freedom, freeze, fresh, friend, friendly, from, front, fruit, full, fun, funny, furniture, further, future, game, garden, gate, general, gentleman, get, gift, give, glad, glass, go, goat, god, gold, good, goodbye, grandfather, grandmother, grass, grave, great, green, grey, ground, group, grow, gun, hair, half, hall, hammer, hand, happen, happy, hard, hat, hate, have, he, head, healthy, hear, heavy, hello, help, heart, heaven, height, help, her, here, hers, hide, high, hill, him, his, hit, hobby, hold, hole, holiday, home, hope, horse, hospital, hot, hotel, house, how, hundred, hungry, hour, hurry, husband, hurt, I, ice, idea, if, important, in, increase, inside, into, introduce, iron, invite, is, island, it, its, jelly, job, join, juice, jump, just, keep, key, kill, kind, king, kitchen, knee, knife, knock, know, ladder, lady, lamp, land, large, last, late, lately, laugh, lazy, lead, leaf, learn, leave, leg, left, lend, length, less, lesson, let, letter, library, lie, life, light, like, lion, lip, list, listen, little, live, lock, lonely, long, look, lose, lot, love, low, lower, luck, machine, main, make, male, man, many, map, mark, market, marry, matter, may, me, meal, mean, measure, meat, medicine, meet, member, mention, method, middle, milk, million, mind, minute, miss, mistake, mix, model, modern, moment, money, monkey, month, moon, more, morning, most, mother, mountain, mouth, move, much, music, must, my, name, narrow, nation, nature, near, nearly, neck, need, needle, neighbour, neither, net, never, new, news, newspaper, next, nice, night, nine, no, noble, noise, none, nor, north, nose, not, nothing, notice, now, number, obey, object, ocean, of, off, offer, office, often, oil, old, on, one, only, open, opposite, or, orange, order, other, our, out, outside, over, own, page, pain, paint, pair, pan, paper, parent, park, part, partner, party, pass, past, path, pay, peace, pen, pencil, people, per, perfect, period, person, photograph, piano, pick, picture, piece, pig, pin, pink, place, plane, plant, plastic, plate, play, please, pleased, plenty, pocket, point, poison, police, polite, pool, poor, popular, position, possible, potato, pour, power, present, press, pretty, prevent, price, prince, prison, private, prize, probably, problem, produce, promise, proper, protect, provide, public, pull, punish, pupil, push, put, queen, question, quick, quiet, quite, radio, rain, raise, reach, read, ready, real, really, receive, record, red, remember, remind, remove, rent, repair, repeat, reply, report, rest, restaurant, result, return, rice, rich, ride, right, ring, rise, road, rob, rock, room, round, rubber, rude, rule, ruler, run, rush, sad, safe, sail, salt, same, sand, save, say, school, science, search, seat, second, see, seem, sell, send, sentence, serve, seven, several, sex, shade, shadow, shake, shape, share, sharp, she, sheep, sheet, shelf, shine, ship, shirt, shoe, shoot, shop, short, should, shoulder, shout, show, sick, side, signal, silence, silly, silver, similar, simple, single, since, sing, sink, sister, sit, six, size, skill, skin, skirt, sky, sleep, slip, slow, smoke, small, smell, smile, smoke, snow, so, soap, sock, soft, some, someone, something, sometimes, son, soon, sorry, sound, soup, south, space, speak, special, speed, spell, spend, spoon, sport, spread, spring, square, stamp, stand, star, start, station, stay, steal, steam, step, still, stomach, stone, stop, store, storm, story, strange, street, strong, structure, student, study, stupid, subject, substance, successful, such, sudden, sugar, suitable, summer, sun, sunny, support, sure, surprise, sweet, swim, sword, table, take, talk, tall, taste, taxi, tea, teach, team, tear, telephone, television, tell, ten, tennis, terrible, test, than, that, the, their, then, there, therefore, these, thick, thin, thing, think, third, this, though, threat, three, tidy, tie, title, to, today, toe, together, tomorrow, tonight, too, tool, tooth, top, total, touch, town, train, travel, tree, trouble, true, trust, two, twice, try, turn, type, uncle, under, understand, unit, until, up, use, useful, usual, usually, vegetable, very, village, voice, visit, wait, wake, walk, want, warm, wash, waste, watch, water, way, we, weak, wear, weather, wedding, week, weight, welcome, well, west, wet, what, wheel, when, where, which, while, white, who, why, wide, wife, wild, will, win, wind, window, wine, winter, wire, wise, wish, with, without, woman, wonder, word, work, world, worry, worst, write, wrong, year, yes, yesterday, yet, you, young, your, zero""".split(", ")]))


--------------------------------------------------------------------------------
/features.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cPickle
  3 | import tables
  4 | import os
  5 | 
  6 | from text.story.util.dsutils import make_word_ds, make_phoneme_ds, histogram_phonemes, cstates_to_bigrams, DataSequence, makelsa, catmats, histogram_phonemes2, sliding_chunk_sum, modulate
  7 | 
  8 | from text.models.semtax.Semtax import Semtax
  9 | from text.movie.util.SemanticModel import SemanticModel
 10 | 
 11 | 
 12 | mapdict = lambda d, fun: dict(zip(d.keys(), map(fun, d.values())))
 13 | 
 14 | class Features(object):
 15 |     def __init__(self, grids, trfiles, interp="rect", **kwargs):
 16 |         """Initializes a Features object that can be used to create feature-space
 17 |         representations of the stimulus with the given [grids] and [trfiles].
 18 | 
 19 |         [interp] can be "rect" or "sinc".
 20 |         [kwargs] are passed to the interpolation function.
 21 |         """
 22 |         self.grids = grids
 23 |         self.trfiles = trfiles
 24 | 
 25 |         self.interp = interp
 26 |         self.interpargs = kwargs
 27 | 
 28 |         ## Precache word sequences and phoneme sequences for later use
 29 |         self.wordseqs = make_word_ds(grids, trfiles)
 30 |         self.phonseqs = make_phoneme_ds(grids, trfiles)
 31 | 
 32 |     def downsample(self, dsdict):
 33 |         """Downsamples each DataSequence in [dsdict] using the settings specified in the
 34 |         initializer.
 35 |         """
 36 |         return mapdict(dsdict, lambda h: h.chunksums(self.interp,
 37 |                                                      **self.interpargs))
 38 | 
 39 |     def perstory(self):
 40 |         """Simple model: a separate intercept regressor for each story.
 41 |         """
 42 |         nstories = len(self.grids)
 43 |         storymats = dict()
 44 |         for ii,st in enumerate(sorted(self.grids.keys())):
 45 |             smat = np.zeros((len(self.wordseqs[st].tr_times), nstories))
 46 |             smat[:,ii] = 1
 47 |             storymats[st] = smat
 48 |         return storymats
 49 |         
 50 |     def numwords(self):
 51 |         """Simple model: the number of words per TR.
 52 |         """
 53 |         return mapdict(self.wordseqs, lambda s: np.atleast_2d(map(len, s.chunks())).T.astype(float))
 54 | 
 55 |     def numphonemes(self):
 56 |         """Simple model: the number of phonemes per TR.
 57 |         """
 58 |         return mapdict(self.phonseqs, lambda s: np.atleast_2d(map(len, s.chunks())).T.astype(float))
 59 | 
 60 |     def phonemecounts(self, debug=False):
 61 |         """Number of times each phoneme appears per TR.
 62 |         """
 63 |         phonhists = mapdict(self.phonseqs, histogram_phonemes2)
 64 |         if debug:
 65 |             return phonhists
 66 |         return self.downsample(phonhists)
 67 |         #return mapdict(phonhists, lambda h: h.chunksums())
 68 |         
 69 |     def markov(self, log=False, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100", num=10, nC=20, debug=False):
 70 |         """Markov syntactic model. The [modeldir] and [num] will be passed to the
 71 |         function Semtax.load_from_dir.
 72 |         This function assumes that the first [nC] features are syntactic.
 73 |         """
 74 |         stmodel = Semtax.load_from_dir(modeldir, num)
 75 |         stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug
 76 |         stmodel.zphi[0] = stmodel.zphi[-1]
 77 | 
 78 |         sm = stmodel.to_SemanticModel(True)
 79 |         sm.data = sm.data[:nC] ## Limit to only syntactic part
 80 | 
 81 |         makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC],
 82 |                                          ds.split_inds,
 83 |                                          ds.data_times,
 84 |                                          ds.tr_times)
 85 |         rstimseqs = mapdict(self.wordseqs, makecs)
 86 |         if log:
 87 |             rstimseqs = mapdict(rstimseqs, lambda ds: DataSequence(np.log(ds.data+1e-10),
 88 |                                                                    ds.split_inds,
 89 |                                                                    ds.data_times,
 90 |                                                                    ds.tr_times))
 91 | 
 92 |         if debug:
 93 |             return rstimseqs
 94 |         #return mapdict(rstimseqs, lambda s: s.chunksums())
 95 |         return self.downsample(rstimseqs)
 96 | 
 97 |     def markov_bigrams(self, log=False, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100",
 98 |                        num=10, nC=20):
 99 |         """Markov bigram syntactic model. The [modeldir] and [num] will be passed to the
100 |         function Semtax.load_from_dir.
101 |         This function assumes that the first [nC] features are syntactic.
102 |         """
103 |         stmodel = Semtax.load_from_dir(modeldir, num)
104 |         stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug
105 |         stmodel.zphi[0] = stmodel.zphi[-1]
106 | 
107 |         sm = stmodel.to_SemanticModel(True)
108 |         sm.data = sm.data[:nC] ## Limit to only syntactic part
109 | 
110 |         makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC],
111 |                                          ds.split_inds,
112 |                                          ds.data_times,
113 |                                          ds.tr_times)
114 |         rstimseqs = mapdict(self.wordseqs, makecs)
115 |         bigramseqs = mapdict(rstimseqs, cstates_to_bigrams)
116 |         if log:
117 |             bigramseqs = mapdict(bigramseqs, lambda ds: DataSequence(np.log(ds.data+1e-10),
118 |                                                                      ds.split_inds,
119 |                                                                      ds.data_times,
120 |                                                                      ds.tr_times))
121 | 
122 |         #return mapdict(bigramseqs, lambda s: s.chunksums())
123 |         return self.downsample(bigramseqs)
124 | 
125 |     def markov_bigram_ics(self, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100",
126 |                        num=10, nC=20, icfile="/auto/k8/huth/storydata/transmat-ics-150-wbooks-2.hf5"):
127 |         """Markov bigram IC syntactic model. The [modeldir] and [num] will be passed to the
128 |         function Semtax.load_from_dir.
129 |         This function assumes that the first [nC] features are syntactic.
130 |         """
131 |         stmodel = Semtax.load_from_dir(modeldir, num)
132 |         stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug
133 |         stmodel.zphi[0] = stmodel.zphi[-1]
134 | 
135 |         sm = stmodel.to_SemanticModel(True)
136 |         sm.data = sm.data[:nC] ## Limit to only syntactic part
137 | 
138 |         makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC],
139 |                                          ds.split_inds,
140 |                                          ds.data_times,
141 |                                          ds.tr_times)
142 |         rstimseqs = mapdict(self.wordseqs, makecs)
143 |         bigramseqs = mapdict(rstimseqs, cstates_to_bigrams)
144 |         logbigramseqs = mapdict(bigramseqs, lambda ds: DataSequence(np.log(ds.data+1e-10),
145 |                                                                     ds.split_inds,
146 |                                                                     ds.data_times,
147 |                                                                     ds.tr_times))
148 | 
149 |         bgics = tables.openFile(icfile).root.ics.read()
150 |         projics = lambda ds: DataSequence(np.dot(bgics, sliding_chunk_sum(ds.data, 7).T).T,
151 |                                           ds.split_inds,
152 |                                           ds.data_times,
153 |                                           ds.tr_times)
154 |         bgicseqs = mapdict(logbigramseqs, projics)
155 | 
156 |         #return mapdict(bgicseqs, lambda s: s.chunksums())
157 |         return self.downsample(bgicseqs)
158 | 
159 |     def lsa(self, ndim, rectify, zsaxes=(1,), basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2", debug=False):
160 |         """LSA semantic model.
161 |         """
162 |         vocab = cPickle.load(open(basepath+"-vocab"))
163 |         lsasm = SemanticModel(None, None)
164 |         lsasm.load_ascii_root(basepath+"-Vt", vocab)
165 |         lsasm.data = lsasm.data[:ndim]
166 | 
167 |         for axis in zsaxes:
168 |             lsasm.zscore(axis)
169 | 
170 |         if rectify:
171 |             lsasm.rectify()
172 | 
173 |         lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))
174 |         #return mapdict(lsastimseqs, lambda s: s.chunksums())
175 |         if debug:
176 |             return lsastimseqs
177 |         return self.downsample(lsastimseqs)
178 | 
179 |     @staticmethod
180 |     def get_newlsa_model(ndim, rectify, entweight, entcutoff=5, basepath="/auto/k6/huth/lsamats6/"):
181 |         """Returns a new LSA semantic model.
182 |         """
183 |         entropyfilename = os.path.join(basepath, "globnorm_lsa1_1.npy")
184 |         modelfilename = os.path.join(basepath, "evd1.hf5")
185 | 
186 |         entropy = np.load(entropyfilename)
187 |         lsafile = tables.openFile(modelfilename)
188 | 
189 |         Q = lsafile.root.Q.read()
190 |         vocab = lsafile.root.vocab.read()
191 | 
192 |         if entweight:
193 |             lsasm = SemanticModel(Q[:,-ndim:].T * (np.clip(entropy, entcutoff, np.inf)**-1), vocab)
194 |         else:
195 |             lsasm = SemanticModel(Q[:,-ndim:].T, vocab)
196 | 
197 |         if rectify:
198 |             lsasm.rectify()
199 | 
200 |         ## Store entropies in there as well
201 |         lsasm.wordentropy = entropy
202 | 
203 |         lsafile.close()
204 |         
205 |         return lsasm
206 | 
207 |     def newlsa(self, ndim, rectify, entweight, entcutoff=5, basepath="/auto/k6/huth/lsamats6/", debug=False):
208 |         """New LSA semantic model.
209 |         """
210 |         lsasm = self.get_newlsa_model(ndim, rectify, entweight, entcutoff, basepath)
211 |         lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))
212 | 
213 |         if debug:
214 |             return lsastimseqs
215 |         return self.downsample(lsastimseqs)
216 | 
217 |     def hal(self, wordset="verbs", zsaxes=(0,1), rectify=False,
218 |             basepath="/auto/k8/huth/storydata/story+books+wiki+15w-densehal-mat", debug=False):
219 |         """HAL semantic model (without dimensionality reduction).
220 |         """
221 |         from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000
222 |         haltf = tables.openFile(basepath+".hf5")
223 |         halmat = np.array(haltf.root.halmat.read())
224 |         halvocab = cPickle.load(open(basepath+"-vocab"))
225 | 
226 |         ## Choose a wordset
227 |         if wordset=="verbs":
228 |             wordset = verb_set
229 |         elif wordset=="cmuverbs":
230 |             wordset = verb_set[:23]
231 |         elif wordset=="english1000":
232 |             wordset = english1000
233 |         
234 |         halsm = make_hal_sm(halmat, halvocab, wordset)
235 | 
236 |         for axis in zsaxes:
237 |             halsm.zscore(axis)
238 | 
239 |         if rectify:
240 |             halsm.rectify()
241 | 
242 |         halstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, halsm))
243 |         #return mapdict(halstimseqs, lambda s: s.chunksums())
244 |         if debug:
245 |             return halstimseqs
246 |         return self.downsample(halstimseqs)
247 | 
248 |     @staticmethod
249 |     def get_co_model(wordset="english1000", zsaxes=(0,1), rectify=False,
250 |                      basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"):
251 |         """Co-occurence-based semantic model (without dimensionality reduction).
252 |         """
253 |         from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000
254 |         cotf = tables.openFile(basepath+".hf5")
255 |         comat = np.array(cotf.root.mat.read())
256 |         covocab = cPickle.load(open(basepath+"-vocab"))
257 | 
258 |         ## Choose a wordset
259 |         if wordset=="verbs":
260 |             wordset = verb_set
261 |         elif wordset=="cmuverbs":
262 |             wordset = verb_set[:23]
263 |         elif wordset=="english1000":
264 |             wordset = english1000
265 |         elif wordset=="story":
266 |             wordset = [[w] for w in cPickle.load(open("/auto/k1/huth/text/story/storyvocab_2013.pickle"))]
267 |         
268 |         cosm = make_hal_sm(comat, covocab, wordset)
269 | 
270 |         for axis in zsaxes:
271 |             cosm.zscore(axis)
272 | 
273 |         if rectify:
274 |             cosm.rectify()
275 | 
276 |         return cosm
277 |         
278 | 
279 |     def co(self, wordset="english1000", zsaxes=(0,1), rectify=False,
280 |            basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat", debug=False):
281 |         """Co-occurence-based semantic model (without dimensionality reduction).
282 |         """
283 |         cosm = self.get_co_model(wordset, zsaxes, rectify, basepath)
284 |         costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, cosm))
285 |         #return mapdict(halstimseqs, lambda s: s.chunksums())
286 |         if debug:
287 |             return costimseqs
288 |         return self.downsample(costimseqs)
289 | 
290 |     @staticmethod
291 |     def get_orthogonal_co_model(wordset="english1000", zsaxes=(0,1), rectify=False,
292 |                                 basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat",
293 |                                 debug=False):
294 |         """Co-occurence-based semantic model with pre-whitening.
295 |         """
296 |         cosm = Features.get_co_model(wordset, zsaxes, rectify, basepath)
297 |         ## Orthogonalize cosm data
298 |         from text.movie.util.util import make_delayed, save_table_file, eigprincomp
299 |         coc, col = eigprincomp(cosm.data.T)
300 |         ## Flip so that first value on each component is positive (makes result deterministic)
301 |         fcoc = (coc.T * np.sign(coc[:,0])).T
302 |         ## Make new orthogonal cosm
303 |         ocosm = cosm.copy()
304 |         ocosm.data = np.dot(fcoc, cosm.data)
305 |         return ocosm
306 | 
307 |     def orthogonal_co(self, wordset="english1000", zsaxes=(0,1), rectify=False,
308 |                       basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat",
309 |                       debug=False):
310 |         """Co-occurence-based semantic model with pre-whitening.
311 |         """
312 |         ocosm = self.get_orthogonal_co_model(wordset, zsaxes, rectify, basepath)
313 |         costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, ocosm))
314 |         #return mapdict(halstimseqs, lambda s: s.chunksums())
315 |         if debug:
316 |             return costimseqs
317 |         return self.downsample(costimseqs)
318 | 
319 |     def commonwords(self, num=100, basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2-vocab"):
320 |         """Common word indicator model. Based on old LSA model fitting, used less data.
321 |         """
322 |         vocab = cPickle.load(open(basepath))
323 |         counts = cPickle.load(open(basepath+"-Rcounts"))
324 |         selwords = np.argsort(counts)[-num:]
325 |         wmodel = SemanticModel(np.eye(num), list(np.array(vocab)[selwords]))
326 | 
327 |         wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
328 |         #return mapdict(wordstimseqs, lambda s: s.chunksums())
329 |         return self.downsample(wordstimseqs)
330 | 
331 |     def commonwords2(self, num=100, basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"):
332 |         """Common word indicator model. Base on newer co model fitting, using more data.
333 |         """
334 |         cotf = tables.openFile(basepath+".hf5")
335 |         counts = cotf.root.wordcounts.read()
336 |         covocab = cPickle.load(open(basepath+"-vocab"))
337 |         selwords = np.argsort(counts)[-num:]
338 |         wmodel = SemanticModel(np.eye(num), list(np.array(covocab)[selwords]))
339 | 
340 |         wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
341 |         return self.downsample(wordstimseqs)
342 | 
343 |     def allwords(self):
344 |         """All word indicator model.
345 |         """
346 |         from text.textcore import Corpus
347 |         corpus_file = "/auto/k5/huth/corpora/story/raw-transcripts/stories1.tar.gz"
348 |         corpus = Corpus(corpus_file, split_documents=200)
349 |         corpus_file1 = "/auto/k5/huth/corpora/story/raw-transcripts/stories2.tar.gz"
350 |         corpus.append_corpus(corpus_file1)
351 | 
352 |         storyvocab = sorted(list(set(corpus.get_vocabulary())))
353 |         num = len(storyvocab)
354 |         wmodel = SemanticModel(np.eye(num), list(np.array(storyvocab)))
355 | 
356 |         wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
357 |         #return mapdict(wordstimseqs, lambda s: s.chunksums())
358 |         return self.downsample(wordstimseqs)
359 | 
360 |     def nmflsa(self):
361 |         """NMF LSA model based on newLSA.
362 |         """
363 |         tf = tables.openFile("/auto/k6/huth/nmf-lsa.hf5")
364 |         vocab = tf.root.vocab.read()
365 |         data = tf.root.data.read()
366 |         nmodel = SemanticModel(data, vocab)
367 |         wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, nmodel))
368 |         #return mapdict(wordstimseqs, lambda s: s.chunksums())
369 |         return self.downsample(wordstimseqs)
370 | 
371 |     def surprisal(self, template="/auto/k5/huth/story-surprisal/%s.npy", prob=False, debug=False):
372 |         """Word surprisal model.
373 |         """
374 |         ## Load surprisal for each story
375 |         sseqs = dict()
376 |         for story,wseq in self.wordseqs.iteritems():
377 |             surprisal = np.load(template%story)
378 |             if prob:
379 |                 d = 1-np.atleast_2d(surprisal).T
380 |             else:
381 |                 d = -np.log2(np.atleast_2d(surprisal).T)
382 |                 d[np.isinf(d)] = 100
383 |             sseq = DataSequence(d,
384 |                                 wseq.split_inds,
385 |                                 wseq.data_times,
386 |                                 wseq.tr_times)
387 |             sseqs[story] = sseq
388 | 
389 |         if debug:
390 |             return sseqs
391 |         else:
392 |             return self.downsample(sseqs)
393 | 
394 |     def sphal(self, halargs, spargs, debug=False):
395 |         """HAL model modulated by surprisal.
396 |         """
397 |         halargs["debug"] = True
398 |         halseqs = self.hal(**halargs)
399 |         spargs["debug"] = True
400 |         spargs["prob"] = True
401 |         spseqs = self.surprisal(**spargs)
402 | 
403 |         modhal = dict([(st, modulate(ds, spseqs[st].data[:,0])) for (st,ds) in halseqs.items()])
404 |         
405 |         if debug:
406 |             return modhal
407 |         return self.downsample(modhal)
408 | 
409 |     @classmethod
410 |     def _get_word2vec_model(cls, modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin",
411 |                             norm=False):
412 |         from gensim.models.word2vec import Word2Vec
413 |         model = Word2Vec.load_word2vec_format(modelfile, binary=True)
414 |         usevocab = set(cPickle.load(open("/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat-vocab")))
415 |         vocab, vocinds = zip(*[(w, model.vocab[w].index) for w in usevocab if w in model.vocab])
416 |         #w2v_usevocab = [(w,val.index) for w,val in w2v.vocab.items() if w in usevocab]
417 |         #srtvocab = [w for w,voc in sorted(w2v.vocab.items(), key=lambda item:item[1].index)]
418 |         #srtvocab,srtinds = zip(*sorted(w2v_usevocab, key=lambda item:item[1]))
419 |         if norm:
420 |             data = model.syn0norm[list(vocinds)]
421 |         else:
422 |             data = model.syn0[list(vocinds)]
423 | 
424 |         w2vsm = SemanticModel(data.T, vocab)
425 |         return w2vsm
426 | 
427 |     @classmethod
428 |     def get_word2vec_model(cls, *args, **kwargs):
429 |         if "_w2v_cache" not in dir(cls):
430 |             cls._w2v_cache = cls._get_word2vec_model(*args, **kwargs)
431 |         return cls._w2v_cache
432 |     
433 |     def word2vec(self, modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin", norm=False):
434 |         """GenSim / word2vec model.
435 |         """
436 |         model = self.get_word2vec_model(modelfile, norm)
437 |         #modeldims = model["test"].shape[0]
438 |         #model.data = np.zeros((modeldims,))
439 |         w2vstims = mapdict(self.wordseqs, lambda ds: makelsa(ds, model))
440 |         return self.downsample(w2vstims)
441 | 
442 |     def emoratings(self, subjects=("ah", "ds", "jg", "wh", "ml"), smoothing=1.0):
443 |         from text.story.emotions import util
444 |         storyemolevels = util.load_story_ratings(subjects, self.grids)
445 |         return util.story_interp_grids(subjects, self.grids, self.trfiles,
446 |                                        storyemolevels, [smoothing])
447 | 


--------------------------------------------------------------------------------
/interpdata.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import logging
  3 | 
  4 | logger = logging.getLogger("text.regression.interpdata")
  5 | 
  6 | def interpdata(data, oldtime, newtime):
  7 |     """Interpolates the columns of [data] to find the values at [newtime], given that the current
  8 |     values are at [oldtime].  [oldtime] must have the same number of elements as [data] has rows.
  9 |     """
 10 |     ## Check input sizes ##
 11 |     if not len(oldtime) == data.shape[0]:
 12 |         raise IndexError("oldtime must have same number of elements as data has rows.")
 13 |     
 14 |     ## Set up matrix to hold output ##
 15 |     newdata = np.empty((len(newtime), data.shape[1]))
 16 |     
 17 |     ## Interpolate each column of data ##
 18 |     for ci in range(data.shape[1]):
 19 |         if (ci%100) == 0:
 20 |             logger.info("Interpolating column %d/%d.." % (ci+1, data.shape[1]))
 21 |         
 22 |         newdata[:,ci] = np.interp(newtime, oldtime, data[:,ci])
 23 |     
 24 |     ## Return interpolated data ##
 25 |     return newdata
 26 | 
 27 | def sincinterp1D(data, oldtime, newtime, cutoff_mult=1.0, window=1):
 28 |     """Interpolates the one-dimensional signal [data] at the times given by [newtime], assuming
 29 |     that each sample in [data] was collected at the corresponding time in [oldtime]. Clearly,
 30 |     [oldtime] and [data] must have the same length, but [newtime] can have any length.
 31 |     
 32 |     This function will assume that the time points in [newtime] are evenly spaced and will use
 33 |     that frequency multipled by [cutoff_mult] as the cutoff frequency of the sinc filter.
 34 |     
 35 |     The sinc function will be computed with [window] lobes.  With [window]=1, this will
 36 |     effectively compute the Lanczos filter.
 37 |     
 38 |     This is a very simplistic filtering algorithm, so will take O(N*M) time, where N is the
 39 |     length of [oldtime] and M is the length of [newtime].
 40 |     
 41 |     This filter is non-causal.
 42 |     """
 43 |     ## Find the cutoff frequency ##
 44 |     cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult
 45 |     print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes." % (cutoff, window))
 46 |     
 47 |     ## Construct new signal ##
 48 |     newdata = np.zeros((len(newtime),1))
 49 |     for ndi in range(len(newtime)):
 50 |         for di in range(len(oldtime)):
 51 |             newdata[ndi] += sincfun(cutoff, newtime[ndi]-oldtime[di], window) * data[di]
 52 |     return newdata
 53 | 
 54 | def sincinterp2D(data, oldtime, newtime, cutoff_mult=1.0, window=1, causal=False, renorm=True):
 55 |     """Interpolates the columns of [data], assuming that the i'th row of data corresponds to
 56 |     oldtime(i).  A new matrix with the same number of columns and a number of rows given
 57 |     by the length of [newtime] is returned.  If [causal], only past time points will be used
 58 |     to computed the present value, and future time points will be ignored.
 59 |     
 60 |     The time points in [newtime] are assumed to be evenly spaced, and their frequency will
 61 |     be used to calculate the low-pass cutoff of the sinc interpolation filter.
 62 |     
 63 |     [window] lobes of the sinc function will be used.  [window] should be an integer.
 64 |     """
 65 |     ## Find the cutoff frequency ##
 66 |     cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult
 67 |     print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes." % (cutoff, window))
 68 |     
 69 |     ## Construct new signal ##
 70 |     # newdata = np.zeros((len(newtime), data.shape[1]))
 71 |     # for ndi in range(len(newtime)):
 72 |     #         for di in range(len(oldtime)):
 73 |     #             newdata[ndi,:] += sincfun(cutoff, newtime[ndi]-oldtime[di], window, causal) * data[di,:]
 74 |     
 75 |     ## Build up sinc matrix ##
 76 |     sincmat = np.zeros((len(newtime), len(oldtime)))
 77 |     for ndi in range(len(newtime)):
 78 |         sincmat[ndi,:] = sincfun(cutoff, newtime[ndi]-oldtime, window, causal, renorm)
 79 |     
 80 |     ## Construct new signal by multiplying the sinc matrix by the data ##
 81 |     newdata = np.dot(sincmat, data)
 82 | 
 83 |     return newdata
 84 | 
 85 | def lanczosinterp2D(data, oldtime, newtime, window=3, cutoff_mult=1.0, rectify=False):
 86 |     """Interpolates the columns of [data], assuming that the i'th row of data corresponds to
 87 |     oldtime(i). A new matrix with the same number of columns and a number of rows given
 88 |     by the length of [newtime] is returned.
 89 |     
 90 |     The time points in [newtime] are assumed to be evenly spaced, and their frequency will
 91 |     be used to calculate the low-pass cutoff of the interpolation filter.
 92 |     
 93 |     [window] lobes of the sinc function will be used. [window] should be an integer.
 94 |     """
 95 |     ## Find the cutoff frequency ##
 96 |     cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult
 97 |     print ("Doing lanczos interpolation with cutoff=%0.3f and %d lobes." % (cutoff, window))
 98 |     
 99 |     ## Build up sinc matrix ##
100 |     sincmat = np.zeros((len(newtime), len(oldtime)))
101 |     for ndi in range(len(newtime)):
102 |         sincmat[ndi,:] = lanczosfun(cutoff, newtime[ndi]-oldtime, window)
103 |     
104 |     if rectify:
105 |         newdata = np.hstack([np.dot(sincmat, np.clip(data, -np.inf, 0)), 
106 |                             np.dot(sincmat, np.clip(data, 0, np.inf))])
107 |     else:
108 |         ## Construct new signal by multiplying the sinc matrix by the data ##
109 |         newdata = np.dot(sincmat, data)
110 | 
111 |     return newdata
112 | 
113 | def sincupinterp2D(data, oldtime, newtimes, cutoff, window=1):
114 |     """Uses sinc interpolation to upsample the columns of [data], assuming that the i'th
115 |     row of data comes from oldtime[i].  A new matrix with the same number of columns
116 |     and a number of rows given by the length of [newtime] is returned.
117 | 
118 |     The times points in [oldtime] are assumed to be evenly spaced, and their frequency
119 |     will be used to calculate the low-pass cutoff of the sinc interpolation filter.
120 | 
121 |     [window] lobes of the sinc function will be used.  [window] should be an integer.
122 |     Setting [window] to 1 yields a Lanczos filter.
123 |     """
124 |     #cutoff = 1/np.mean(np.diff(oldtime))
125 |     print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes."%(cutoff, window))
126 |     
127 |     sincmat = np.zeros((len(newtimes), len(oldtime)))
128 |     for ndi in range(len(newtimes)):
129 |         sincmat[ndi,:] = sincfun(cutoff, newtimes[ndi]-oldtime, window, False)
130 | 
131 |     newdata = np.dot(sincmat, data)
132 |     return newdata
133 | 
134 | def sincfun(B, t, window=np.inf, causal=False, renorm=True):
135 |     """Compute the sinc function with some cutoff frequency [B] at some time [t].
136 |     [t] can be a scalar or any shaped numpy array.
137 |     If given a [window], only the lowest-order [window] lobes of the sinc function
138 |     will be non-zero.
139 |     If [causal], only past values (i.e. t<0) will have non-zero weights.
140 |     """
141 |     val = 2*B*np.sin(2*np.pi*B*t)/(2*np.pi*B*t+1e-20)
142 |     if t.shape:
143 |         val[np.abs(t)>window/(2*B)] = 0
144 |         if causal:
145 |             val[t<0] = 0
146 |         if not np.sum(val)==0.0 and renorm:
147 |             val = val/np.sum(val)
148 |     elif np.abs(t)>window/(2*B):
149 |         val = 0
150 |         if causal and t<0:
151 |             val = 0
152 |     return val
153 | 
154 | def lanczosfun(cutoff, t, window=3):
155 |     """Compute the lanczos function with some cutoff frequency [B] at some time [t].
156 |     [t] can be a scalar or any shaped numpy array.
157 |     If given a [window], only the lowest-order [window] lobes of the sinc function
158 |     will be non-zero.
159 |     """
160 |     t = t * cutoff
161 |     val = window * np.sin(np.pi*t) * np.sin(np.pi*t/window) / (np.pi**2 * t**2)
162 |     val[t==0] = 1.0
163 |     val[np.abs(t)>window] = 0.0
164 |     return val# / (val.sum() + 1e-10)
165 | 
166 | def expinterp2D(data, oldtime, newtime, theta):
167 |     intmat = np.zeros((len(newtime), len(oldtime)))
168 |     for ndi in range(len(newtime)):
169 |         intmat[ndi,:] = expfun(theta, newtime[ndi]-oldtime)
170 |     
171 |     ## Construct new signal by multiplying the sinc matrix by the data ##
172 |     newdata = np.dot(intmat, data)
173 |     return newdata
174 | 
175 | def expfun(theta, t):
176 |     """Computes an exponential weighting function for interpolation.
177 |     """
178 |     val = np.exp(-t*theta)
179 |     val[t<0] = 0.0
180 |     if not np.sum(val)==0.0:
181 |         val = val/np.sum(val)
182 |     return val
183 | 
184 | def gabor_xfm(data, oldtimes, newtimes, freqs, sigma):
185 |     sinvals = np.vstack([np.sin(oldtimes*f*2*np.pi) for f in freqs])
186 |     cosvals = np.vstack([np.cos(oldtimes*f*2*np.pi) for f in freqs])
187 |     outvals = np.zeros((len(newtimes), len(freqs)), dtype=np.complex128)
188 |     for ti,t in enumerate(newtimes):
189 |         ## Build gaussian function
190 |         gaussvals = np.exp(-0.5*(oldtimes-t)**2/(2*sigma**2))*data
191 |         ## Take product with sin/cos vals
192 |         sprod = np.dot(sinvals, gaussvals)
193 |         cprod = np.dot(cosvals, gaussvals)
194 |         ## Store the output
195 |         outvals[ti,:] = cprod + 1j*sprod
196 | 
197 |     return outvals
198 | 
199 | def gabor_xfm2D(ddata, oldtimes, newtimes, freqs, sigma):
200 |     return np.vstack([gabor_xfm(d, oldtimes, newtimes, freqs, sigma).T for d in ddata])
201 | 
202 | def test_interp(**kwargs):
203 |     """Tests sincinterp2D passing it the given [kwargs] and interpolating known signals 
204 |     between the two time domains.
205 |     """
206 |     oldtime = np.linspace(0, 10, 100)
207 |     newtime = np.linspace(0, 10, 49)
208 |     data = np.zeros((4, 100))
209 |     ## The first row has a single nonzero value
210 |     data[0,50] = 1.0
211 |     ## The second row has a few nonzero values in a row
212 |     data[1,45:55] = 1.0
213 |     ## The third row has a few nonzero values separated by zeros
214 |     data[2,40:45] = 1.0
215 |     data[2,55:60] = 1.0
216 |     ## The fourth row has different values
217 |     data[3,40:45] = 1.0
218 |     data[3,55:60] = 2.0
219 |     
220 |     ## Interpolate the data
221 |     interpdata = sincinterp2D(data.T, oldtime, newtime, **kwargs).T
222 |     
223 |     ## Plot the results
224 |     from matplotlib.pyplot import figure, show
225 |     fig = figure()
226 |     for d in range(4):
227 |         ax = fig.add_subplot(4,1,d+1)
228 |         ax.plot(newtime, interpdata[d,:], 'go-')
229 |         ax.plot(oldtime, data[d,:], 'bo-')
230 |         
231 |         #ax.tight()
232 |     show()
233 |     return newtime, interpdata
234 | 


--------------------------------------------------------------------------------
/npp.py:
--------------------------------------------------------------------------------
 1 | """This module contains one line functions that should, by all rights, by in numpy.
 2 | """
 3 | import numpy as np
 4 | 
 5 | ## Demean -- remove the mean from each column
 6 | demean = lambda v: v-v.mean(0)
 7 | demean.__doc__ = """Removes the mean from each column of [v]."""
 8 | dm = demean
 9 | 
10 | ## Z-score -- z-score each column
11 | zscore = lambda v: (v-v.mean(0))/v.std(0)
12 | zscore.__doc__ = """Z-scores (standardizes) each column of [v]."""
13 | zs = zscore
14 | 
15 | ## Rescale -- make each column have unit variance
16 | rescale = lambda v: v/v.std(0)
17 | rescale.__doc__ = """Rescales each column of [v] to have unit variance."""
18 | rs = rescale
19 | 
20 | ## Matrix corr -- find correlation between each column of c1 and the corresponding column of c2
21 | mcorr = lambda c1,c2: (zs(c1)*zs(c2)).mean(0)
22 | mcorr.__doc__ = """Matrix correlation. Find the correlation between each column of [c1] and the corresponding column of [c2]."""
23 | 
24 | ## Cross corr -- find corr. between each row of c1 and EACH row of c2
25 | xcorr = lambda c1,c2: np.dot(zs(c1.T).T,zs(c2.T)) / (c1.shape[1])
26 | xcorr.__doc__ = """Cross-column correlation. Finds the correlation between each row of [c1] and each row of [c2]."""
27 | 


--------------------------------------------------------------------------------
/ridge.py:
--------------------------------------------------------------------------------
  1 | #import scipy
  2 | from functools import reduce
  3 | import numpy as np
  4 | import logging
  5 | from utils import mult_diag, counter
  6 | import random
  7 | import itertools as itools
  8 | 
  9 | zs = lambda v: (v-v.mean(0))/v.std(0) ## z-score function
 10 | 
 11 | 
 12 | def ridge_corr(Rstim, Pstim, Rresp, Presp, alphas, normalpha=False, dtype=np.single, corrmin=0.2,
 13 |                singcutoff=1e-10, use_corr=True, logger=logging.getLogger("ridge_corr")):
 14 |     """Uses ridge regression to find a linear transformation of [Rstim] that approximates [Rresp].
 15 |     Then tests by comparing the transformation of [Pstim] to [Presp]. This procedure is repeated
 16 |     for each regularization parameter alpha in [alphas]. The correlation between each prediction and
 17 |     each response for each alpha is returned. Note that the regression weights are NOT returned.
 18 | 
 19 |     Parameters
 20 |     ----------
 21 |     Rstim : array_like, shape (TR, N)
 22 |         Training stimuli with TR time points and N features. Each feature should be Z-scored across time.
 23 |     Pstim : array_like, shape (TP, N)
 24 |         Test stimuli with TP time points and N features. Each feature should be Z-scored across time.
 25 |     Rresp : array_like, shape (TR, M)
 26 |         Training responses with TR time points and M responses (voxels, neurons, what-have-you).
 27 |         Each response should be Z-scored across time.
 28 |     Presp : array_like, shape (TP, M)
 29 |         Test responses with TP time points and M responses.
 30 |     alphas : list or array_like, shape (A,)
 31 |         Ridge parameters to be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well.
 32 |     normalpha : boolean
 33 |         Whether ridge parameters should be normalized by the Frobenius norm of Rstim. Good for
 34 |         comparing models with different numbers of parameters.
 35 |     dtype : np.dtype
 36 |         All data will be cast as this dtype for computation. np.single is used by default for memory
 37 |         efficiency.
 38 |     corrmin : float in [0..1]
 39 |         Purely for display purposes. After each alpha is tested, the number of responses with correlation
 40 |         greater than corrmin minus the number of responses with correlation less than negative corrmin
 41 |         will be printed. For long-running regressions this vague metric of non-centered skewness can
 42 |         give you a rough sense of how well the model is working before it's done.
 43 |     singcutoff : float
 44 |         The first step in ridge regression is computing the singular value decomposition (SVD) of the
 45 |         stimulus Rstim. If Rstim is not full rank, some singular values will be approximately equal
 46 |         to zero and the corresponding singular vectors will be noise. These singular values/vectors
 47 |         should be removed both for speed (the fewer multiplications the better!) and accuracy. Any
 48 |         singular values less than singcutoff will be removed.
 49 |     use_corr : boolean
 50 |         If True, this function will use correlation as its metric of model fit. If False, this function
 51 |         will instead use variance explained (R-squared) as its metric of model fit. For ridge regression
 52 |         this can make a big difference -- highly regularized solutions will have very small norms and
 53 |         will thus explain very little variance while still leading to high correlations, as correlation
 54 |         is scale-free while R**2 is not.
 55 | 
 56 |     Returns
 57 |     -------
 58 |     Rcorrs : array_like, shape (A, M)
 59 |         The correlation between each predicted response and each column of Presp for each alpha.
 60 |     
 61 |     """
 62 |     ## Calculate SVD of stimulus matrix
 63 |     logger.info("Doing SVD...")
 64 |     try:
 65 |         U,S,Vh = np.linalg.svd(Rstim, full_matrices=False)
 66 |     except np.linalg.LinAlgError as e:
 67 |         logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
 68 |         from text.regression.svd_dgesvd import svd_dgesvd
 69 |         U,S,Vh = svd_dgesvd(Rstim, full_matrices=False)
 70 | 
 71 |     ## Truncate tiny singular values for speed
 72 |     origsize = S.shape[0]
 73 |     ngoodS = np.sum(S>singcutoff)
 74 |     nbad = origsize-ngoodS
 75 |     U = U[:,:ngoodS]
 76 |     S = S[:ngoodS]
 77 |     Vh = Vh[:ngoodS]
 78 |     logger.info("Dropped %d tiny singular values.. (U is now %s)"%(nbad, str(U.shape)))
 79 | 
 80 |     ## Normalize alpha by the Frobenius norm
 81 |     #frob = np.sqrt((S**2).sum()) ## Frobenius!
 82 |     frob = S[0]
 83 |     #frob = S.sum()
 84 |     logger.info("Training stimulus has Frobenius norm: %0.03f"%frob)
 85 |     if normalpha:
 86 |         nalphas = alphas * frob
 87 |     else:
 88 |         nalphas = alphas
 89 | 
 90 |     ## Precompute some products for speed
 91 |     UR = np.dot(U.T, Rresp) ## Precompute this matrix product for speed
 92 |     PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed
 93 |     
 94 |     #Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms
 95 |     zPresp = zs(Presp)
 96 |     Prespvar = Presp.var(0)
 97 |     Rcorrs = [] ## Holds training correlations for each alpha
 98 |     for na, a in zip(nalphas, alphas):
 99 |         #D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter 
100 |         D = S/(S**2+na**2) ## Reweight singular vectors by the (normalized?) ridge parameter
101 |         
102 |         pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test)
103 |         # pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test)
104 |         
105 |         # pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test)
106 |         # pred = np.dot(pvhd, UR)
107 |         
108 |         # wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test)
109 |         # wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst
110 |         # pred = np.dot(Pstim, wt) ## Predict test responses
111 | 
112 |         if use_corr:
113 |             #prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms
114 |             #Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations
115 |             #Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations
116 |             Rcorr = (zPresp*zs(pred)).mean(0)
117 |         else:
118 |             ## Compute variance explained
119 |             resvar = (Presp-pred).var(0)
120 |             Rcorr = np.clip(1-(resvar/Prespvar), 0, 1)
121 |             
122 |         Rcorr[np.isnan(Rcorr)] = 0
123 |         Rcorrs.append(Rcorr)
124 |         
125 |         log_template = "Training: alpha=%0.3f, mean corr=%0.5f, max corr=%0.5f, over-under(%0.2f)=%d"
126 |         log_msg = log_template % (a,
127 |                                   np.mean(Rcorr),
128 |                                   np.max(Rcorr),
129 |                                   corrmin,
130 |                                   (Rcorr>corrmin).sum()-(-Rcorr>corrmin).sum())
131 |         if logger is not None:
132 |             logger.info(log_msg)
133 |         else:
134 |             print (log_msg)
135 |     
136 |     return Rcorrs
137 | 
138 | 
139 | def bootstrap_ridge(Rstim, Rresp, Pstim, Presp, alphas, nboots, chunklen, nchunks, dtype=np.single,
140 |                     corrmin=0.2, joined=None, singcutoff=1e-10, normalpha=False, single_alpha=False,
141 |                     use_corr=True, logger=logging.getLogger("ridge_corr")):
142 |     """Uses ridge regression with a bootstrapped held-out set to get optimal alpha values for each response.
143 |     [nchunks] random chunks of length [chunklen] will be taken from [Rstim] and [Rresp] for each regression
144 |     run.  [nboots] total regression runs will be performed.  The best alpha value for each response will be
145 |     averaged across the bootstraps to estimate the best alpha for that response.
146 |     
147 |     If [joined] is given, it should be a list of lists where the STRFs for all the voxels in each sublist 
148 |     will be given the same regularization parameter (the one that is the best on average).
149 |     
150 |     Parameters
151 |     ----------
152 |     Rstim : array_like, shape (TR, N)
153 |         Training stimuli with TR time points and N features. Each feature should be Z-scored across time.
154 |     Rresp : array_like, shape (TR, M)
155 |         Training responses with TR time points and M different responses (voxels, neurons, what-have-you).
156 |         Each response should be Z-scored across time.
157 |     Pstim : array_like, shape (TP, N)
158 |         Test stimuli with TP time points and N features. Each feature should be Z-scored across time.
159 |     Presp : array_like, shape (TP, M)
160 |         Test responses with TP time points and M different responses. Each response should be Z-scored across
161 |         time.
162 |     alphas : list or array_like, shape (A,)
163 |         Ridge parameters that will be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well.
164 |     nboots : int
165 |         The number of bootstrap samples to run. 15 to 30 works well.
166 |     chunklen : int
167 |         On each sample, the training data is broken into chunks of this length. This should be a few times 
168 |         longer than your delay/STRF. e.g. for a STRF with 3 delays, I use chunks of length 10.
169 |     nchunks : int
170 |         The number of training chunks held out to test ridge parameters for each bootstrap sample. The product
171 |         of nchunks and chunklen is the total number of training samples held out for each sample, and this 
172 |         product should be about 20 percent of the total length of the training data.
173 |     dtype : np.dtype
174 |         All data will be cast as this dtype for computation. np.single is used by default for memory efficiency,
175 |         as using np.double will thrash most machines on a big problem. If you want to do regression on 
176 |         complex variables, this should be changed to np.complex128.
177 |     corrmin : float in [0..1]
178 |         Purely for display purposes. After each alpha is tested for each bootstrap sample, the number of 
179 |         responses with correlation greater than this value will be printed. For long-running regressions this
180 |         can give a rough sense of how well the model works before it's done.
181 |     joined : None or list of array_like indices
182 |         If you want the STRFs for two (or more) responses to be directly comparable, you need to ensure that
183 |         the regularization parameter that they use is the same. To do that, supply a list of the response sets
184 |         that should use the same ridge parameter here. For example, if you have four responses, joined could
185 |         be [np.array([0,1]), np.array([2,3])], in which case responses 0 and 1 will use the same ridge parameter
186 |         (which will be parameter that is best on average for those two), and likewise for responses 2 and 3.
187 |     singcutoff : float
188 |         The first step in ridge regression is computing the singular value decomposition (SVD) of the
189 |         stimulus Rstim. If Rstim is not full rank, some singular values will be approximately equal
190 |         to zero and the corresponding singular vectors will be noise. These singular values/vectors
191 |         should be removed both for speed (the fewer multiplications the better!) and accuracy. Any
192 |         singular values less than singcutoff will be removed.
193 |     normalpha : boolean
194 |         Whether ridge parameters (alphas) should be normalized by the Frobenius norm of Rstim. Good for rigorously
195 |         comparing models with different numbers of parameters.
196 |     single_alpha : boolean
197 |         Whether to use a single alpha for all responses. Good for identification/decoding.
198 |     use_corr : boolean
199 |         If True, this function will use correlation as its metric of model fit. If False, this function
200 |         will instead use variance explained (R-squared) as its metric of model fit. For ridge regression
201 |         this can make a big difference -- highly regularized solutions will have very small norms and
202 |         will thus explain very little variance while still leading to high correlations, as correlation
203 |         is scale-free while R**2 is not.
204 |     
205 |     Returns
206 |     -------
207 |     wt : array_like, shape (N, M)
208 |         Regression weights for N features and M responses.
209 |     corrs : array_like, shape (M,)
210 |         Validation set correlations. Predicted responses for the validation set are obtained using the regression
211 |         weights: pred = np.dot(Pstim, wt), and then the correlation between each predicted response and each 
212 |         column in Presp is found.
213 |     alphas : array_like, shape (M,)
214 |         The regularization coefficient (alpha) selected for each voxel using bootstrap cross-validation.
215 |     bootstrap_corrs : array_like, shape (A, M, B)
216 |         Correlation between predicted and actual responses on randomly held out portions of the training set,
217 |         for each of A alphas, M voxels, and B bootstrap samples.
218 |     valinds : array_like, shape (TH, B)
219 |         The indices of the training data that were used as "validation" for each bootstrap sample.
220 |     """
221 |     nresp, nvox = Rresp.shape
222 |     bestalphas = np.zeros((nboots, nvox))  ## Will hold the best alphas for each voxel
223 |     valinds = [] ## Will hold the indices into the validation data for each bootstrap
224 |     
225 |     Rcmats = []
226 |     for bi in counter(range(nboots), countevery=1, total=nboots):
227 |         logger.info("Selecting held-out test set..")
228 |         allinds = range(nresp)
229 |         indchunks = list(zip(*[iter(allinds)]*chunklen))
230 |         random.shuffle(indchunks)
231 |         heldinds = list(itools.chain(*indchunks[:nchunks]))
232 |         notheldinds = list(set(allinds)-set(heldinds))
233 |         valinds.append(heldinds)
234 |         
235 |         RRstim = Rstim[notheldinds,:]
236 |         PRstim = Rstim[heldinds,:]
237 |         RRresp = Rresp[notheldinds,:]
238 |         PRresp = Rresp[heldinds,:]
239 |         
240 |         ## Run ridge regression using this test set
241 |         Rcmat = ridge_corr(RRstim, PRstim, RRresp, PRresp, alphas,
242 |                            dtype=dtype, corrmin=corrmin, singcutoff=singcutoff,
243 |                            normalpha=normalpha, use_corr=use_corr)
244 |         
245 |         Rcmats.append(Rcmat)
246 |     
247 |     ## Find weights for each voxel
248 |     try:
249 |         U,S,Vh = np.linalg.svd(Rstim, full_matrices=False)
250 |     except np.linalg.LinAlgError as e:
251 |         logger.info("NORMAL SVD FAILED, trying more robust dgesvd..")
252 |         from text.regression.svd_dgesvd import svd_dgesvd
253 |         U,S,Vh = svd_dgesvd(Rstim, full_matrices=False)
254 | 
255 |     ## Normalize alpha by the Frobenius norm
256 |     #frob = np.sqrt((S**2).sum()) ## Frobenius!
257 |     frob = S[0]
258 |     #frob = S.sum()
259 |     logger.info("Total training stimulus has Frobenius norm: %0.03f"%frob)
260 |     if normalpha:
261 |         nalphas = alphas * frob
262 |     else:
263 |         nalphas = alphas
264 | 
265 |     allRcorrs = np.dstack(Rcmats)
266 |     if not single_alpha:
267 |         logger.info("Finding best alpha for each response..")
268 |         if joined is None:
269 |             ## Find best alpha for each voxel
270 |             meanbootcorrs = allRcorrs.mean(2)
271 |             bestalphainds = np.argmax(meanbootcorrs, 0)
272 |             valphas = nalphas[bestalphainds]
273 |         else:
274 |             ## Find best alpha for each group of voxels
275 |             valphas = np.zeros((nvox,))
276 |             for jl in joined:
277 |                 jcorrs = allRcorrs[:,jl,:].mean(1).mean(1) ## Mean across voxels in the set, then mean across bootstraps
278 |                 bestalpha = np.argmax(jcorrs)
279 |                 valphas[jl] = nalphas[bestalpha]
280 |     else:
281 |         logger.info("Finding single best alpha..")
282 |         meanbootcorr = allRcorrs.mean(2).mean(1)
283 |         bestalphaind = np.argmax(meanbootcorr)
284 |         bestalpha = alphas[bestalphaind]
285 |         valphas = np.array([bestalpha]*nvox)
286 |         logger.info("Best alpha = %0.3f"%bestalpha)
287 | 
288 |     logger.info("Computing weights for each response using entire training set..")
289 |     UR = np.dot(U.T, np.nan_to_num(Rresp))
290 |     pred = np.zeros(Presp.shape)
291 |     wt = np.zeros((Rstim.shape[1], Rresp.shape[1]))
292 |     for ai,alpha in enumerate(nalphas):
293 |         selvox = np.nonzero(valphas==alpha)[0]
294 |         awt = reduce(np.dot, [Vh.T, np.diag(S/(S**2+alpha**2)), UR[:,selvox]])
295 |         pred[:,selvox] = np.dot(Pstim, awt)
296 |         wt[:,selvox] = awt
297 | 
298 |     ## Find test correlations
299 |     nnpred = np.nan_to_num(pred)
300 |     corrs = np.nan_to_num(np.array([np.corrcoef(Presp[:,ii], nnpred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]))
301 | 
302 |     return wt, corrs, valphas, allRcorrs, valinds
303 | 


--------------------------------------------------------------------------------
/stimulus_utils.py:
--------------------------------------------------------------------------------
  1 | from textgrid import TextGrid
  2 | import os
  3 | import numpy as np
  4 | from collections import defaultdict
  5 | 
  6 | def load_grid(story, grid_dir="data/grids"):
  7 |     """Loads the TextGrid for the given [story] from the directory [grid_dir].
  8 |     The first file that starts with [story] will be loaded, so if there are
  9 |     multiple versions of a grid for a story, beward.
 10 |     """
 11 |     gridfile = [os.path.join(grid_dir, gf) for gf in os.listdir(grid_dir) if gf.startswith(story)][0]
 12 |     return TextGrid(open(gridfile).read())
 13 | 
 14 | def load_grids_for_stories(stories):
 15 |     """Loads grids for the given [stories], puts them in a dictionary.
 16 |     """
 17 |     return dict([(st, load_grid(st)) for st in stories])
 18 | 
 19 | def load_5tier_grids_for_stories(stories, rootdir):
 20 |     grids = dict()
 21 |     for story in stories:
 22 |         storydir = os.path.join(rootdir, [sd for sd in os.listdir(rootdir) if sd.startswith(story)][0])
 23 |         storyfile = os.path.join(storydir, [sf for sf in os.listdir(storydir) if sf.endswith("TextGrid")][0])
 24 |         grids[story] = TextGrid(open(storyfile).read())
 25 |     return grids
 26 | 
 27 | 
 28 | class TRFile(object):
 29 |     def __init__(self, trfilename, expectedtr=2.0045):
 30 |         """Loads data from [trfilename], should be output from stimulus presentation code.
 31 |         """
 32 |         self.trtimes = []
 33 |         self.soundstarttime = -1
 34 |         self.soundstoptime = -1
 35 |         self.otherlabels = []
 36 |         self.expectedtr = expectedtr
 37 |         
 38 |         if trfilename is not None:
 39 |             self.load_from_file(trfilename)
 40 |         
 41 | 
 42 |     def load_from_file(self, trfilename):
 43 |         """Loads TR data from report with given [trfilename].
 44 |         """
 45 |         ## Read the report file and populate the datastructure
 46 |         for ll in open(trfilename):
 47 |             timestr = ll.split()[0]
 48 |             label = " ".join(ll.split()[1:])
 49 |             time = float(timestr)
 50 | 
 51 |             if label in ("init-trigger", "trigger"):
 52 |                 self.trtimes.append(time)
 53 | 
 54 |             elif label=="sound-start":
 55 |                 self.soundstarttime = time
 56 | 
 57 |             elif label=="sound-stop":
 58 |                 self.soundstoptime = time
 59 | 
 60 |             else:
 61 |                 self.otherlabels.append((time, label))
 62 |         
 63 |         ## Fix weird TR times
 64 |         itrtimes = np.diff(self.trtimes)
 65 |         badtrtimes = np.nonzero(itrtimes>(itrtimes.mean()*1.5))[0]
 66 |         newtrs = []
 67 |         for btr in badtrtimes:
 68 |             ## Insert new TR where it was missing..
 69 |             newtrtime = self.trtimes[btr]+self.expectedtr
 70 |             newtrs.append((newtrtime,btr))
 71 | 
 72 |         for ntr,btr in newtrs:
 73 |             self.trtimes.insert(btr+1, ntr)
 74 | 
 75 |     def simulate(self, ntrs):
 76 |         """Simulates [ntrs] TRs that occur at the expected TR.
 77 |         """
 78 |         self.trtimes = list(np.arange(ntrs)*self.expectedtr)
 79 |     
 80 |     def get_reltriggertimes(self):
 81 |         """Returns the times of all trigger events relative to the sound.
 82 |         """
 83 |         return np.array(self.trtimes)-self.soundstarttime
 84 | 
 85 |     @property
 86 |     def avgtr(self):
 87 |         """Returns the average TR for this run.
 88 |         """
 89 |         return np.diff(self.trtimes).mean()
 90 | 
 91 | def load_generic_trfiles(stories, root="data/trfiles"):
 92 |     """Loads a dictionary of generic TRFiles (i.e. not specifically from the session
 93 |     in which the data was collected.. this should be fine) for the given stories.
 94 |     """
 95 |     trdict = dict()
 96 | 
 97 |     for story in stories:
 98 |         try:
 99 |             trf = TRFile(os.path.join(root, "%s.report"%story))
100 |             trdict[story] = [trf]
101 |         except Exception as e:
102 |             print (e)
103 |     
104 |     return trdict
105 | 


--------------------------------------------------------------------------------
/textgrid.py:
--------------------------------------------------------------------------------
  1 | # Natural Language Toolkit: TextGrid analysis
  2 | #
  3 | # Copyright (C) 2001-2011 NLTK Project
  4 | # Author: Margaret Mitchell <itallow@gmail.com>
  5 | #         Steven Bird <sb@csse.unimelb.edu.au> (revisions)
  6 | # URL: <http://www.nltk.org>
  7 | # For license information, see LICENSE.TXT
  8 | #
  9 | 
 10 | """
 11 | Tools for reading TextGrid files, the format used by Praat.
 12 | 
 13 | Module contents
 14 | ===============
 15 | 
 16 | The textgrid corpus reader provides 4 data items and 1 function
 17 | for each textgrid file.  For each tier in the file, the reader
 18 | provides 10 data items and 2 functions.
 19 |  
 20 | For the full textgrid file: 
 21 | 
 22 |   - size
 23 |     The number of tiers in the file.
 24 | 
 25 |   - xmin
 26 |     First marked time of the file.
 27 | 
 28 |   - xmax
 29 |     Last marked time of the file.
 30 | 
 31 |   - t_time
 32 |     xmax - xmin.
 33 | 
 34 |   - text_type
 35 |     The style of TextGrid format:
 36 |         - ooTextFile:  Organized by tier.
 37 |         - ChronTextFile:  Organized by time.
 38 |         - OldooTextFile:  Similar to ooTextFile.
 39 | 
 40 |   - to_chron()
 41 |     Convert given file to a ChronTextFile format.
 42 | 
 43 |   - to_oo()
 44 |     Convert given file to an ooTextFile format.
 45 | 
 46 | For each tier:
 47 | 
 48 |   - text_type
 49 |     The style of TextGrid format, as above.
 50 | 
 51 |   - classid
 52 |     The style of transcription on this tier:
 53 |         - IntervalTier:  Transcription is marked as intervals.
 54 |         - TextTier:  Transcription is marked as single points.
 55 | 
 56 |   - nameid
 57 |     The name of the tier.
 58 | 
 59 |   - xmin
 60 |     First marked time of the tier.
 61 | 
 62 |   - xmax
 63 |     Last marked time of the tier.
 64 | 
 65 |   - size
 66 |     Number of entries in the tier.
 67 | 
 68 |   - transcript
 69 |     The raw transcript for the tier.
 70 | 
 71 |   - simple_transcript
 72 |     The transcript formatted as a list of tuples: (time1, time2, utterance).
 73 | 
 74 |   - tier_info
 75 |     List of (classid, nameid, xmin, xmax, size, transcript).
 76 | 
 77 |   - min_max()
 78 |     A tuple of (xmin, xmax).  
 79 | 
 80 |   - time(non_speech_marker)
 81 |     Returns the utterance time of a given tier.
 82 |     Excludes entries that begin with a non-speech marker.
 83 | 
 84 | """
 85 | 
 86 | # needs more cleanup, subclassing, epydoc docstrings
 87 | 
 88 | import sys
 89 | import re
 90 | 
 91 | TEXTTIER = "TextTier"
 92 | INTERVALTIER = "IntervalTier"
 93 | 
 94 | OOTEXTFILE = re.compile(r"""(?x)
 95 |             xmin\ =\ (.*)[\r\n]+
 96 |             xmax\ =\ (.*)[\r\n]+
 97 |             [\s\S]+?size\ =\ (.*)[\r\n]+ 
 98 | """)
 99 | 
100 | CHRONTEXTFILE = re.compile(r"""(?x)
101 |             [\r\n]+(\S+)\ 
102 |             (\S+)\ +!\ Time\ domain.\ *[\r\n]+
103 |             (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"
104 | """)
105 | 
106 | OLDOOTEXTFILE = re.compile(r"""(?x)
107 |             [\r\n]+(\S+)
108 |             [\r\n]+(\S+)
109 |             [\r\n]+.+[\r\n]+(\S+)
110 | """)
111 | 
112 | 
113 | 
114 | #################################################################
115 | # TextGrid Class
116 | #################################################################
117 | 
118 | class TextGrid(object):
119 |     """
120 |     Class to manipulate the TextGrid format used by Praat.
121 |     Separates each tier within this file into its own Tier
122 |     object.  Each TextGrid object has
123 |     a number of tiers (size), xmin, xmax, a text type to help
124 |     with the different styles of TextGrid format, and tiers with their
125 |     own attributes.
126 |     """
127 | 
128 |     def __init__(self, read_file):
129 |         """
130 |         Takes open read file as input, initializes attributes 
131 |         of the TextGrid file.
132 |         @type read_file: An open TextGrid file, mode "r".
133 |         @param size:  Number of tiers.
134 |         @param xmin: xmin.
135 |         @param xmax: xmax.
136 |         @param t_time:  Total time of TextGrid file.
137 |         @param text_type:  TextGrid format.
138 |         @type tiers:  A list of tier objects.
139 |         """
140 | 
141 |         self.read_file = read_file
142 |         self.size = 0
143 |         self.xmin = 0
144 |         self.xmax = 0
145 |         self.t_time = 0
146 |         self.text_type = self._check_type()
147 |         self.tiers = self._find_tiers()
148 | 
149 |     def __iter__(self):
150 |         for tier in self.tiers:
151 |             yield tier
152 | 
153 |     def next(self):
154 |         if self.idx == (self.size - 1):
155 |             raise StopIteration
156 |         self.idx += 1
157 |         return self.tiers[self.idx]
158 | 
159 |     @staticmethod
160 |     def load(file):
161 |         """
162 |         @param file: a file in TextGrid format
163 |         """
164 | 
165 |         return TextGrid(open(file).read())
166 | 
167 |     def _load_tiers(self, header):
168 |         """
169 |         Iterates over each tier and grabs tier information.
170 |         """ 
171 | 
172 |         tiers = []
173 |         if self.text_type == "ChronTextFile":
174 |             m = re.compile(header)
175 |             tier_headers = m.findall(self.read_file)
176 |             tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\""
177 |             for i in range(0, self.size):
178 |                 tier_info = [tier_headers[i]] + \
179 |                 re.findall(str(i + 1) + tier_re, self.read_file)
180 |                 tier_info = "\n".join(tier_info)
181 |                 tiers.append(Tier(tier_info, self.text_type, self.t_time))
182 |             return tiers
183 | 
184 |         tier_re = header + "[\s\S]+?(?=" + header + "|$$)"
185 |         m = re.compile(tier_re)
186 |         tier_iter = m.finditer(self.read_file)
187 |         for iterator in tier_iter:
188 |             (begin, end) = iterator.span()
189 |             tier_info = self.read_file[begin:end]
190 |             tiers.append(Tier(tier_info, self.text_type, self.t_time))
191 |         return tiers
192 |     
193 |     def _check_type(self):
194 |         """
195 |         Figures out the TextGrid format.
196 |         """
197 | 
198 |         m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
199 |         try:
200 |             type_id = m.group(1).strip()
201 |         except AttributeError:
202 |             raise TypeError("Cannot read file -- try TextGrid.load()")
203 |         xmin = m.group(4)
204 |         if type_id == "File type = \"ooTextFile\"":
205 |             if "xmin" not in xmin:
206 |                 text_type = "OldooTextFile"
207 |             else:
208 |                 text_type = "ooTextFile"
209 |         elif type_id == "\"Praat chronological TextGrid text file\"":
210 |             text_type = "ChronTextFile"
211 |         else: 
212 |             raise TypeError("Unknown format '(%s)'", (type_id))
213 |         return text_type
214 |         
215 |     def _find_tiers(self):
216 |         """
217 |         Splits the textgrid file into substrings corresponding to tiers. 
218 |         """
219 | 
220 |         if self.text_type == "ooTextFile":
221 |             m = OOTEXTFILE
222 |             header = " +item \["
223 |         elif self.text_type == "ChronTextFile":
224 |             m = CHRONTEXTFILE
225 |             header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*"
226 |         elif self.text_type == "OldooTextFile":
227 |             m = OLDOOTEXTFILE
228 |             header = "\".*\"[\r\n]+\".*\""
229 | 
230 |         file_info = m.findall(self.read_file)[0]
231 |         self.xmin = float(file_info[0])
232 |         self.xmax = float(file_info[1])
233 |         self.t_time = self.xmax - self.xmin
234 |         self.size = int(file_info[2])
235 |         tiers = self._load_tiers(header)
236 |         return tiers
237 | 
238 |     def to_chron(self):
239 |         """ 
240 |         @return:  String in Chronological TextGrid file format.
241 |         """
242 | 
243 |         chron_file = ""
244 |         chron_file += "\"Praat chronological TextGrid text file\"\n"
245 |         chron_file += str(self.xmin) + " " + str(self.xmax)
246 |         chron_file += "   ! Time domain.\n"
247 |         chron_file += str(self.size) + "   ! Number of tiers.\n"
248 |         for tier in self.tiers:
249 |             idx = (self.tiers.index(tier)) + 1
250 |             tier_header = "\"" + tier.classid + "\" \"" \
251 |                           + tier.nameid + "\" " + str(tier.xmin) \
252 |                           + " " + str(tier.xmax)
253 |             chron_file += tier_header + "\n"
254 |             transcript = tier.simple_transcript
255 |             for (xmin, xmax, utt) in transcript:
256 |                 chron_file += str(idx) + " " + str(xmin) 
257 |                 chron_file += " " + str(xmax) +"\n"
258 |                 chron_file += "\"" + utt + "\"\n"
259 |         return chron_file
260 | 
261 |     def to_oo(self):
262 |         """ 
263 |         @return:  A string in OoTextGrid file format.
264 |         """
265 |    
266 |         oo_file = ""
267 |         oo_file += "File type = \"ooTextFile\"\n"
268 |         oo_file += "Object class = \"TextGrid\"\n\n"
269 |         oo_file += "xmin = ", self.xmin, "\n"
270 |         oo_file += "xmax = ", self.xmax, "\n"
271 |         oo_file += "tiers? <exists>\n"
272 |         oo_file += "size = ", self.size, "\n"
273 |         oo_file += "item []:\n"
274 |         for i in range(len(self.tiers)):
275 |             oo_file += "%4s%s [%s]" % ("", "item", i + 1)
276 |             _curr_tier = self.tiers[i]
277 |             for (x, y) in _curr_tier.header:
278 |                 oo_file += "%8s%s = \"%s\"" % ("", x, y)
279 |             if _curr_tier.classid != TEXTTIER:
280 |                 for (xmin, xmax, text) in _curr_tier.simple_transcript:
281 |                     oo_file += "%12s%s = %s" % ("", "xmin", xmin)
282 |                     oo_file += "%12s%s = %s" % ("", "xmax", xmax)
283 |                     oo_file += "%12s%s = \"%s\"" % ("", "text", text)
284 |             else:
285 |                 for (time, mark) in _curr_tier.simple_transcript:
286 |                     oo_file += "%12s%s = %s" % ("", "time", time)
287 |                     oo_file += "%12s%s = %s" % ("", "mark", mark)
288 |         return oo_file
289 | 
290 | 
291 | #################################################################
292 | # Tier Class
293 | #################################################################
294 | 
295 | class Tier(object):
296 |     """ 
297 |     A container for each tier.
298 |     """
299 | 
300 |     def __init__(self, tier, text_type, t_time):
301 |         """
302 |         Initializes attributes of the tier: class, name, xmin, xmax
303 |         size, transcript, total time.  
304 |         Utilizes text_type to guide how to parse the file.
305 |         @type tier: a tier object; single item in the TextGrid list.
306 |         @param text_type:  TextGrid format
307 |         @param t_time:  Total time of TextGrid file.
308 |         @param classid:  Type of tier (point or interval).
309 |         @param nameid:  Name of tier.
310 |         @param xmin:  xmin of the tier.
311 |         @param xmax:  xmax of the tier.
312 |         @param size:  Number of entries in the tier
313 |         @param transcript:  The raw transcript for the tier.
314 |         """
315 | 
316 |         self.tier = tier
317 |         self.text_type = text_type
318 |         self.t_time = t_time
319 |         self.classid = ""
320 |         self.nameid = ""
321 |         self.xmin = 0
322 |         self.xmax = 0
323 |         self.size = 0
324 |         self.transcript = ""
325 |         self.tier_info = ""
326 |         self._make_info()
327 |         self.simple_transcript = self.make_simple_transcript()
328 |         if self.classid != TEXTTIER:
329 |             self.mark_type = "intervals"
330 |         else:
331 |             self.mark_type = "points"
332 |             self.header = [("class", self.classid), ("name", self.nameid), \
333 |             ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)]
334 | 
335 |     def __iter__(self):
336 |         return self
337 |   
338 |     def _make_info(self):
339 |         """
340 |         Figures out most attributes of the tier object:
341 |         class, name, xmin, xmax, transcript.
342 |         """
343 | 
344 |         trans = "([\S\s]*)"
345 |         if self.text_type == "ChronTextFile":
346 |             classid = "\"(.*)\" +"
347 |             nameid = "\"(.*)\" +"
348 |             xmin = "(\d+\.?\d*) +"
349 |             xmax = "(\d+\.?\d*) *[\r\n]+"
350 |             # No size values are given in the Chronological Text File format.
351 |             self.size = None
352 |             size = ""
353 |         elif self.text_type == "ooTextFile":
354 |             classid = " +class = \"(.*)\" *[\r\n]+"
355 |             nameid = " +name = \"(.*)\" *[\r\n]+"
356 |             xmin = " +xmin = (\d+\.?\d*) *[\r\n]+"
357 |             xmax = " +xmax = (\d+\.?\d*) *[\r\n]+"
358 |             size = " +\S+: size = (\d+) *[\r\n]+"
359 |         elif self.text_type == "OldooTextFile":
360 |             classid = "\"(.*)\" *[\r\n]+"
361 |             nameid = "\"(.*)\" *[\r\n]+"
362 |             xmin = "(\d+\.?\d*) *[\r\n]+"
363 |             xmax = "(\d+\.?\d*) *[\r\n]+"
364 |             size = "(\d+) *[\r\n]+"
365 |         m = re.compile(classid + nameid + xmin + xmax + size + trans)
366 |         self.tier_info = m.findall(self.tier)[0]
367 |         self.classid = self.tier_info[0]
368 |         self.nameid = self.tier_info[1]
369 |         self.xmin = float(self.tier_info[2])
370 |         self.xmax = float(self.tier_info[3])
371 |         if self.size != None:
372 |             self.size = int(self.tier_info[4])
373 |         self.transcript = self.tier_info[-1]
374 |             
375 |     def make_simple_transcript(self):
376 |         """ 
377 |         @return:  Transcript of the tier, in form [(start_time end_time label)]
378 |         """
379 | 
380 |         if self.text_type == "ChronTextFile":
381 |             trans_head = ""
382 |             trans_xmin = " (\S+)"
383 |             trans_xmax = " (\S+)[\r\n]+"
384 |             trans_text = "\"([\S\s]*?)\""
385 |         elif self.text_type == "ooTextFile":
386 |             trans_head = " +\S+ \[\d+\]: *[\r\n]+"
387 |             trans_xmin = " +\S+ = (\S+) *[\r\n]+"
388 |             trans_xmax = " +\S+ = (\S+) *[\r\n]+"
389 |             trans_text = " +\S+ = \"([^\"]*?)\""    
390 |         elif self.text_type == "OldooTextFile":
391 |             trans_head = ""
392 |             trans_xmin = "(.*)[\r\n]+"
393 |             trans_xmax = "(.*)[\r\n]+"
394 |             trans_text = "\"([\S\s]*?)\""
395 |         if self.classid == TEXTTIER:
396 |             trans_xmin = ""
397 |         trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
398 |         self.simple_transcript = trans_m.findall(self.transcript)
399 |         return self.simple_transcript
400 | 
401 |     def transcript(self):
402 |         """
403 |         @return:  Transcript of the tier, as it appears in the file.
404 |         """
405 |        
406 |         return self.transcript
407 | 
408 |     def time(self, non_speech_char="."):
409 |         """
410 |         @return: Utterance time of a given tier.
411 |         Screens out entries that begin with a non-speech marker.        
412 |         """
413 | 
414 |         total = 0.0
415 |         if self.classid != TEXTTIER:
416 |             for (time1, time2, utt) in self.simple_transcript:
417 |                 utt = utt.strip()
418 |                 if utt and not utt[0] == ".":
419 |                     total += (float(time2) - float(time1))
420 |         return total
421 |                     
422 |     def tier_name(self):
423 |         """
424 |         @return:  Tier name of a given tier.
425 |         """
426 | 
427 |         return self.nameid
428 | 
429 |     def classid(self):
430 |         """
431 |         @return:  Type of transcription on tier.
432 |         """
433 | 
434 |         return self.classid
435 | 
436 |     def min_max(self):
437 |         """
438 |         @return:  (xmin, xmax) tuple for a given tier.
439 |         """
440 | 
441 |         return (self.xmin, self.xmax)
442 | 
443 |     def __repr__(self):
444 |         return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time)
445 | 
446 |     def __str__(self):
447 |         return self.__repr__() + "\n  " + "\n  ".join(" ".join(row) for row in self.simple_transcript)
448 | 
449 | def demo_TextGrid(demo_data):
450 |     print ("** Demo of the TextGrid class. **")
451 | 
452 |     fid = TextGrid(demo_data)
453 |     print ("Tiers:", fid.size)
454 | 
455 |     for i, tier in enumerate(fid):
456 |         print ("\n***")
457 |         print ("Tier:", i + 1)
458 |         print (tier)
459 | 
460 | def demo():
461 |     # Each demo demonstrates different TextGrid formats.
462 |     print ("Format 1")
463 |     demo_TextGrid(demo_data1)
464 |     print ("\nFormat 2")
465 |     demo_TextGrid(demo_data2)
466 |     print ("\nFormat 3")
467 |     demo_TextGrid(demo_data3)
468 | 
469 | 
470 | demo_data1 = """File type = "ooTextFile"
471 | Object class = "TextGrid"
472 | 
473 | xmin = 0 
474 | xmax = 2045.144149659864
475 | tiers? <exists> 
476 | size = 3 
477 | item []: 
478 |     item [1]:
479 |         class = "IntervalTier" 
480 |         name = "utterances" 
481 |         xmin = 0 
482 |         xmax = 2045.144149659864 
483 |         intervals: size = 5 
484 |         intervals [1]:
485 |             xmin = 0 
486 |             xmax = 2041.4217474125382 
487 |             text = "" 
488 |         intervals [2]:
489 |             xmin = 2041.4217474125382 
490 |             xmax = 2041.968276643991 
491 |             text = "this" 
492 |         intervals [3]:
493 |             xmin = 2041.968276643991 
494 |             xmax = 2042.5281632653062 
495 |             text = "is" 
496 |         intervals [4]:
497 |             xmin = 2042.5281632653062 
498 |             xmax = 2044.0487352585324 
499 |             text = "a" 
500 |         intervals [5]:
501 |             xmin = 2044.0487352585324 
502 |             xmax = 2045.144149659864 
503 |             text = "demo" 
504 |     item [2]:
505 |         class = "TextTier" 
506 |         name = "notes" 
507 |         xmin = 0 
508 |         xmax = 2045.144149659864 
509 |         points: size = 3 
510 |         points [1]:
511 |             time = 2041.4217474125382 
512 |             mark = ".begin_demo"
513 |         points [2]:
514 |             time = 2043.8338291031832
515 |             mark = "voice gets quiet here" 
516 |         points [3]:
517 |             time = 2045.144149659864
518 |             mark = ".end_demo" 
519 |     item [3]:
520 |         class = "IntervalTier" 
521 |         name = "phones" 
522 |         xmin = 0 
523 |         xmax = 2045.144149659864
524 |         intervals: size = 12
525 |         intervals [1]:
526 |             xmin = 0 
527 |             xmax = 2041.4217474125382 
528 |             text = "" 
529 |         intervals [2]:
530 |             xmin = 2041.4217474125382 
531 |             xmax = 2041.5438290324326 
532 |             text = "D"
533 |         intervals [3]:
534 |             xmin = 2041.5438290324326
535 |             xmax = 2041.7321032910372
536 |             text = "I"
537 |         intervals [4]:
538 |             xmin = 2041.7321032910372            
539 |             xmax = 2041.968276643991 
540 |             text = "s" 
541 |         intervals [5]:
542 |             xmin = 2041.968276643991 
543 |             xmax = 2042.232189031843
544 |             text = "I"
545 |         intervals [6]:
546 |             xmin = 2042.232189031843
547 |             xmax = 2042.5281632653062 
548 |             text = "z" 
549 |         intervals [7]:
550 |             xmin = 2042.5281632653062 
551 |             xmax = 2044.0487352585324 
552 |             text = "eI" 
553 |         intervals [8]:
554 |             xmin = 2044.0487352585324 
555 |             xmax = 2044.2487352585324
556 |             text = "dc"
557 |         intervals [9]:
558 |             xmin = 2044.2487352585324
559 |             xmax = 2044.3102321849011
560 |             text = "d"
561 |         intervals [10]:
562 |             xmin = 2044.3102321849011
563 |             xmax = 2044.5748932104329
564 |             text = "E"
565 |         intervals [11]:
566 |             xmin = 2044.5748932104329
567 |             xmax = 2044.8329108578437
568 |             text = "m"
569 |         intervals [12]:
570 |             xmin = 2044.8329108578437
571 |             xmax = 2045.144149659864 
572 |             text = "oU" 
573 | """
574 | 
575 | demo_data2 = """File type = "ooTextFile"
576 | Object class = "TextGrid"
577 | 
578 | 0
579 | 2.8
580 | <exists>
581 | 2
582 | "IntervalTier"
583 | "utterances"
584 | 0
585 | 2.8
586 | 3
587 | 0
588 | 1.6229213249309031
589 | ""
590 | 1.6229213249309031
591 | 2.341428074708195
592 | "demo"
593 | 2.341428074708195
594 | 2.8
595 | ""
596 | "IntervalTier"
597 | "phones"
598 | 0
599 | 2.8
600 | 6
601 | 0
602 | 1.6229213249309031
603 | ""
604 | 1.6229213249309031
605 | 1.6428291382019483
606 | "dc"
607 | 1.6428291382019483
608 | 1.65372183721983721
609 | "d"
610 | 1.65372183721983721
611 | 1.94372874328943728
612 | "E"
613 | 1.94372874328943728
614 | 2.13821938291038210
615 | "m"
616 | 2.13821938291038210
617 | 2.341428074708195
618 | "oU"
619 | 2.341428074708195
620 | 2.8
621 | ""
622 | """
623 | 
624 | demo_data3 = """"Praat chronological TextGrid text file"
625 | 0 2.8   ! Time domain.
626 | 2   ! Number of tiers.
627 | "IntervalTier" "utterances" 0 2.8
628 | "IntervalTier" "utterances" 0 2.8
629 | 1 0 1.6229213249309031
630 | ""
631 | 2 0 1.6229213249309031
632 | ""
633 | 2 1.6229213249309031 1.6428291382019483
634 | "dc"
635 | 2 1.6428291382019483 1.65372183721983721
636 | "d"
637 | 2 1.65372183721983721 1.94372874328943728
638 | "E"
639 | 2 1.94372874328943728 2.13821938291038210
640 | "m"
641 | 2 2.13821938291038210 2.341428074708195
642 | "oU"
643 | 1 1.6229213249309031 2.341428074708195
644 | "demo"
645 | 1 2.341428074708195 2.8
646 | ""
647 | 2 2.341428074708195 2.8
648 | ""
649 | """
650 | 
651 | if __name__ == "__main__":
652 |     demo()
653 | 
654 | 


--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import tables
  3 | #from matplotlib.pyplot import figure, show
  4 | import scipy.linalg
  5 | 
  6 | def make_delayed(stim, delays, circpad=False):
  7 |     """Creates non-interpolated concatenated delayed versions of [stim] with the given [delays] 
  8 |     (in samples).
  9 |     
 10 |     If [circpad], instead of being padded with zeros, [stim] will be circularly shifted.
 11 |     """
 12 |     nt,ndim = stim.shape
 13 |     dstims = []
 14 |     for di,d in enumerate(delays):
 15 |         dstim = np.zeros((nt, ndim))
 16 |         if d<0: ## negative delay
 17 |             dstim[:d,:] = stim[-d:,:]
 18 |             if circpad:
 19 |                 dstim[d:,:] = stim[:-d,:]
 20 |         elif d>0:
 21 |             dstim[d:,:] = stim[:-d,:]
 22 |             if circpad:
 23 |                 dstim[:d,:] = stim[-d:,:]
 24 |         else: ## d==0
 25 |             dstim = stim.copy()
 26 |         dstims.append(dstim)
 27 |     return np.hstack(dstims)
 28 | 
 29 | def best_corr_vec(wvec, vocab, SU, n=10):
 30 |     """Returns the [n] words from [vocab] most similar to the given [wvec], where each word is represented
 31 |     as a row in [SU].  Similarity is computed using correlation."""
 32 |     wvec = wvec - np.mean(wvec)
 33 |     nwords = len(vocab)
 34 |     corrs = np.nan_to_num([np.corrcoef(wvec, SU[wi,:]-np.mean(SU[wi,:]))[1,0] for wi in range(nwords-1)])
 35 |     scorrs = np.argsort(corrs)
 36 |     words = list(reversed([(corrs[i],vocab[i]) for i in scorrs[-n:]]))
 37 |     return words
 38 | 
 39 | def get_word_prob():
 40 |     """Returns the probabilities of all the words in the mechanical turk video labels.
 41 |     """
 42 |     import constants as c
 43 |     import cPickle
 44 |     data = cPickle.load(open(c.datafile)) # Read in the words from the labels
 45 |     wordcount = dict()
 46 |     totalcount = 0
 47 |     for label in data:
 48 |         for word in label:
 49 |             totalcount += 1
 50 |             if word in wordcount:
 51 |                 wordcount[word] += 1
 52 |             else:
 53 |                 wordcount[word] = 1
 54 |     
 55 |     wordprob = dict([(word, float(wc)/totalcount) for word, wc in wordcount.items()])
 56 |     return wordprob
 57 | 
 58 | def best_prob_vec(wvec, vocab, space, wordprobs):
 59 |     """Orders the words by correlation with the given [wvec], but also weights the correlations by the prior
 60 |     probability of the word appearing in the mechanical turk video labels.
 61 |     """
 62 |     words = best_corr_vec(wvec, vocab, space, n=len(vocab)) ## get correlations for all words
 63 |     ## weight correlations by the prior probability of the word in the labels
 64 |     weightwords = []
 65 |     for wcorr,word in words:
 66 |         if word in wordprobs:
 67 |             weightwords.append((wordprobs[word]*wcorr, word))
 68 |     
 69 |     return sorted(weightwords, key=lambda ww: ww[0])
 70 | 
 71 | def find_best_words(vectors, vocab, wordspace, actual, display=True, num=15):
 72 |     cwords = []
 73 |     for si in range(len(vectors)):
 74 |         cw = best_corr_vec(vectors[si], vocab, wordspace, n=num)
 75 |         cwords.append(cw)
 76 |         if display:
 77 |             print ("Closest words to scene %d:" % si)
 78 |             print ([b[1] for b in cw])
 79 |             print ("Actual words:")
 80 |             print (actual[si])
 81 |             print ("")
 82 |     return cwords
 83 | 
 84 | def find_best_stims_for_word(wordvector, decstims, n):
 85 |     """Returns a list of the indexes of the [n] stimuli in [decstims] (should be decoded stimuli)
 86 |     that lie closest to the vector [wordvector], which should be taken from the same space as the
 87 |     stimuli.
 88 |     """
 89 |     scorrs = np.array([np.corrcoef(wordvector, ds)[0,1] for ds in decstims])
 90 |     scorrs[np.isnan(scorrs)] = -1
 91 |     return np.argsort(scorrs)[-n:][::-1]
 92 | 
 93 | def princomp(x, use_dgesvd=False):
 94 |     """Does principal components analysis on [x].
 95 |     Returns coefficients, scores and latent variable values.
 96 |     Translated from MATLAB princomp function.  Unlike the matlab princomp function, however, the
 97 |     rows of the returned value 'coeff' are the principal components, not the columns.
 98 |     """
 99 |     
100 |     n,p = x.shape
101 |     #cx = x-np.tile(x.mean(0), (n,1)) ## column-centered x
102 |     cx = x-x.mean(0)
103 |     r = np.min([n-1,p]) ## maximum possible rank of cx
104 | 
105 |     if use_dgesvd:
106 |         from svd_dgesvd import svd_dgesvd
107 |         U,sigma,coeff = svd_dgesvd(cx, full_matrices=False)
108 |     else:
109 |         U,sigma,coeff = np.linalg.svd(cx, full_matrices=False)
110 |     
111 |     sigma = np.diag(sigma)
112 |     score = np.dot(cx, coeff.T)
113 |     sigma = sigma/np.sqrt(n-1)
114 |     
115 |     latent = sigma**2
116 | 
117 |     return coeff, score, latent
118 | 
119 | def eigprincomp(x, npcs=None, norm=False, weights=None):
120 |     """Does principal components analysis on [x].
121 |     Returns coefficients (eigenvectors) and eigenvalues.
122 |     If given, only the [npcs] greatest eigenvectors/values will be returned.
123 |     If given, the covariance matrix will be computed using [weights] on the samples.
124 |     """
125 |     n,p = x.shape
126 |     #cx = x-np.tile(x.mean(0), (n,1)) ## column-centered x
127 |     cx = x-x.mean(0)
128 |     r = np.min([n-1,p]) ## maximum possible rank of cx
129 |     
130 |     xcov = np.cov(cx.T)
131 |     if norm:
132 |         xcov /= n
133 |     
134 |     if npcs is not None:
135 |         latent,coeff = scipy.linalg.eigh(xcov, eigvals=(p-npcs,p-1))
136 |     else:
137 |         latent,coeff = np.linalg.eigh(xcov)
138 |     
139 |     ## Transpose coeff, reverse its rows
140 |     return coeff.T[::-1], latent[::-1]
141 | 
142 | def weighted_cov(x, weights=None):
143 |     """If given [weights], the covariance will be computed using those weights on the samples.
144 |     Otherwise the simple covariance will be returned.
145 |     """
146 |     if weights is None:
147 |         return np.cov(x)
148 |     else:
149 |         w = weights/weights.sum() ## Normalize the weights
150 |         dmx = (x.T-(w*x).sum(1)).T ## Subtract the WEIGHTED mean
151 |         wfact = 1/(1-(w**2).sum()) ## Compute the weighting factor
152 |         return wfact*np.dot(w*dmx, dmx.T.conj()) ## Take the weighted inner product
153 | 
154 | def test_weighted_cov():
155 |     """Runs a test on the weighted_cov function, creating a dataset for which the covariance is known
156 |     for two different populations, and weights are used to reproduce the individual covariances.
157 |     """
158 |     T = 1000 ## number of time points
159 |     N = 100 ## A signals
160 |     M = 100 ## B signals
161 |     snr = 5 ## signal to noise ratio
162 |     
163 |     ## Create the two datasets
164 |     siga = np.random.rand(T)
165 |     noisea = np.random.rand(T, N)
166 |     respa = (noisea.T+snr*siga).T
167 | 
168 |     sigb = np.random.rand(T)
169 |     noiseb = np.random.rand(T, M)
170 |     respb = (noiseb.T+snr*sigb).T
171 | 
172 |     ## Compute self-covariance matrixes
173 |     cova = np.cov(respa)
174 |     covb = np.cov(respb)
175 | 
176 |     ## Compute the full covariance matrix
177 |     allresp = np.hstack([respa, respb])
178 |     fullcov = np.cov(allresp)
179 | 
180 |     ## Make weights that will recover individual covariances
181 |     wta = np.ones([N+M,])
182 |     wta[N:] = 0
183 | 
184 |     wtb = np.ones([N+M,])
185 |     wtb[:N] = 0
186 | 
187 |     recova = weighted_cov(allresp, wta)
188 |     recovb = weighted_cov(allresp, wtb)
189 |     
190 |     return locals()
191 | 
192 | def fixPCs(orig, new):
193 |     """Finds and fixes sign-flips in PCs by finding the coefficient with the greatest
194 |     magnitude in the [orig] PCs, then negating the [new] PCs if that coefficient has
195 |     a different sign.
196 |     """
197 |     flipped = []
198 |     for o,n in zip(orig, new):
199 |         maxind = np.abs(o).argmax()
200 |         if o[maxind]*n[maxind]>0:
201 |             ## Same sign, no need to flip
202 |             flipped.append(n)
203 |         else:
204 |             ## Different sign, flip
205 |             flipped.append(-n)
206 |     
207 |     return np.vstack(flipped)
208 | 
209 | 
210 | def plot_model_comparison(corrs1, corrs2, name1, name2, thresh=0.35):
211 |     fig = figure(figsize=(8,8))
212 |     ax = fig.add_subplot(1,1,1)
213 |     
214 |     good1 = corrs1>thresh
215 |     good2 = corrs2>thresh
216 |     better1 = corrs1>corrs2
217 |     #both = np.logical_and(good1, good2)
218 |     neither = np.logical_not(np.logical_or(good1, good2))
219 |     only1 = np.logical_and(good1, better1)
220 |     only2 = np.logical_and(good2, np.logical_not(better1))
221 |     
222 |     ptalpha = 0.3
223 |     ax.plot(corrs1[neither], corrs2[neither], 'ko', alpha=ptalpha)
224 |     #ax.plot(corrs1[both], corrs2[both], 'go', alpha=ptalpha)
225 |     ax.plot(corrs1[only1], corrs2[only1], 'ro', alpha=ptalpha)
226 |     ax.plot(corrs1[only2], corrs2[only2], 'bo', alpha=ptalpha)
227 |     
228 |     lims = [-0.5, 1.0]
229 |     
230 |     ax.plot([thresh, thresh], [lims[0], thresh], 'r-')
231 |     ax.plot([lims[0], thresh], [thresh,thresh], 'b-')
232 |     
233 |     ax.text(lims[0]+0.05, thresh, "$n=%d$"%np.sum(good2), horizontalalignment="left", verticalalignment="bottom")
234 |     ax.text(thresh, lims[0]+0.05, "$n=%d$"%np.sum(good1), horizontalalignment="left", verticalalignment="bottom")
235 |     
236 |     ax.plot(lims, lims, '-', color="gray")
237 |     ax.set_xlim(lims)
238 |     ax.set_ylim(lims)
239 |     ax.set_xlabel(name1)
240 |     ax.set_ylabel(name2)
241 |     
242 |     show()
243 |     return fig
244 | 
245 | import matplotlib.colors
246 | bwr = matplotlib.colors.LinearSegmentedColormap.from_list("bwr", ((0.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 0.0, 0.0)))
247 | bkr = matplotlib.colors.LinearSegmentedColormap.from_list("bkr", ((0.0, 0.0, 1.0), (0.0, 0.0, 0.0), (1.0, 0.0, 0.0)))
248 | bgr = matplotlib.colors.LinearSegmentedColormap.from_list("bgr", ((0.0, 0.0, 1.0), (0.5, 0.5, 0.5), (1.0, 0.0, 0.0)))
249 | 
250 | def plot_model_comparison2(corrFile1, corrFile2, name1, name2, thresh=0.35):    
251 |     fig = figure(figsize=(9,10))
252 |     #ax = fig.add_subplot(3,1,[1,2], aspect="equal")
253 |     ax = fig.add_axes([0.25, 0.4, 0.6, 0.5], aspect="equal")
254 | 
255 |     corrs1 = tables.openFile(corrFile1).root.semcorr.read()
256 |     corrs2 = tables.openFile(corrFile2).root.semcorr.read()
257 |     maxcorr = np.clip(np.vstack([corrs1, corrs2]).max(0), 0, thresh)/thresh
258 |     corrdiff = (corrs1-corrs2) + 0.5
259 |     colors = (bgr(corrdiff).T*maxcorr).T
260 |     colors[:,3] = 1.0 ## Don't scale alpha
261 |     
262 |     ptalpha = 0.8
263 |     ax.scatter(corrs1, corrs2, s=10, c=colors, alpha=ptalpha, edgecolors="none")
264 |     lims = [-0.5, 1.0]
265 |     
266 |     ax.plot([thresh, thresh], [lims[0], thresh], color="gray")
267 |     ax.plot([lims[0], thresh], [thresh,thresh], color="gray")
268 | 
269 |     good1 = corrs1>thresh
270 |     good2 = corrs2>thresh
271 |     ax.text(lims[0]+0.05, thresh, "$n=%d$"%np.sum(good2), horizontalalignment="left", verticalalignment="bottom")
272 |     ax.text(thresh, lims[0]+0.05, "$n=%d$"%np.sum(good1), horizontalalignment="left", verticalalignment="bottom")
273 |     
274 |     ax.plot(lims, lims, '-', color="gray")
275 |     ax.set_xlim(lims)
276 |     ax.set_ylim(lims)
277 |     ax.set_xlabel(name1+" model")
278 |     ax.set_ylabel(name2+" model")
279 | 
280 |     fig.canvas.draw()
281 |     show()
282 |     ## Add over-under comparison
283 |     #ax_left = ax.get_window_extent()._bbox.x0
284 |     #ax_right = ax.get_window_extent()._bbox.x1
285 |     #ax_width = ax_right-ax_left
286 |     #print ax_left, ax_right
287 |     #ax2 = fig.add_axes([ax_left, 0.1, ax_width, 0.2])
288 |     ax2 = fig.add_axes([0.25, 0.1, 0.6, 0.25])#, sharex=ax)
289 |     #ax2 = fig.add_subplot(3, 1, 3)
290 |     #plot_model_overunder_comparison(corrs1, corrs2, name1, name2, thresh=thresh, ax=ax2)
291 |     plot_model_histogram_comparison(corrs1, corrs2, name1, name2, thresh=thresh, ax=ax2)
292 | 
293 |     fig.suptitle("Model comparison: %s vs. %s"%(name1, name2))
294 |     show()
295 |     return fig
296 | 
297 | 
298 | def plot_model_overunder_comparison(corrs1, corrs2, name1, name2, thresh=0.35, ax=None):
299 |     """Plots over-under difference between two models.
300 |     """
301 |     if ax is None:
302 |         fig = figure(figsize=(8,8))
303 |         ax = fig.add_subplot(1,1,1)
304 | 
305 |     maxcorr = max(corrs1.max(), corrs2.max())
306 |     vals = np.linspace(0, maxcorr, 500)
307 |     overunder = lambda c: np.array([np.sum(c>v)-np.sum(c<-v) for v in vals])
308 | 
309 |     ou1 = overunder(corrs1)
310 |     ou2 = overunder(corrs2)
311 | 
312 |     oud = ou2-ou1
313 | 
314 |     ax.fill_between(vals, 0, np.clip(oud, 0, 1e9), facecolor="blue")
315 |     ax.fill_between(vals, 0, np.clip(oud, -1e9, 0), facecolor="red")
316 | 
317 |     yl = np.max(np.abs(np.array(ax.get_ylim())))
318 |     ax.plot([thresh, thresh], [-yl, yl], '-', color="gray")
319 |     ax.set_ylim(-yl, yl)
320 |     ax.set_xlim(0, maxcorr)
321 |     ax.set_xlabel("Voxel correlation")
322 |     ax.set_ylabel("%s better           %s better"%(name1, name2))
323 | 
324 |     show()
325 |     return ax
326 | 
327 | def plot_model_histogram_comparison(corrs1, corrs2, name1, name2, thresh=0.35, ax=None):
328 |     """Plots over-under difference between two models.
329 |     """
330 |     if ax is None:
331 |         fig = figure(figsize=(8,8))
332 |         ax = fig.add_subplot(1,1,1)
333 |     
334 |     maxcorr = max(corrs1.max(), corrs2.max())
335 |     nbins = 100
336 |     hist1 = np.histogram(corrs1, nbins, range=(-1,1))
337 |     hist2 = np.histogram(corrs2, nbins, range=(-1,1))
338 | 
339 |     ouhist1 = hist1[0][nbins/2:]-hist1[0][:nbins/2][::-1]
340 |     ouhist2 = hist2[0][nbins/2:]-hist2[0][:nbins/2][::-1]
341 | 
342 |     oud = ouhist2-ouhist1
343 |     bwidth = 2.0/nbins
344 |     barlefts = hist1[1][nbins/2:-1]
345 | 
346 |     #ax.fill_between(vals, 0, np.clip(oud, 0, 1e9), facecolor="blue")
347 |     #ax.fill_between(vals, 0, np.clip(oud, -1e9, 0), facecolor="red")
348 | 
349 |     ax.bar(barlefts, np.clip(oud, 0, 1e9), bwidth, facecolor="blue")
350 |     ax.bar(barlefts, np.clip(oud, -1e9, 0), bwidth, facecolor="red")
351 | 
352 |     yl = np.max(np.abs(np.array(ax.get_ylim())))
353 |     ax.plot([thresh, thresh], [-yl, yl], '-', color="gray")
354 |     ax.set_ylim(-yl, yl)
355 |     ax.set_xlim(0, maxcorr)
356 |     ax.set_xlabel("Voxel correlation")
357 |     ax.set_ylabel("%s better           %s better"%(name1, name2))
358 | 
359 |     show()
360 |     return ax
361 | 
362 | 
363 | def plot_model_comparison_rois(corrs1, corrs2, name1, name2, roivoxels, roinames, thresh=0.35):
364 |     """Plots model correlation comparisons per ROI.
365 |     """
366 |     fig = figure()
367 |     ptalpha = 0.3
368 |     
369 |     for ri in range(len(roinames)):
370 |         ax = fig.add_subplot(4, 4, ri+1)
371 |         ax.plot(corrs1[roivoxels[ri]], corrs2[roivoxels[ri]], 'bo', alpha=ptalpha)
372 |         lims = [-0.3, 1.0]
373 |         ax.plot(lims, lims, '-', color="gray")
374 |         ax.set_xlim(lims)
375 |         ax.set_ylim(lims)
376 |         ax.set_title(roinames[ri])
377 |     
378 |     show()
379 |     return fig
380 | 
381 | def save_table_file(filename, filedict):
382 |     """Saves the variables in [filedict] in a hdf5 table file at [filename].
383 |     """
384 |     hf = tables.openFile(filename, mode="w", title="save_file")
385 |     for vname, var in filedict.items():
386 |         hf.createArray("/", vname, var)
387 |     hf.close()
388 | 
389 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np
  3 | #import scipy.stats
  4 | import random
  5 | import sys
  6 | import os
  7 | 
  8 | def zscore(mat, return_unzvals=False):
  9 |     """Z-scores the rows of [mat] by subtracting off the mean and dividing
 10 |     by the standard deviation.
 11 |     If [return_unzvals] is True, a matrix will be returned that can be used
 12 |     to return the z-scored values to their original state.
 13 |     """
 14 |     zmat = np.empty(mat.shape, mat.dtype)
 15 |     unzvals = np.zeros((zmat.shape[0], 2), mat.dtype)
 16 |     for ri in range(mat.shape[0]):
 17 |         unzvals[ri,0] = np.std(mat[ri,:])
 18 |         unzvals[ri,1] = np.mean(mat[ri,:])
 19 |         zmat[ri,:] = (mat[ri,:]-unzvals[ri,1]) / (1e-10+unzvals[ri,0])
 20 |     
 21 |     if return_unzvals:
 22 |         return zmat, unzvals
 23 |     
 24 |     return zmat
 25 | 
 26 | def center(mat, return_uncvals=False):
 27 |     """Centers the rows of [mat] by subtracting off the mean, but doesn't 
 28 |     divide by the SD.
 29 |     Can be undone like zscore.
 30 |     """
 31 |     cmat = np.empty(mat.shape)
 32 |     uncvals = np.ones((mat.shape[0], 2))
 33 |     for ri in range(mat.shape[0]):
 34 |         uncvals[ri,1] = np.mean(mat[ri,:])
 35 |         cmat[ri,:] = mat[ri,:]-uncvals[ri,1]
 36 |     
 37 |     if return_uncvals:
 38 |         return cmat, uncvals
 39 |     
 40 |     return cmat
 41 | 
 42 | def unzscore(mat, unzvals):
 43 |     """Un-Z-scores the rows of [mat] by multiplying by unzvals[:,0] (the standard deviations)
 44 |     and then adding unzvals[:,1] (the row means).
 45 |     """
 46 |     unzmat = np.empty(mat.shape)
 47 |     for ri in range(mat.shape[0]):
 48 |         unzmat[ri,:] = mat[ri,:]*(1e-10+unzvals[ri,0])+unzvals[ri,1]
 49 |     return unzmat
 50 | 
 51 | def ridge(A, b, alpha):
 52 |     """Performs ridge regression, estimating x in Ax=b with a regularization
 53 |     parameter of alpha.
 54 |     With $G=\alpha I(m_A)$, this function returns $W$ with:
 55 |     $W=(A^TA+G^TG)^{-1}A^Tb^T$
 56 |     Tantamount to minimizing $||Ax-b||+||\alpha I||$.
 57 |     """
 58 |     G = np.matrix(np.identity(A.shape[1]) * alpha)
 59 |     return np.dot(np.dot(np.linalg.inv(np.dot(A.T,A) + np.dot(G.T,G)), A.T), b.T)
 60 | 
 61 | def model_voxels(Rstim, Pstim, Rresp, Presp, alpha):
 62 |     """Use ridge regression with regularization parameter [alpha] to model [Rresp]
 63 |     using [Rstim].  Correlation coefficients on the test set ([Presp] and [Pstim])
 64 |     will be returned for each voxel, as well as the linear weights.
 65 |     """
 66 |     print ("Z-scoring stimuli (with a flip)... (or not)")
 67 |     #zRstim = zscore(Rstim.T).T
 68 |     #zPstim = zscore(Pstim.T).T
 69 |     
 70 |     Rresp[np.isnan(Rresp)] = 0.0
 71 |     Presp[np.isnan(Presp)] = 0.0
 72 |     
 73 |     print ("Running ridge regression...")
 74 |     rwts = ridge(Rstim, Rresp.T, alpha)
 75 |     print ("Finding correlations...")
 76 |     pred = np.dot(Pstim, rwts)
 77 |     prednorms = np.apply_along_axis(np.linalg.norm, 0, pred)
 78 |     respnorms = np.apply_along_axis(np.linalg.norm, 0, Presp)
 79 |     correlations = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*respnorms)
 80 |     
 81 |     print ("Max correlation: %0.3f" % np.max(correlations))
 82 |     print ("Skewness: %0.3f" % scipy.stats.skew(correlations))
 83 |     return np.array(correlations), rwts
 84 | 
 85 | def model_voxels_old(Rstim, Pstim, Rresp, Presp, alpha):
 86 |     """Use ridge regression with regularization parameter [alpha] to model [Rresp]
 87 |     using [Rstim].  Correlation coefficients on the test set ([Presp] and [Pstim])
 88 |     will be returned for each voxel, as well as the linear weights.
 89 |     """
 90 |     print ("Z-scoring stimuli (with a flip)...")
 91 |     #zRstim = zscore(Rstim.T).T
 92 |     #zPstim = zscore(Pstim.T).T
 93 |     
 94 |     Rresp[np.isnan(Rresp)] = 0.0
 95 |     Presp[np.isnan(Presp)] = 0.0
 96 |     
 97 |     print ("Running ridge regression...")
 98 |     rwts = ridge(Rstim, Rresp.T, alpha)
 99 |     print ("Finding correlations...")
100 |     correlations = []
101 |     for vi in range(Presp.shape[1]):
102 |         rcorr = np.corrcoef(Presp[:,vi].T,np.array((np.matrix(Pstim) * np.matrix(rwts[:,vi]))).T)[0,1]
103 |         correlations.append(rcorr)
104 |         
105 |     print ("Max correlation: %0.3f" % np.max(correlations))
106 |     print ("Skewness: %0.3f" % scipy.stats.skew(correlations))
107 |     return np.array(correlations), rwts
108 | 
109 | def gaussianize(vec):
110 |     """Uses a look-up table to force the values in [vec] to be gaussian."""
111 |     ranks = np.argsort(np.argsort(vec))
112 |     cranks = (ranks+1).astype(float)/(ranks.max()+2)
113 |     vals = scipy.stats.norm.isf(1-cranks)
114 |     zvals = vals/vals.std()
115 |     return zvals
116 | 
117 | def gaussianize_mat(mat):
118 |     """Gaussianizes each column of [mat]."""
119 |     gmat = np.empty(mat.shape)
120 |     for ri in range(mat.shape[1]):
121 |         gmat[:,ri] = gaussianize(mat[:,ri])
122 |     return gmat
123 | 
124 | def make_delayed(stim, delays, circpad=False):
125 |     """Creates non-interpolated concatenated delayed versions of [stim] with the given [delays] 
126 |     (in samples).
127 |     
128 |     If [circpad], instead of being padded with zeros, [stim] will be circularly shifted.
129 |     """
130 |     nt,ndim = stim.shape
131 |     dstims = []
132 |     for di,d in enumerate(delays):
133 |         dstim = np.zeros((nt, ndim))
134 |         if d<0: ## negative delay
135 |             dstim[:d,:] = stim[-d:,:]
136 |             if circpad:
137 |                 dstim[d:,:] = stim[:-d,:]
138 |         elif d>0:
139 |             dstim[d:,:] = stim[:-d,:]
140 |             if circpad:
141 |                 dstim[:d,:] = stim[-d:,:]
142 |         else: ## d==0
143 |             dstim = stim.copy()
144 |         dstims.append(dstim)
145 |     return np.hstack(dstims)
146 | 
147 | def mult_diag(d, mtx, left=True):
148 |     """Multiply a full matrix by a diagonal matrix.
149 |     This function should always be faster than dot.
150 | 
151 |     Input:
152 |       d -- 1D (N,) array (contains the diagonal elements)
153 |       mtx -- 2D (N,N) array
154 | 
155 |     Output:
156 |       mult_diag(d, mts, left=True) == dot(diag(d), mtx)
157 |       mult_diag(d, mts, left=False) == dot(mtx, diag(d))
158 |     
159 |     By Pietro Berkes
160 |     From http://mail.scipy.org/pipermail/numpy-discussion/2007-March/026807.html
161 |     """
162 |     if left:
163 |         return (d*mtx.T).T
164 |     else:
165 |         return d*mtx
166 | 
167 | import time
168 | import logging
169 | def counter(iterable, countevery=100, total=None, logger=logging.getLogger("counter")):
170 |     """Logs a status and timing update to [logger] every [countevery] draws from [iterable].
171 |     If [total] is given, log messages will include the estimated time remaining.
172 |     """
173 |     start_time = time.time()
174 | 
175 |     ## Check if the iterable has a __len__ function, use it if no total length is supplied
176 |     if total is None:
177 |         if hasattr(iterable, "__len__"):
178 |             total = len(iterable)
179 |     
180 |     for count, thing in enumerate(iterable):
181 |         yield thing
182 |         
183 |         if not count%countevery:
184 |             current_time = time.time()
185 |             rate = float(count+1)/(current_time-start_time)
186 | 
187 |             if rate>1: ## more than 1 item/second
188 |                 ratestr = "%0.2f items/second"%rate
189 |             else: ## less than 1 item/second
190 |                 ratestr = "%0.2f seconds/item"%(rate**-1)
191 |             
192 |             if total is not None:
193 |                 remitems = total-(count+1)
194 |                 remtime = remitems/rate
195 |                 timestr = ", %s remaining" % time.strftime('%H:%M:%S', time.gmtime(remtime))
196 |                 itemstr = "%d/%d"%(count+1, total)
197 |             else:
198 |                 timestr = ""
199 |                 itemstr = "%d"%(count+1)
200 | 
201 |             formatted_str = "%s items complete (%s%s)"%(itemstr,ratestr,timestr)
202 |             if logger is None:
203 |                 print (formatted_str)
204 |             else:
205 |                 logger.info(formatted_str)
206 | 
207 | 
208 | def wait_for_disk(dir, maxtime=0.2, retrytime=10.0, maxtries=100):
209 |     """Waits to continue until disk is not slammed.
210 |     """
211 |     for trynum in range(maxtries):
212 |         stime = time.time()
213 |         os.listdir(dir)
214 |         lstime = time.time() - stime
215 |         if lstime < maxtime:
216 |             print ("Disk access is quick (%0.3f seconds to ls), continuing.." % lstime)
217 |             return
218 |         else:
219 |             print ("Disk access is slow (%0.3f seconds to ls), waiting more.." % lstime)
220 |             time.sleep(retrytime)
221 | 
222 |     print ("Disk access is slow but fuck it, I'm starting anyway..")
223 | 


--------------------------------------------------------------------------------