├── .gitignore ├── DataSequence.py ├── README.md ├── SemanticModel.py ├── SpeechModelTutorial - Pre-run.html ├── SpeechModelTutorial.ipynb ├── dsutils.py ├── english1000.py ├── features.py ├── interpdata.py ├── npp.py ├── ridge.py ├── stimulus_utils.py ├── textgrid.py ├── util.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /DataSequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import itertools as itools 3 | from interpdata import sincinterp2D, gabor_xfm2D, lanczosinterp2D 4 | 5 | class DataSequence(object): 6 | """DataSequence class provides a nice interface for handling data that is both continuous 7 | and discretely chunked. For example, semantic projections of speech stimuli must be 8 | considered both at the level of single words (which are continuous throughout the stimulus) 9 | and at the level of TRs (which contain discrete chunks of words). 10 | """ 11 | def __init__(self, data, split_inds, data_times=None, tr_times=None): 12 | """Initializes the DataSequence with the given [data] object (which can be any iterable) 13 | and a collection of [split_inds], which should be the indices where the data is split into 14 | separate TR chunks. 15 | """ 16 | self.data = data 17 | self.split_inds = split_inds 18 | self.data_times = data_times 19 | self.tr_times = tr_times 20 | 21 | def mapdata(self, fun): 22 | """Creates a new DataSequence where each element of [data] is produced by mapping the 23 | function [fun] onto this DataSequence's [data]. 24 | 25 | The [split_inds] are preserved exactly. 26 | """ 27 | return DataSequence(list(map(fun, self.data)), self.split_inds) 28 | 29 | def chunks(self): 30 | """Splits the stored [data] into the discrete chunks and returns them.
31 | """ 32 | return np.split(self.data, self.split_inds) 33 | 34 | def data_to_chunk_ind(self, dataind): 35 | """Returns the index of the chunk containing the data with the given index. 36 | """ 37 | zc = np.zeros((len(self.data),)) 38 | zc[dataind] = 1.0 39 | ch = np.array([ch.sum() for ch in np.split(zc, self.split_inds)]) 40 | return np.nonzero(ch)[0][0] 41 | 42 | def chunk_to_data_ind(self, chunkind): 43 | """Returns the indexes of the data contained in the chunk with the given index. 44 | """ 45 | return list(np.split(np.arange(len(self.data)), self.split_inds)[chunkind]) 46 | 47 | def chunkmeans(self): 48 | """Splits the stored [data] into the discrete chunks, then takes the mean of each chunk 49 | (this is assuming that [data] is a numpy array) and returns the resulting matrix with 50 | one row per chunk. 51 | """ 52 | dsize = self.data.shape[1] 53 | outmat = np.zeros((len(self.split_inds)+1, dsize)) 54 | for ci, c in enumerate(self.chunks()): 55 | if len(c): 56 | outmat[ci] = np.vstack(c).mean(0) 57 | 58 | return outmat 59 | 60 | def chunksums(self, interp="rect", **kwargs): 61 | """Splits the stored [data] into the discrete chunks, then takes the sum of each chunk 62 | (this is assuming that [data] is a numpy array) and returns the resulting matrix with 63 | one row per chunk. 64 | 65 | If [interp] is "sinc", the signal will be downsampled using a truncated sinc filter 66 | instead of a rectangular filter. 67 | 68 | If [interp] is "lanczos", the signal will be downsampled using a Lanczos filter. 69 | 70 | [kwargs] are passed to the interpolation function. 71 | """ 72 | if interp=="sinc": 73 | ## downsample using sinc filter 74 | return sincinterp2D(self.data, self.data_times, self.tr_times, **kwargs) 75 | elif interp=="lanczos": 76 | ## downsample using Lanczos filter 77 | return lanczosinterp2D(self.data, self.data_times, self.tr_times, **kwargs) 78 | elif interp=="gabor": 79 | ## downsample using Gabor filter 80 | return np.abs(gabor_xfm2D(self.data.T, self.data_times, self.tr_times, **kwargs)).T 81 | else: 82 | dsize = self.data.shape[1] 83 | outmat = np.zeros((len(self.split_inds)+1, dsize)) 84 | for ci, c in enumerate(self.chunks()): 85 | if len(c): 86 | outmat[ci] = np.vstack(c).sum(0) 87 | 88 | return outmat 89 | 90 | def copy(self): 91 | """Returns a copy of this DataSequence. 92 | """ 93 | return DataSequence(list(self.data), self.split_inds.copy(), self.data_times, self.tr_times) 94 | 95 | @classmethod 96 | def from_grid(cls, grid_transcript, trfile): 97 | """Creates a new DataSequence from a [grid_transcript] and a [trfile]. 98 | grid_transcript should be the product of the 'make_simple_transcript' method of TextGrid. 99 | """ 100 | data_entries = list(zip(*grid_transcript))[2] 101 | if isinstance(data_entries[0], str): 102 | data = list(map(str.lower, list(zip(*grid_transcript))[2])) 103 | else: 104 | data = data_entries 105 | word_starts = np.array(list(map(float, list(zip(*grid_transcript))[0]))) 106 | word_ends = np.array(list(map(float, list(zip(*grid_transcript))[1]))) 107 | word_avgtimes = (word_starts + word_ends)/2.0 108 | 109 | tr = trfile.avgtr 110 | trtimes = trfile.get_reltriggertimes() 111 | 112 | split_inds = [(word_starts<(t+tr)).sum() for t in trtimes][:-1] 113 | return cls(data, split_inds, word_avgtimes, trtimes+tr/2.0) 114 | 115 | @classmethod 116 | def from_chunks(cls, chunks): 117 | """The inverse operation of DataSequence.chunks(), this function concatenates 118 | the [chunks] and infers split_inds.
119 | """ 120 | lens = list(map(len, chunks)) 121 | split_inds = np.cumsum(lens)[:-1] 122 | #data = reduce(list.__add__, map(list, chunks)) ## 2.26s for 10k 6-w chunks 123 | data = list(itools.chain(*map(list, chunks))) ## 19.6ms for 10k 6-w chunks 124 | return cls(data, split_inds) 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # speechmodeltutorial 2 | 3 | Originally given as a tutorial at EACL 2014 by Alex Huth. 4 | 5 | In this tutorial you will step through a voxel-wise modeling analysis. You will use computational models to extract semantic features from a natural speech stimulus. Then these features will be used to build linear models of fMRI data, and model weights and prediction performance will be visualized. 6 | 7 | If you so desire, you can step through this entire tutorial without modifying any code. But there are a few points where you will be able to make simple modifications and then see what effect those modifications have on the results. Additionally, at the end you can re-run the model using phoneme features instead of semantic features. 8 | 9 | #### Acknowledgements 10 | The fMRI data used in this tutorial was collected by Alex Huth and Wendy de Heer at the University of California, Berkeley. All work was supervised by professors Jack Gallant and Frederic Theunissen of the UC Berkeley Psychology Department. Please do not redistribute the code or data used here. Visualization is done using [pycortex](https://pycortex.github.io/). 11 | 12 | #### Citation 13 | The analysis demonstrated in this tutorial forms the basis of this paper: 14 | [Huth, A. G. et al., "Natural speech reveals the semantic maps that tile human cerebral cortex" (2016) _Nature_.](https://www.nature.com/articles/nature17637) 15 | 16 | Installation 17 | ------------ 18 | 1. Download the [data files](https://utexas.box.com/shared/static/4n3lemyec0wlj5rcr80991nxwflsbks9.zip) and unzip in this directory. This should create a directory called `data`. 19 | 2. (If not using Anaconda) install dependencies: 20 | `sudo apt-get update` 21 | `sudo apt-get install -y ipython ipython-notebook python-numpy python-scipy python-matplotlib cython python-pip python-dev python-h5py python-nibabel python-lxml python-shapely python-html5lib mayavi2 python-tables git` 22 | 23 | (If using Conda): `conda install python 'cython=0.29.36' pytables h5py jupyter matplotlib numpy scipy` (NOTE: some packages may be missing from this list) 24 | 25 | (The cython requirement is from this issue: https://github.com/gallantlab/pycortex/issues/490#issuecomment-1644641810 ) 26 | 27 | 3. Fetch and install pycortex: 28 | `git clone https://github.com/gallantlab/pycortex.git` 29 | `cd pycortex; python setup.py install` 30 | 4. Start a Jupyter notebook server in this directory (if one isn't already running): 31 | `jupyter notebook` -------------------------------------------------------------------------------- /SemanticModel.py: -------------------------------------------------------------------------------- 1 | import tables 2 | import pickle 3 | import numpy as np 4 | 5 | import logging 6 | logger = logging.getLogger("SemanticModel") 7 | 8 | class SemanticModel(object): 9 | """This class defines a semantic vector-space model based on HAL or LSA with some 10 | prescribed preprocessing pipeline. 11 | 12 | It contains two important variables: vocab and data. 13 | vocab is a 1D list (or array) of words.
14 | data is a 2D array (features by words) of word-feature values. 15 | """ 16 | def __init__(self, data, vocab): 17 | """Initializes a SemanticModel with the given [data] and [vocab]. 18 | """ 19 | self.data = data 20 | self.vocab = vocab 21 | 22 | def get_ndim(self): 23 | """Returns the number of dimensions in this model. 24 | """ 25 | return self.data.shape[0] 26 | ndim = property(get_ndim) 27 | 28 | def get_vindex(self): 29 | """Return {vocab: index} dictionary. 30 | """ 31 | if "_vindex" not in dir(self): 32 | self._vindex = dict([(v,i) for (i,v) in enumerate(self.vocab)]) 33 | return self._vindex 34 | vindex = property(get_vindex) 35 | 36 | def __getitem__(self, word): 37 | """Returns the vector corresponding to the given [word]. 38 | """ 39 | return self.data[:,self.vindex[word]] 40 | 41 | def load_root(self, rootfile, vocab): 42 | """Load the SVD-generated semantic vector space from [rootfile], assumed to be 43 | an HDF5 file. 44 | """ 45 | roothf = tables.open_file(rootfile) 46 | self.data = roothf.get_node("/R").read() 47 | self.vocab = vocab 48 | roothf.close() 49 | 50 | def load_ascii_root(self, rootfile, vocab): 51 | """Loads the SVD-generated semantic vector space from [rootfile], assumed to be 52 | an ASCII dense matrix output from SVDLIBC. 53 | """ 54 | vtfile = open(rootfile) 55 | nrows, ncols = map(int, vtfile.readline().split()) 56 | Vt = np.zeros((nrows,ncols)) 57 | nrows_done = 0 58 | for row in vtfile: 59 | Vt[nrows_done,:] = list(map(float, row.split())) 60 | nrows_done += 1 61 | 62 | self.data = Vt 63 | self.vocab = vocab 64 | 65 | def restrict_by_occurrence(self, min_rank=60, max_rank=60000): 66 | """Restricts the data to words that have an occurrence rank higher than 67 | [min_rank] and lower than [max_rank]. 68 | """ 69 | logger.debug("Restricting words by occurrence..") 70 | nwords = self.data.shape[1] 71 | wordranks = np.argsort(np.argsort(self.data[0,:])) 72 | goodwords = np.nonzero(np.logical_and((nwords-wordranks)>min_rank, 73 | (nwords-wordranks)<max_rank)) Insert cell below`), enter `%load libraryname.py`, and evaluate." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# Run this cell if your computer has a 'retina' or high DPI display. It will make the figures look much nicer.\n", 40 | "%config InlineBackend.figure_format = 'retina'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# This cell imports libraries that you will need\n", 50 | "# Run this.\n", 51 | "from matplotlib.pyplot import figure, cm\n", 52 | "import numpy as np\n", 53 | "import logging\n", 54 | "logging.basicConfig(level=logging.DEBUG)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## The semantic model: English1000\n", 62 | "Here you will load a precomputed vector-space semantic model. This semantic model will transform any word (well, any word it knows about) into a 985-dimensional vector. This 985-dimensional semantic space has the nice property that words that are close together tend to have similar meanings. Although it would have been fun to try reconstructing this semantic model in this tutorial, it takes a really long time and it doesn't seem like the parameters matter that much.
So today you're just going to work with the preconstructed semantic model.\n", 63 | "\n", 64 | "The semantic model was constructed using a decently large corpus of text (a couple billion words, comprising the stories used as stimuli here, 604 popular books, 2,405,569 Wikipedia pages, and 36,333,459 user comments scraped from reddit.com) and a lexicon of roughly 10,000 words. We selected 985 \"basis words\" from the Wikipedia \"List of 1000 basic words\" (contrary to the title, this list does not actually contain 1000 words, but this is where the title of the model comes from). These are common words that span many topics.\n", 65 | "\n", 66 | "We constructed a word co-occurrence matrix, $M$, with 985 rows and 10,470 columns. Iterating through the training corpus, we added 1 to $M_{ij}$ each time word $j$ appeared within 15 words of basis word $i$. The window size of 15 was selected to be large enough to suppress syntactic effects (word order) but no larger. Once the co-occurrence matrix was complete, we log transformed the counts, replacing $M_{ij}$ with $\log(1 + M_{ij})$. Then each row of $M$ was z-scored to correct for differences in basis word frequency, and finally each column of $M$ was z-scored to correct for word frequency. The resulting matrix is the one you're loading here.\n", 67 | "\n", 68 | "(As an aside, while I constructed this model in a totally ad hoc and unplanned way, it has properties that are very similar to Mikolov's [word2vec model](https://code.google.com/p/word2vec/) that's recently gained a lot of popularity.)\n", 69 | "\n", 70 | "Anyway, here you are going to load the model and then play with it a bit to see how it works." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Load semantic model\n", 80 | "# The SemanticModel class is something I wrote to make it easy to deal with vector-space semantic models.\n", 81 | "from SemanticModel import SemanticModel\n", 82 | "eng1000 = SemanticModel.load(\"data/english1000sm.hf5\")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# You can get the vector for a word by indexing the model with that word\n", 92 | "# For example, the vector for \"finger\":\n", 93 | "print(eng1000[\"finger\"])" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "### Visualizing a word\n", 101 | "First let's plot the length 985 vector for one word to see what it looks like." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "plot_word = \"finger\"\n", 111 | "\n", 112 | "f = figure(figsize=(15,5))\n", 113 | "ax = f.add_subplot(1,1,1)\n", 114 | "ax.plot(eng1000[plot_word], 'k')\n", 115 | "ax.axis(\"tight\")\n", 116 | "ax.set_title(\"English1000 representation for %s\" % plot_word)\n", 117 | "ax.set_xlabel(\"Feature number\")\n", 118 | "ax.set_ylabel(\"Feature value\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Visualizing more than one word\n", 126 | "Next let's plot the vectors for three words: \"finger\", \"fingers\", and \"grief\". Here you will see that \"finger\" (in black) and \"fingers\" (in red) look very similar, but \"grief\" (in blue) looks very different. Neat."
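(An aside that isn't part of the original notebook: you can also put numbers on these visual impressions. Below is a minimal sketch, assuming `eng1000` is loaded as in the cells above, that uses cosine similarity, the same kind of measure `find_words_like_word` reports further down.)

```python
# Not from the original notebook: quantify the plot above with cosine similarity.
# Assumes `eng1000` has been loaded as in the preceding cells.
import numpy as np

def cosine(a, b):
    # Cosine similarity: dot product of the two vectors after unit-normalization.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(eng1000["finger"], eng1000["fingers"]))  # similar words: should be high
print(cosine(eng1000["finger"], eng1000["grief"]))    # unrelated words: should be much lower
```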
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "plot_words = [\"finger\", \"fingers\", \"grief\"]\n", 136 | "colors = [\"k\", \"r\", \"b\"]\n", 137 | "\n", 138 | "f = figure(figsize=(15,5))\n", 139 | "ax = f.add_subplot(1,1,1)\n", 140 | "wordlines = []\n", 141 | "\n", 142 | "for word, color in zip(plot_words, colors):\n", 143 | "    wordlines.append(ax.plot(eng1000[word], color)[0])\n", 144 | "\n", 145 | "ax.axis(\"tight\")\n", 146 | "ax.set_title(\"English1000 representations for some words\")\n", 147 | "ax.set_xlabel(\"Feature number\")\n", 148 | "ax.set_ylabel(\"Feature value\")\n", 149 | "ax.legend(wordlines, plot_words)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Semantic smoothness\n", 157 | "One nice test of a vector-space semantic model is whether it results in a \"semantically smooth\" representation of the words. That is, do nearby words in the space have intuitively similar meanings? Here you can test that using the method `find_words_like_word`. \n", 158 | "\n", 159 | "Give any word (that the model knows about), and it will print out the 10 closest words (that it knows about) and their cosine similarities (or correlations, same thing in this case). This includes the word you supplied.\n", 160 | "\n", 161 | "In this next example it prints the closest words to \"finger\". All of the 10 closest words are semantically related: 9 are nouns, and 1 is a verb (\"stick\"; of course this is also a noun, I'm just assuming that the sense of \"stick\" that's close to \"finger\" is probably the verb sense, but this brings up an important point: this model does nothing to disambiguate between different word senses!).\n", 162 | "\n", 163 | "You can put different words in here and see what the model comes up with. \n", 164 | "\n", 165 | "*(Be warned: the model knows some dirty words. It was trained using the internet, after all.)*" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# Test semantic model\n", 175 | "eng1000.find_words_like_word(\"finger\")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Here is just another example, but this one is an abstract noun, \"language\". Again the model does a pretty good job at finding related words." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "eng1000.find_words_like_word(\"language\")" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "eng1000.find_words_like_vec(eng1000[\"king\"] - eng1000[\"man\"] + eng1000[\"woman\"])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## The stimuli: Moth stories\n", 208 | "Next we're going to load up the stimuli. We're not going to be dealing with the actual audio of the stories that were presented, but instead with aligned transcripts. These were generated using the UPenn forced aligner (P2FA), which figures out when each word was spoken given the transcript and the audio.
The transcripts are stored in TextGrid format (native to Praat), which can be loaded directly into Python using some code from the natural language toolkit (NLTK).\n", 209 | "\n", 210 | "Here you will load the TextGrids for the stories, as well as 'TRfiles', which specify the time points relative to story onset when the fMRI data was collected (roughly every 2 seconds).\n", 211 | "\n", 212 | "Finally the TextGrids and TRfiles will be combined together into a representation I call a DataSequence. There is nothing interesting going on here scientifically, this is just something to make subsequent steps more manageable." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# These are lists of the stories\n", 222 | "# Rstories are the names of the training (or Regression) stories, which we will use to fit our models\n", 223 | "Rstories = ['alternateithicatom', 'avatar', 'howtodraw', 'legacy', \n", 224 | " 'life', 'myfirstdaywiththeyankees', 'naked', \n", 225 | " 'odetostepfather', 'souls', 'undertheinfluence']\n", 226 | "\n", 227 | "# Pstories are the test (or Prediction) stories (well, story), which we will use to test our models\n", 228 | "Pstories = ['wheretheressmoke']\n", 229 | "\n", 230 | "allstories = Rstories + Pstories\n", 231 | "\n", 232 | "# Load TextGrids\n", 233 | "from stimulus_utils import load_grids_for_stories\n", 234 | "grids = load_grids_for_stories(allstories)\n", 235 | "\n", 236 | "# Load TRfiles\n", 237 | "from stimulus_utils import load_generic_trfiles\n", 238 | "trfiles = load_generic_trfiles(allstories)\n", 239 | "\n", 240 | "# Make word and phoneme datasequences\n", 241 | "from dsutils import make_word_ds, make_phoneme_ds\n", 242 | "wordseqs = make_word_ds(grids, trfiles) # dictionary of {storyname : word DataSequence}\n", 243 | "phonseqs = make_phoneme_ds(grids, trfiles) # dictionary of {storyname : phoneme DataSequence}" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Before going on, let's play with the DataSequences a bit, both so you can see what the data structure looks like, and also so you can see what the stimuli look like." 
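(Aside, not from the original notebook: before looking at real stories, a toy example of the DataSequence mechanics defined in DataSequence.py above may help. It shows how `split_inds` carves the word stream into TR chunks.)

```python
# Toy illustration of DataSequence: split_inds marks where the word stream
# is cut into TR chunks (indices 2 and 5 here give chunks [0:2], [2:5], [5:]).
import numpy as np
from DataSequence import DataSequence

words = np.array(["so", "i", "was", "sitting", "there", "naked"])
ds = DataSequence(words, split_inds=[2, 5])
print(ds.chunks())
# three chunks: ['so', 'i'], ['was', 'sitting', 'there'], ['naked']
```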
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "naked = wordseqs[\"naked\"]\n", 260 | "# The DataSequence stores a lot of information\n", 261 | "# naked.data is a list of all the words in the story\n", 262 | "print (\"There are %d words in the story called 'naked'\" % len(list(naked.data)))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# We can print out the first 100 words like this\n", 272 | "print (list(naked.data)[:100])" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "# or, if you want it to be more readable, like this\n", 282 | "print (\" \".join(list(naked.data)[:100]))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "# the datasequence also stores when exactly each word was spoken (this time corresponds to the middle of each word)\n", 292 | "print (naked.data_times[:10])" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# and it also stores the time of the middle of each fMRI acquisition (each acquisition takes 2.0045 seconds)\n", 302 | "# these times are relative to story start, so the fMRI scan started 10 seconds before the story\n", 303 | "print (naked.tr_times[:10])" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# and it also makes it easy to, for example, find the words that were spoken during each fMRI acquisition\n", 313 | "# (the first few are empty because they came before the story started)\n", 314 | "print (naked.chunks()[:10])" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## Projecting the stimuli into the semantic space\n", 322 | "The next step in this analysis is to project each word in the stimulus into the English1000 semantic feature space that you loaded above. I wrote a nice function to do this called `make_semantic_model` that simply takes the word DataSequence and the semantic model, and spits out a new DataSequence where each word is replaced by a 985-dimensional vector."
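(Aside: the real `make_semantic_model` lives in dsutils.py, which isn't reproduced in this dump. Purely as a hypothetical sketch of what such a projection step could look like, using the SemanticModel API shown earlier; treating out-of-vocabulary words as zero vectors is an assumption here, not necessarily the tutorial's actual behavior.)

```python
# Hypothetical sketch of the projection step (the real code is in dsutils.py).
# Zero vectors for unknown words are an assumption, not the confirmed behavior.
import numpy as np

def project_words(ds, model):
    vecs = []
    for word in ds.data:
        if word in model.vindex:
            vecs.append(model[word])           # 985-dim vector for known words
        else:
            vecs.append(np.zeros(model.ndim))  # out-of-vocabulary placeholder
    newds = ds.copy()
    newds.data = np.vstack(vecs)               # one row per word
    return newds
```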
323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# Project stimuli\n", 332 | "from dsutils import make_semantic_model\n", 333 | "semanticseqs = dict() # dictionary to hold projected stimuli {story name : projected DataSequence}\n", 334 | "for story in allstories:\n", 335 | " semanticseqs[story] = make_semantic_model(wordseqs[story], eng1000)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "# take a look at the projected stimuli\n", 345 | "naked_proj = semanticseqs[\"naked\"]\n", 346 | "\n", 347 | "print (naked_proj.data.shape) # prints the shape of 'data' as (rows, columns)\n", 348 | "print (naked_proj.data[:10]) # print the first 10 rows (this will be truncated)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Downsample the projected stimuli\n", 356 | "In order to build a model, you need to downsample the semantic representations of the stimuli to the same temporal scale as the fMRI responses that you will be modeling. The DataSequence provides a method that does this, called `chunksums`.\n", 357 | "\n", 358 | "For those of you who are interested, downsampling is accomplished here using a 3-lobe Lanczos filter (see [here](http://en.wikipedia.org/wiki/Lanczos_window) for details about the math). You can try changing the number of lobes, it shouldn't affect the results much." 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# Downsample stimuli\n", 368 | "interptype = \"lanczos\" # filter type\n", 369 | "window = 3 # number of lobes in Lanczos filter\n", 370 | "\n", 371 | "downsampled_semanticseqs = dict() # dictionary to hold downsampled stimuli\n", 372 | "for story in allstories:\n", 373 | " downsampled_semanticseqs[story] = semanticseqs[story].chunksums(interptype, window=window)\n" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### Visualizing the downsampling\n", 381 | "Next you're going to visualize what the downsampling did. Here you're going to plot the value of one semantic feature (feature 2, which is actually the third feature: zero-based indexing) for each word, and also the downsampled vector." 
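(Aside for the mathematically curious, not from the original notebook: the 3-lobe Lanczos kernel mentioned above has a simple closed form, $L(t) = \mathrm{sinc}(t)\,\mathrm{sinc}(t/a)$ for $|t| < a$ lobes and 0 otherwise. The sketch below is for reference only; the tutorial's actual resampler is `lanczosinterp2D` in interpdata.py, whose cutoff and normalization details may differ.)

```python
# Reference sketch of a Lanczos window with `window` lobes (illustrative only;
# the tutorial's real filter is interpdata.lanczosinterp2D).
import numpy as np

def lanczos_kernel(t, window=3):
    t = np.atleast_1d(np.asarray(t, dtype=float))
    out = np.zeros_like(t)
    nz = t != 0
    out[nz] = (window * np.sin(np.pi * t[nz]) * np.sin(np.pi * t[nz] / window)
               / (np.pi ** 2 * t[nz] ** 2))
    out[t == 0] = 1.0              # the kernel peaks at 1 at t = 0
    out[np.abs(t) > window] = 0.0  # truncated outside +/- `window` lobes
    return out

print(lanczos_kernel([-1.5, 0.0, 0.5, 2.5]))
```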
382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "# Plot the result\n", 391 | "s_words = wordseqs[\"naked\"]\n", 392 | "s_sem = semanticseqs[\"naked\"]\n", 393 | "s_semdown = downsampled_semanticseqs[\"naked\"]\n", 394 | "\n", 395 | "f = figure(figsize=(15,5))\n", 396 | "f.clf()\n", 397 | "schan = 2\n", 398 | "ax = f.add_subplot(1,1,1)\n", 399 | "wordstems = ax.stem(s_sem.data_times, \n", 400 | " s_sem.data[:,schan] / np.abs(s_sem.data[:,schan]).max(), \n", 401 | " linefmt=\"k-\", markerfmt=\"k.\", basefmt=\"k-\")\n", 402 | "interps = ax.plot(s_sem.tr_times, \n", 403 | " s_semdown[:,schan] / np.abs(s_semdown[:,schan]).max(), 'r.-')\n", 404 | "ax.set_xlim(-6, 60)\n", 405 | "ax.set_ylim(-1, 1)\n", 406 | "ax.set_xlabel(\"Time (seconds since story start)\")\n", 407 | "ax.set_ylabel(\"Semantic feature value\")\n", 408 | "ax.legend((wordstems, interps[0]), (\"Individual words\", \"Downsampled feature\"));" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "### Concatenating features across stories\n", 416 | "Next you're going to combine together all the features from all the stories into one big matrix. Within this operation, you're also going to [z-score](http://en.wikipedia.org/wiki/Z-score) each feature within each story. This operation subtracts off the mean and then divides by the standard deviation. This might seem like a weird or incomprehensible thing to do, but I do it because the responses to each story are z-scored individually. Anyway not a big deal.\n", 417 | "\n", 418 | "The features for each story are also trimmed a bit (the variable `trim` determines how many time points are removed from the beginning and end of each story). The fMRI responses at the beginnings and ends of the stories are often noisier than at other times because of transients and problems with detrending (an fMRI preprocessing step that you don't need to worry about here aside from this point).\n", 419 | "\n", 420 | "The combined features are stored in big matrices called `Rstim` (with the training, or Regression stimuli) and `Pstim` (with the test, or Prediction stimuli)." 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "# Combine stimuli\n", 430 | "from npp import zscore\n", 431 | "trim = 5\n", 432 | "Rstim = np.vstack([zscore(downsampled_semanticseqs[story][5+trim:-trim]) for story in Rstories])\n", 433 | "Pstim = np.vstack([zscore(downsampled_semanticseqs[story][5+trim:-trim]) for story in Pstories])\n" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "storylens = [len(downsampled_semanticseqs[story][5+trim:-trim]) for story in Rstories]\n", 443 | "print(storylens)\n", 444 | "\n", 445 | "print(np.cumsum(storylens))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# Print the sizes of these matrices\n", 455 | "print (\"Rstim shape: \", Rstim.shape)\n", 456 | "print (\"Pstim shape: \", Pstim.shape)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Visualizing the combined stimuli\n", 464 | "Next you're going to plot some of the feature channels. 
This is just to see what the features that are going to go into the regression model look like." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "# Plot some of the combined stimuli\n", 474 | "f = figure(figsize=(20, 5))\n", 475 | "ax = f.add_subplot(1,1,1)\n", 476 | "\n", 477 | "for ii in range(10):\n", 478 | "    # Plot each feature, offset by 5 vertically so they are easier to see\n", 479 | "    ax.plot(Rstim[:750,ii] - 5 * ii)\n", 480 | "\n", 481 | "ax.set_xlim(0, 750)\n", 482 | "ax.set_yticks([])\n", 483 | "ax.set_xticks(range(0, 750, 50))\n", 484 | "ax.set_xlabel(\"Time (fMRI volumes)\")\n", 485 | "ax.set_ylabel(\"Features 1-10\")\n", 486 | "ax.grid()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "### Concatenate delayed stimuli for FIR model\n", 494 | "Next you are going to concatenate multiple delayed versions of the stimuli, in order to create a linear [finite impulse response (FIR) model](http://en.wikipedia.org/wiki/Fir_filter). This is a vitally important step, and is conceptually a bit difficult, so take a few minutes to make sure you understand what is going on here.\n", 495 | "\n", 496 | "#### Background: the hemodynamic response\n", 497 | "First you need to understand the problem that the FIR model is solving. fMRI measures the blood-oxygen level dependent (BOLD) signal, which is a complicated and nonlinear combination of blood oxygenation and blood volume. When neurons in an area of the brain become active, they start using up lots of energy. To compensate, nearby blood vessels dilate so that more oxygen and glucose become available to the neurons. The resulting changes in blood oxygenation (which increases) and volume (which also increases) create the magnetic signature that is recorded by fMRI. \n", 498 | "\n", 499 | "But this process is **slow**. It takes seconds after the neural activity begins for the blood vessels to dilate and for the BOLD response to become apparent. And then it takes more seconds for the response to go away. So although a neural response might only last milliseconds, the associated BOLD response will rise and fall over a span of maybe 10 seconds, orders of magnitude slower. The shape of this rise and fall is called the [hemodynamic response function (HRF)](http://en.wikipedia.org/wiki/Haemodynamic_response).\n", 500 | "\n", 501 | "Here is a pretty standard looking example of an HRF:\n", 502 | "\n", 503 | "\n", 504 | "\n", 505 | "#### FIR model\n", 506 | "To accurately model how the brain responds to these stimuli we must also model the HRF. There are many ways to do this. The most common is to assume that the HRF follows a canonical shape. But this approach turns out to not work very well: different parts of the brain have very different vasculature (blood vessels), so the HRF shape can vary a lot. \n", 507 | "\n", 508 | "Instead, what you are going to do here is estimate a separate HRF for each semantic feature in each voxel that is being modeled. This estimate is going to take the form of a linear finite impulse response (FIR) model. The linear FIR form is particularly nice to use because it's very simple to estimate and powerful (if anything, it might be too powerful.. more on that later). To build a linear FIR model all you have to do is concatenate together multiple delayed copies of the stimulus. I usually use four delays: 1, 2, 3, and 4 time points.
The resulting delayed features can be thought of as representing the stimulus 1, 2, 3, and 4 time points ago. So the regression weights for those features will represent how a particular voxel responds to a feature 1, 2, 3, or 4 time points in the past, and these regression weights are a 4-point estimate of the HRF for that feature in that voxel.\n", 509 | "\n", 510 | "The potential downside of the FIR model is that it may be too expressive. Each feature in each voxel is allowed to have any HRF, but this comes at the cost of multiplying the total number of regression weights that we must fit by the number of delays. In all likelihood the true HRFs vary, but they don't vary that much, so we probably don't need this many independent features. This cost becomes apparent if you increase the number of delays. This will slow down model fitting and likely decrease the stability of the regression weights, leading to decreased model performance. \n", 511 | "\n", 512 | "Feel free to play around with the number of delays and see how it affects the model results!" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "# Delay stimuli\n", 522 | "from util import make_delayed\n", 523 | "ndelays = 4\n", 524 | "delays = range(1, ndelays+1)\n", 525 | "\n", 526 | "print (\"FIR model delays: \", delays)\n", 527 | "\n", 528 | "delRstim = make_delayed(Rstim, delays)\n", 529 | "delPstim = make_delayed(Pstim, delays)\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "# Print the sizes of these matrices\n", 539 | "print (\"delRstim shape: \", delRstim.shape)\n", 540 | "print (\"delPstim shape: \", delPstim.shape)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "### Visualizing FIR features\n", 548 | "Here you will visualize the first semantic feature at each of the delays." 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "# Plot the same feature at different delays\n", 558 | "f = figure(figsize=(15, 4))\n", 559 | "ax = f.add_subplot(1,1,1)\n", 560 | "for ii in range(ndelays):\n", 561 | " ax.plot(delRstim[:500, ii * Rstim.shape[1]] - 5 * ii)\n", 562 | "ax.set_xlim(0, 500)\n", 563 | "ax.set_yticks([])\n", 564 | "ax.set_xticks(range(0, 500, 50))\n", 565 | "ax.set_xlabel(\"Time (fMRI volumes)\")\n", 566 | "ax.set_ylabel(\"Feature 1 across delays\")\n", 567 | "ax.grid()" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "## Response data\n", 575 | "Next you will load the fMRI data. This is totally the most exciting part! These responses have already been preprocessed (the 3D images were motion corrected and aligned to each other, detrended, and then z-scored within each stimulus) so you don't have to worry about that.\n", 576 | "\n", 577 | "You will load three different variables: `zRresp`, the responses to the regression dataset; `zPresp`, the responses to the prediction dataset; and `mask`, which is a 3D mask showing which voxels have been selected (we are not modeling every voxel in the scan, that would take forever, we are only modeling the voxels that overlap with the cerebral cortex)." 
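(One quick look back before loading the responses, as an aside that is not part of the original notebook: conceptually, `make_delayed` just stacks time-shifted, zero-padded copies of the feature matrix side by side. Below is a sketch consistent with the FIR description above; the real helper is in util.py, and the zero-padding at the start is an assumption here.)

```python
# Illustrative version of the delaying step (the real helper is util.make_delayed).
import numpy as np

def make_delayed_sketch(stim, delays):
    nt, ndim = stim.shape
    dstims = []
    for d in delays:
        dstim = np.zeros((nt, ndim))
        if d > 0:
            dstim[d:] = stim[:-d]  # row t holds the features from d time points ago
        else:
            dstim = stim.copy()    # delay 0 is just the undelayed stimulus
        dstims.append(dstim)
    return np.hstack(dstims)       # shape: (nt, ndim * len(delays))
```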
578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "# Load responses\n", 587 | "import tables\n", 588 | "resptf = tables.open_file(\"data/fmri-responses.hf5\")\n", 589 | "zRresp = resptf.root.zRresp.read()\n", 590 | "zPresp = resptf.root.zPresp.read()\n", 591 | "mask = resptf.root.mask.read()" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "# Print matrix shapes\n", 601 | "print (\"zRresp shape (num time points, num voxels): \", zRresp.shape)\n", 602 | "print (\"zPresp shape (num time points, num voxels): \", zPresp.shape)\n", 603 | "print (\"mask shape (Z, Y, X): \", mask.shape)" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "### Visualize where the voxels are coming from (mask)\n", 611 | "Next you will visualize where the voxels are coming from in the brain. This will give you an idea of where the data come from.\n", 612 | "\n", 613 | "First you will plot a single slice through the mask in the Z dimension. This is called an \"axial\" slice. The top of the image is the front of the brain, the bottom is the back. The left side of the image is the right side of the brain, and the right side of the image is the left side of the brain (as if you are looking up at the brain from under the subject's chin; this left-right reversal is often referred to as \"radiological coordinates\", as opposed to \"neurological coordinates\" where you are looking down from the top).\n", 614 | "\n", 615 | "Then you will plot a mosaic of all the slices. This is done using the function `mosaic` from James Gao's pyCortex package." 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "# Plot one slice of the mask that was used to select the voxels\n", 625 | "f = figure()\n", 626 | "ax = f.add_subplot(1,1,1)\n", 627 | "ax.matshow(mask[16], interpolation=\"nearest\", cmap=cm.gray) # show the 17th slice of the mask" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | "# Plot mask mosaic\n", 637 | "import cortex\n", 638 | "f = figure(figsize=(10,10))\n", 639 | "cortex.mosaic(mask, cmap=cm.gray, interpolation=\"nearest\");" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "### Visualizing the responses of a few voxels over time\n", 647 | "Next you will visualize the responses of a few selected voxels over time. I selected these particular voxels because they are reasonably well explained by the semantic model, but have some differences in their responses across time." 
648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "# Plot the response of a few voxels over time\n", 657 | "selvoxels = [20710, 27627, 24344, 34808, 22423, 25397]\n", 658 | "\n", 659 | "f = figure(figsize=(15, 5))\n", 660 | "ax = f.add_subplot(1,1,1)\n", 661 | "for ii,vi in enumerate(selvoxels):\n", 662 | "    ax.plot(zRresp[:500, vi] - 5 * ii)\n", 663 | "ax.set_xlim(0, 500)\n", 664 | "ax.set_yticks([])\n", 665 | "ax.set_xticks(range(0, 500, 50))\n", 666 | "ax.set_xlabel(\"Time (fMRI volumes)\")\n", 667 | "ax.set_ylabel(\"Voxel responses\")\n", 668 | "ax.grid()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | "metadata": {}, 674 | "source": [ 675 | "## Regression model\n", 676 | "Finally, the core of the analysis: you will fit a regression model that predicts the responses of each voxel as a weighted sum of the semantic features. This model will then be tested using a held out dataset (the Prediction dataset). And if the model proves to be reasonably predictive, then the weights of the regression model will tell us what semantic features each voxel responds to.\n", 677 | "\n", 678 | "This is a linear regression model, so if the response time course for voxel $j$ is $R_j$, the stimulus time course for semantic feature $i$ is $S_i$, and the regression weight for feature $i$ in voxel $j$ is $\beta_{ij}$, then the model can be written as:\n", 679 | "\n", 680 | "$$\hat{R}_j = \beta_{0j} S_0 + \beta_{1j} S_1 + \cdots$$\n", 681 | "\n", 682 | "or:\n", 683 | "\n", 684 | "$$\hat{R}_j = \sum_i \beta_{ij} S_i$$\n", 685 | "\n", 686 | "The trick, of course, is accurately estimating the $\beta_j$ values. This is commonly done by minimizing the sum of the squared error (here across time, $t$):\n", 687 | "\n", 688 | "$$E_j(\beta) = \sum_t (R_{jt} - \hat{R}_{jt})^2 = \sum_t (R_{jt} - \sum_i \beta_{i} S_{it})^2$$\n", 689 | "\n", 690 | "$$\beta_j = \underset{\beta}{\operatorname{argmin}} E_j(\beta)$$\n", 691 | "\n", 692 | "Computing $\beta$ this way is called ordinary least squares (OLS), and this will not work in our case because the total number of features (3940) is larger than the number of time points (3737). (It would be possible if the number of delays was smaller than 4, but it would give terrible results.. feel free to try it! OLS can be performed using the function `np.linalg.lstsq`.)\n", 693 | "\n", 694 | "In almost every case, linear regression can be improved by making some prior assumptions about the weights (or, equivalently, about the covariance structure of the stimuli). This is called **regularization**, or **regularized linear regression**. One way to do this is to penalize the error function by the sum of the squared weights. This is commonly known as **ridge regression**, and is a special case of [Tikhonov regularization](http://en.wikipedia.org/wiki/Ridge_regression). It finds the $\beta$ that minimizes the following error function:\n", 695 | "\n", 696 | "$$E_j(\beta) = \sum_t (R_{jt} - \sum_i \beta_{i} S_{it})^2 + \alpha \sum_i \beta_i^2$$\n", 697 | "\n", 698 | "(In practice we will use a different formulation that involves re-weighting the singular values of the matrix $S$ before computing its pseudoinverse.
This method achieves the same results but is extremely efficient because it uses all the linear algebra machinery that computers are so good at to build many models in parallel.)\n", 699 | "\n", 700 | "### The hyperparameter: $\\alpha$\n", 701 | "You may have noticed in the equation above that we have introduced a new parameter, $\\alpha$, which controls the strength of the regularization. If $\\alpha$ is set to zero, then we get back to exactly the OLS formulation (above). As $\\alpha$ goes to infinity, the regularization forces all the weights to go to zero (in practice this also has the slightly weirder effect of making all the weights independent, as if each feature was regressed separately on the responses).\n", 702 | "\n", 703 | "So how do we choose $\\alpha$? We're going to do it here using cross-validation. First, we split the Regression dataset up into two parts. Then we estimate the weights for a given $\\alpha$ on the first part, and test how well we can predict responses on the second part. This is repeated for each possible $\\alpha$ that we want to test, and for a couple different splits of the Regression dataset. Then we find the $\\alpha^*$ that gave us the best predictions within the split Regression dataset. Finally we estimate the weights using the entire Regression dataset and the selected $\\alpha^*$.\n", 704 | "\n", 705 | "Because this is an annoying and laborious process, I've encapsulated it within the function `bootstrap_ridge`. You simply give this function your datasets, the possible $\\alpha$ values, and a few parameters for the cross-validation, and it does all the rest. The parameter `nboots` determines the number of cross-validation tests that will be run. \n", 706 | "\n", 707 | "To do cross-validation, `bootstrap_ridge` divides the Regression dataset into many small chunks, and then splits those chunks into the two groups that will be used to estimate weights and test $\\alpha$ values. This is better than just choosing individual time points because both the fMRI data and stimuli are autocorrelated (i.e. correlated across time). The parameter `chunklen` determines the length of the chunks, and the parameter `nchunks` determines the number of chunks in the $\\alpha$-testing dataset. By default I set `chunklen` to 40 time points (80-second chunks), and set `nchunks` to 20 (40 * 20 = 800 time points for testing $\\alpha$ values, 3737-800 = 2937 time points for estimating weights). These values should not matter too much.\n", 708 | "\n", 709 | "Running the regression will take a few minutes." 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "# Run regression\n", 719 | "from ridge import bootstrap_ridge\n", 720 | "alphas = np.logspace(1, 3, 10) # Equally log-spaced alphas between 10 and 1000. 
The third number is the number of alphas to test.\n", 721 | "nboots = 1 # Number of cross-validation runs.\n", 722 | "chunklen = 40 # \n", 723 | "nchunks = 20\n", 724 | "\n", 725 | "wt, corr, alphas, bscorrs, valinds = bootstrap_ridge(delRstim, zRresp, delPstim, zPresp,\n", 726 | " alphas, nboots, chunklen, nchunks,\n", 727 | " singcutoff=1e-10, single_alpha=True)\n" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "f = figure()\n", 737 | "ax = f.add_subplot(1,1,1)\n", 738 | "ax.semilogx( np.logspace(1, 3, 10), bscorrs.mean(2).mean(1), 'o-')" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": {}, 744 | "source": [ 745 | "### Variables returned by the regression\n", 746 | "Next let's have a look at the variables returned by the regression function." 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "# wt is the regression weights\n", 756 | "print (\"wt has shape: \", wt.shape)\n", 757 | "\n", 758 | "# corr is the correlation between predicted and actual voxel responses in the Prediction dataset\n", 759 | "print (\"corr has shape: \", corr.shape)\n", 760 | "\n", 761 | "# alphas is the selected alpha value for each voxel, here it should be the same across voxels\n", 762 | "print (\"alphas has shape: \", alphas.shape)\n", 763 | "\n", 764 | "# bscorrs is the correlation between predicted and actual voxel responses for each round of cross-validation\n", 765 | "# within the Regression dataset\n", 766 | "print (\"bscorrs has shape (num alphas, num voxels, nboots): \", bscorrs.shape)\n", 767 | "\n", 768 | "# valinds is the indices of the time points in the Regression dataset that were used for each\n", 769 | "# round of cross-validation\n", 770 | "print (\"valinds has shape: \", np.array(valinds).shape)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": {}, 776 | "source": [ 777 | "### Testing the regression models by predicting responses\n", 778 | "The `bootstrap_ridge` function already computed predictions and correlations for the Prediction dataset, but this is important so let's reproduce that step more explicitly.\n", 779 | "\n", 780 | "Remember that according to the linear model, the predicted responses for each voxel are a weighted sum of the semantic features. 
An easy way to compute that is by taking the dot product between the weights and semantic features: $$\\hat{R} = S \\beta$$" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "# Predict responses in the Prediction dataset\n", 790 | "\n", 791 | "# First let's refresh ourselves on the shapes of these matrices\n", 792 | "print (\"zPresp has shape: \", zPresp.shape)\n", 793 | "print (\"wt has shape: \", wt.shape)\n", 794 | "print (\"delPstim has shape: \", delPstim.shape)" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "# Then let's predict responses by taking the dot product of the weights and stim\n", 804 | "pred = np.dot(delPstim, wt)\n", 805 | "\n", 806 | "print (\"pred has shape: \", pred.shape)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "#### Visualizing predicted and actual responses\n", 814 | "Next let's plot some predicted and actual responses side by side." 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "f = figure(figsize=(15,5))\n", 824 | "ax = f.add_subplot(1,1,1)\n", 825 | "\n", 826 | "selvox = 20710 # a decent voxel\n", 827 | "\n", 828 | "realresp = ax.plot(zPresp[:,selvox], 'k')[0]\n", 829 | "predresp = ax.plot(pred[:,selvox], 'r')[0]\n", 830 | "\n", 831 | "ax.set_xlim(0, 291)\n", 832 | "ax.set_xlabel(\"Time (fMRI time points)\")\n", 833 | "\n", 834 | "ax.legend((realresp, predresp), (\"Actual response\", \"Predicted response\"));" 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "#### Visualizing predicted and actual responses cont'd\n", 842 | "You might notice above that the predicted and actual responses look pretty different scale-wise, although the patterns of ups and downs are vaguely similar. But we don't really care about the scale -- for fMRI it's relatively arbitrary anyway, so let's rescale them both to have unit standard deviation and re-plot." 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "f = figure(figsize=(15,5))\n", 852 | "ax = f.add_subplot(1,1,1)\n", 853 | "\n", 854 | "selvox = 20710 # a good voxel\n", 855 | "\n", 856 | "realresp = ax.plot(zPresp[:,selvox], 'k')[0]\n", 857 | "predresp = ax.plot(zscore(pred[:,selvox]), 'r')[0]\n", 858 | "\n", 859 | "ax.set_xlim(0, 291)\n", 860 | "ax.set_xlabel(\"Time (fMRI time points)\")\n", 861 | "\n", 862 | "ax.legend((realresp, predresp), (\"Actual response\", \"Predicted response (scaled)\"));" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "Now you see that the actual and scaled predicted responses look very similar. We can quantify this similarity by computing the correlation between the two (correlation is scale-free, so it effectively automatically does the re-scaling that we did here). This voxel has high correlation." 
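(A brief detour, not part of the original notebook, back to how `wt` itself was obtained. The singular-value re-weighting described in the ridge section above can be written in a few lines for a single $\alpha$; this sketch solves the penalized least-squares problem as stated there. The tutorial's ridge.py additionally handles many alphas, bootstrap splits, and per-voxel selection, and may parameterize the penalty differently.)

```python
# Minimal single-alpha ridge solve via SVD (a sketch; ridge.py is the real thing).
import numpy as np

def ridge_svd(S, R, alpha, singcutoff=1e-10):
    # S: (time x features) stimulus, R: (time x voxels) responses.
    U, s, Vt = np.linalg.svd(S, full_matrices=False)
    good = s > singcutoff                    # drop tiny singular values (cf. singcutoff above)
    U, s, Vt = U[:, good], s[good], Vt[good]
    d = s / (s ** 2 + alpha)                 # re-weighted inverse singular values
    return Vt.T @ (d[:, None] * (U.T @ R))   # weights: (features x voxels)
```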
870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "# Compute correlation between single predicted and actual response\n", 879 | "# (np.corrcoef returns a correlation matrix; pull out the element [0,1] to get \n", 880 | "# correlation between the two vectors)\n", 881 | "voxcorr = np.corrcoef(zPresp[:,selvox], pred[:,selvox])[0,1]\n", 882 | "print (\"Correlation between predicted and actual responses for voxel %d: %f\" % (selvox, voxcorr))" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "#### Computing correlations for all voxels\n", 890 | "Next let's compute this correlation for every voxel in the dataset. There are some very efficient ways to do this, but here I've written a for loop so that it's very explicit what's happening. (This should give exactly the same values as the variable `corr`, which was returned by `bootstrap_ridge`.)" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": {}, 897 | "outputs": [], 898 | "source": [ 899 | "voxcorrs = np.zeros((zPresp.shape[1],)) # create zero-filled array to hold correlations\n", 900 | "for vi in range(zPresp.shape[1]):\n", 901 | " voxcorrs[vi] = np.corrcoef(zPresp[:,vi], pred[:,vi])[0,1]\n", 902 | "print (voxcorrs)" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Visualizing correlations across the brain\n", 910 | "Let's start with a supposition: the correlation should not be high everywhere, even if this is a good model of how the brain represents the semantic content of speech. There are parts of the brain that just don't respond to speech, so the correlation should be low in those areas. There are other parts of the brain that respond to speech, but maybe don't represent semantic information, so the correlation should be low in those areas as well.\n", 911 | "\n", 912 | "But let's begin by plotting a histogram of the correlations across the entire brain. This will show generally whether the model is working well or not." 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "# Plot histogram of correlations\n", 922 | "f = figure(figsize=(8,8))\n", 923 | "ax = f.add_subplot(1,1,1)\n", 924 | "ax.hist(voxcorrs, 100) # histogram correlations with 100 bins\n", 925 | "ax.set_xlabel(\"Correlation\")\n", 926 | "ax.set_ylabel(\"Num. voxels\");" 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "metadata": {}, 932 | "source": [ 933 | "If the semantic features didn't capture anything about brain activity, then we would expect the histogram to be symmetric and centered around zero. But here we see that it's highly skewed, with lots of positive values. This looks good! This model is working!\n", 934 | "\n", 935 | "Next, let's plot a mosaic of the correlations across the brain, as we plotted the mask earlier." 
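(One aside before the mosaic, on the explicit loop above: a Pearson correlation is just the mean product of z-scored signals, so the same numbers can be computed for all voxels at once.)

```python
# Vectorized equivalent of the correlation loop above (not from the original
# notebook); should match `voxcorrs` up to floating-point error.
zp = (zPresp - zPresp.mean(0)) / zPresp.std(0)
pp = (pred - pred.mean(0)) / pred.std(0)
voxcorrs_fast = (zp * pp).mean(0)
```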
936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": null, 941 | "metadata": {}, 942 | "outputs": [], 943 | "source": [ 944 | "# Plot mosaic of correlations\n", 945 | "corrvolume = np.zeros(mask.shape)\n", 946 | "corrvolume[mask>0] = voxcorrs\n", 947 | "\n", 948 | "f = figure(figsize=(10,10))\n", 949 | "cortex.mosaic(corrvolume, vmin=0, vmax=0.5, cmap=cm.hot);" 950 | ] 951 | }, 952 | { 953 | "cell_type": "markdown", 954 | "metadata": {}, 955 | "source": [ 956 | "#### 3D visualization of correlations\n", 957 | "In the mosaic we can see that there seem to be some concentrated areas of high correlation. But it's hard to say where in the brain those areas are based on the mosaic. So next you're going to create a fancy 3D visualization of the correlations using pyCortex.\n", 958 | "\n", 959 | "Once you've opened the viewer you'll be presented with a 3D view of the brain with colors showing the correlations. White outlines and labels show the locations of known brain areas (motor, somatosensory, visual, and some language areas). Drag around with your left mouse button to rotate the view, and the right mouse button to zoom in or out. \n", 960 | "\n", 961 | "By default you'll see a view of the cortex as it looks in reality: folded and convoluted. To better see parts of the brain that are hidden down in the folds, you can press \"i\" to see an inflated view (or drag the \"Mix\" slider at the bottom of the screen to the middle). This helps to see the data, but you will still need to rotate the brain to see all of it. To make the entire cortex visible at once, you can press \"f\" to see a flattened view. To create this view we cut the cortical surface at a few locations, and then flattened it out so that it can all be seen at once (but this introduces some distortions)." 962 | ] 963 | }, 964 | { 965 | "cell_type": "code", 966 | "execution_count": null, 967 | "metadata": {}, 968 | "outputs": [], 969 | "source": [ 970 | "# Plot correlations on cortex\n", 971 | "import cortex\n", 972 | "corrvol = cortex.Volume(corr, \"S1\", \"fullhead\", mask=mask, vmin=0, vmax=0.5, cmap='hot')\n", 973 | "cortex.webshow(corrvol, port=8889, open_browser=False)\n" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": null, 979 | "metadata": {}, 980 | "outputs": [], 981 | "source": [ 982 | "# View 3D model\n", 983 | "# You will need to change where it says SERVERIP below to the IP you are connected to\n", 984 | "from IPython.display import HTML\n", 985 | "HTML(\"<a href='http://SERVERIP:8889' target='_blank'>Click here for viewer</a>\")" 986 | ] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "metadata": {}, 991 | "source": [ 992 | "### Simpler view of the correlations\n", 993 | "pyCortex also offers a simpler way to view the correlations. This method only shows the flat view, but can be embedded right here in the IPython notebook. This should look like the flat view in the 3D viewer." 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": null, 999 | "metadata": {}, 1000 | "outputs": [], 1001 | "source": [ 1002 | "# Plot correlation flatmap\n", 1003 | "cortex.quickshow(corrvol, with_rois=False, with_labels=False);" 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "markdown", 1008 | "metadata": {}, 1009 | "source": [ 1010 | "## What semantic features are the voxels responding to?\n", 1011 | "Now that we have a working model, let's try to figure out what semantic features are making each voxel respond.
One way to do this is to simulate how the voxel will respond to individual words, and then find the most preferred words for that voxel.\n", 1012 | "\n", 1013 | "But first we have an issue to contend with: we have separate weights for each delay. We could look at the weights for each delay, but instead here you will average the weights across delays to get a single set of weights for the voxel." 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": null, 1019 | "metadata": {}, 1020 | "outputs": [], 1021 | "source": [ 1022 | "# Undelay voxel weights (average across delays)\n", 1023 | "import operator\n", 1024 | "from functools import reduce\n", 1025 | "udwt = reduce(operator.add, np.split(wt/ndelays, ndelays))" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": null, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [ 1034 | "udwt.shape" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "Next you will pick which voxel to visualize. Since many voxels are modeled poorly, we will pick from among the best modeled voxels." 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": null, 1047 | "metadata": {}, 1048 | "outputs": [], 1049 | "source": [ 1050 | "# Sort voxels by correlation so that we can pick a good voxel\n", 1051 | "# This will sort voxels in decreasing order of correlation\n", 1052 | "corrsort = np.argsort(corr)[::-1]" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "execution_count": null, 1058 | "metadata": {}, 1059 | "outputs": [], 1060 | "source": [ 1061 | "# Define function that will print best words for a voxel\n", 1062 | "import pprint\n", 1063 | "\n", 1064 | "def print_voxel_words(voxnum):\n", 1065 | " # find_words_like_vec returns 10 words most correlated with the given vector, and the correlations\n", 1066 | " voxwords = eng1000.find_words_like_vec(udwt[:,voxnum])\n", 1067 | " print (\"Best words for voxel %d (correlation %0.3f):\" % (voxnum, voxcorrs[voxnum]))\n", 1068 | " pprint.pprint(voxwords)" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "code", 1073 | "execution_count": null, 1074 | "metadata": {}, 1075 | "outputs": [], 1076 | "source": [ 1077 | "# Print best words for some voxels\n", 1078 | "print_voxel_words(corrsort[0]) # best voxel\n", 1079 | "print_voxel_words(corrsort[14]) # 15th best voxel" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "markdown", 1084 | "metadata": {}, 1085 | "source": [ 1086 | "### That's it!\n", 1087 | "That's the semantic model! Since you made it this far, well done. If you're interested, you can go back and try changing some of the parameters and see how it affects the model. One easy parameter to change is the number of delays (or the delays themselves). Try using just one delay. Or try using 10 delays (that might be slow). You could also try pruning off some of the semantic features. How does the model work if you only use the first 100 semantic features?\n", 1088 | "\n", 1089 | "Alternatively, you can try using a different type of feature to model the fMRI responses: phonemes. Below are some blocks of code that will create stimulus vectors representing the number of times each different phoneme is spoken. 
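For the feature-pruning experiment suggested above, a minimal sketch (the name `downsampled_semanticseqs` is hypothetical; substitute whatever dictionary the earlier downsampling cell produced):

```python
# Keep only the first 100 semantic features, then trim, z-score, and delay
# the result exactly as in the original stimulus-preparation cells.
pruned = dict((story, downsampled_semanticseqs[story][:, :100]) for story in allstories)
```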
The phoneme model will predict some voxels much better than the semantic model, and some voxels worse.\n" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "## Optional: phoneme model\n", 1097 | "Another type of information that the brain extracts from speech is the phonemic content. The following blocks of code will extract phonemic features from the stimuli. Run this, and then go back and run the \"Regression Model\" block, above. You can compare the correlations of the semantic model and phoneme model to see which works best in each voxel.\n", 1098 | "\n", 1099 | "You can also visualize the phoneme stimuli and model as you go, building on the code blocks used above." 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "code", 1104 | "execution_count": null, 1105 | "metadata": {}, 1106 | "outputs": [], 1107 | "source": [ 1108 | "# Create phoneme histogram DataSequences\n", 1109 | "from dsutils import histogram_phonemes2, phonemes\n", 1110 | "phonemehistseqs = dict() # dictionary to hold phoneme histograms {story name : DataSequence}\n", 1111 | "for story in allstories:\n", 1112 | " phonemehistseqs[story] = histogram_phonemes2(phonseqs[story])" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": null, 1118 | "metadata": {}, 1119 | "outputs": [], 1120 | "source": [ 1121 | "# Phonemes were labeled using the ARPABET. The labeled phonemes are listed here.\n", 1122 | "print (phonemes)" 1123 | ] 1124 | }, 1125 | { 1126 | "cell_type": "code", 1127 | "execution_count": null, 1128 | "metadata": {}, 1129 | "outputs": [], 1130 | "source": [ 1131 | "# Downsample phoneme histograms\n", 1132 | "interptype = \"lanczos\"\n", 1133 | "window = 3\n", 1134 | "\n", 1135 | "downsampled_phonemehistseqs = dict()\n", 1136 | "for story in allstories:\n", 1137 | " downsampled_phonemehistseqs[story] = phonemehistseqs[story].chunksums(interptype, window=window)\n" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": null, 1143 | "metadata": {}, 1144 | "outputs": [], 1145 | "source": [ 1146 | "# Combine phoneme stimuli\n", 1147 | "trim = 5\n", 1148 | "phRstim = np.vstack([np.nan_to_num(zscore(downsampled_phonemehistseqs[story][5+trim:-trim])) for story in Rstories])\n", 1149 | "phPstim = np.vstack([np.nan_to_num(zscore(downsampled_phonemehistseqs[story][5+trim:-trim])) for story in Pstories])\n" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": null, 1155 | "metadata": {}, 1156 | "outputs": [], 1157 | "source": [ 1158 | "# Delay stimuli\n", 1159 | "ndelays = 4\n", 1160 | "delays = range(1, ndelays+1)\n", 1161 | "\n", 1162 | "delRstim = make_delayed(phRstim, delays)\n", 1163 | "delPstim = make_delayed(phPstim, delays)" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": null, 1169 | "metadata": {}, 1170 | "outputs": [], 1171 | "source": [ 1172 | "# Now go back to \"Regression Model\" and run that to fit a phoneme-based model!" 
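To compare the two models voxel by voxel after refitting, a sketch (`semcorrs` and `phoncorrs` are hypothetical names; `corr` is the array returned by `bootstrap_ridge` on each run):

```python
semcorrs = corr.copy()  # run this BEFORE refitting with the phoneme features
# ... rerun the "Regression Model" section using delRstim/delPstim ...
phoncorrs = corr        # corr now holds the phoneme model's correlations
print("Semantic better: %d voxels; phoneme better: %d voxels"
      % ((semcorrs > phoncorrs).sum(), (phoncorrs > semcorrs).sum()))
```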
1173 | ]
1174 | },
1175 | {
1176 | "cell_type": "markdown",
1177 | "metadata": {},
1178 | "source": [
1179 | "### "
1180 | ]
1181 | }
1182 | ],
1183 | "metadata": {
1184 | "kernelspec": {
1185 | "display_name": "Python 3",
1186 | "language": "python",
1187 | "name": "python3"
1188 | },
1189 | "language_info": {
1190 | "codemirror_mode": {
1191 | "name": "ipython",
1192 | "version": 3
1193 | },
1194 | "file_extension": ".py",
1195 | "mimetype": "text/x-python",
1196 | "name": "python",
1197 | "nbconvert_exporter": "python",
1198 | "pygments_lexer": "ipython3",
1199 | "version": "3.6.5"
1200 | }
1201 | },
1202 | "nbformat": 4,
1203 | "nbformat_minor": 1
1204 | }
1205 | 
--------------------------------------------------------------------------------
/dsutils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import itertools as itools
3 | from DataSequence import DataSequence
4 | 
5 | DEFAULT_BAD_WORDS = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])
6 | 
7 | def make_word_ds(grids, trfiles, bad_words=DEFAULT_BAD_WORDS):
8 |     """Creates DataSequence objects containing the words from each grid, with any words appearing
9 |     in the [bad_words] set removed.
10 |     """
11 |     ds = dict()
12 |     stories = grids.keys()
13 |     for st in stories:
14 |         grtranscript = grids[st].tiers[1].make_simple_transcript()
15 |         ## Filter out bad words
16 |         goodtranscript = [x for x in grtranscript
17 |                           if x[2].lower().strip("{}").strip() not in bad_words]
18 |         d = DataSequence.from_grid(goodtranscript, trfiles[st][0])
19 |         ds[st] = d
20 | 
21 |     return ds
22 | 
23 | def make_phoneme_ds(grids, trfiles):
24 |     """Creates DataSequence objects containing the phonemes from each grid.
25 |     """
26 |     ds = dict()
27 |     stories = grids.keys()
28 |     for st in stories:
29 |         grtranscript = grids[st].tiers[0].make_simple_transcript()
30 |         d = DataSequence.from_grid(grtranscript, trfiles[st][0])
31 |         ds[st] = d
32 | 
33 |     return ds
34 | 
35 | phonemes = ['AA', 'AE','AH','AO','AW','AY','B','CH','D',
36 |             'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH',
37 |             'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
38 |             'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']
39 | 
40 | def make_character_ds(grids, trfiles):
41 |     ds = dict()
42 |     stories = grids.keys()
43 |     for st in stories:
44 |         grtranscript = grids[st].tiers[2].make_simple_transcript()
45 |         fixed_grtranscript = [(s,e,list(map(int, c.split(",")))) for s,e,c in grtranscript if c]
46 |         d = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
47 |         ds[st] = d
48 |     return ds
49 | 
50 | def make_dialogue_ds(grids, trfiles):
51 |     ds = dict()
52 |     for st, gr in grids.items():
53 |         grtranscript = gr.tiers[3].make_simple_transcript()
54 |         fixed_grtranscript = [(s,e,c) for s,e,c in grtranscript if c]
55 |         ds[st] = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
56 |     return ds
57 | 
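## Usage sketch for the builders above (assumes [grids] and [trfiles] are dicts
## keyed by story name, e.g. grids loaded via stimulus_utils.load_grids_for_stories;
## "story1" is a hypothetical key):
##   wordseqs = make_word_ds(grids, trfiles)     # one DataSequence of words per story
##   phonseqs = make_phoneme_ds(grids, trfiles)  # one DataSequence of phonemes per story
##   wordseqs["story1"].chunks()                 # words grouped into TR-sized chunks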
60 | """ 61 | olddata = ds.data 62 | N = len(ds.data) 63 | newdata = np.zeros((N, len(phonemeset))) 64 | phind = dict(enumerate(phonemeset)) 65 | for ii,ph in enumerate(olddata): 66 | try: 67 | #ind = phonemeset.index(ph.upper().strip("0123456789")) 68 | ind = phind[ph.upper().strip("0123456789")] 69 | newdata[ii][ind] = 1 70 | except Exception as e: 71 | pass 72 | 73 | return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times) 74 | 75 | def histogram_phonemes2(ds, phonemeset=phonemes): 76 | """Histograms the phonemes in the DataSequence [ds]. 77 | """ 78 | olddata = np.array([ph.upper().strip("0123456789") for ph in ds.data]) 79 | newdata = np.vstack([olddata==ph for ph in phonemeset]).T 80 | return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times) 81 | 82 | def make_semantic_model(ds, lsasm): 83 | newdata = [] 84 | for w in ds.data: 85 | try: 86 | v = lsasm[w] 87 | except KeyError as e: 88 | v = np.zeros((lsasm.data.shape[0],)) 89 | newdata.append(v) 90 | return DataSequence(np.array(newdata), ds.split_inds, ds.data_times, ds.tr_times) 91 | 92 | def make_character_model(dss): 93 | """Make character indicator model for a dict of datasequences. 94 | """ 95 | stories = dss.keys() 96 | storychars = dict([(st,np.unique(np.hstack(ds.data))) for st,ds in dss.iteritems()]) 97 | total_chars = sum(map(len, storychars.values())) 98 | char_inds = dict() 99 | ncharsdone = 0 100 | for st in stories: 101 | char_inds[st] = dict(zip(storychars[st], range(ncharsdone, ncharsdone+len(storychars[st])))) 102 | ncharsdone += len(storychars[st]) 103 | 104 | charmodels = dict() 105 | for st,ds in dss.iteritems(): 106 | charmat = np.zeros((len(ds.data), total_chars)) 107 | for ti,charlist in enumerate(ds.data): 108 | for char in charlist: 109 | charmat[ti, char_inds[st][char]] = 1 110 | charmodels[st] = DataSequence(charmat, ds.split_inds, ds.data_times, ds.tr_times) 111 | 112 | return charmodels, char_inds 113 | 114 | def make_dialogue_model(ds): 115 | return DataSequence(np.ones((len(ds.data),1)), ds.split_inds, ds.data_times, ds.tr_times) 116 | 117 | def modulate(ds, vec): 118 | """Multiplies each row (each word/phoneme) by the corresponding value in [vec]. 
119 | """ 120 | return DataSequence((ds.data.T*vec).T, ds.split_inds, ds.data_times, ds.tr_times) 121 | 122 | def catmats(*seqs): 123 | keys = seqs[0].keys() 124 | return dict([(k, DataSequence(np.hstack([s[k].data for s in seqs]), seqs[0][k].split_inds)) for k in keys]) 125 | -------------------------------------------------------------------------------- /english1000.py: -------------------------------------------------------------------------------- 1 | english1000words = sorted(set([w.lower() for w in """a, about, above, across, act, active, activity, add, afraid, after, again, age, ago, agree, air, all, alone, along, already, always, am, amount, an, and, angry, another, answer, any, anyone, anything, appear, apple, are, area, arm, army, around, arrive, art, as, ask, at, attack, aunt, autumn, away, baby, base, back, bad, bag, ball, bank, basket, bath, be, bear, beautiful, beer, bed, bedroom, behave, before, begin, behind, bell, below, besides, best, better, between, big, bird, birth, birthday, bit, bite, black, block, blood, blow, blue, board, boat, body, boil, bone, book, border, born, borrow, both, bottle, bottom, bowl, box, boy, branch, brave, bread, break, breakfast, breathe, bridge, bright, bring, brother, brown, brush, build, burn, business, bus, busy, but, buy, by, cake, call, can, candle, cap, car, card, care, careful, careless, carry, case, cat, catch, central, century, certain, chair, chance, change, chase, cheap, cheese, chicken, child, children, chocolate, choice, choose, circle, city, class, clever, clean, clear, climb, clock, cloth, clothes, cloud, cloudy, close, coffee, coat, coin, cold, collect, colour, comb, comfortable, common, compare, come, complete, computer, condition, continue, control, cook, cool, copper, corn, corner, correct, cost, contain, count, country, course, cover, crash, cross, cry, cup, cupboard, cut, dance, dangerous, dark, daughter, day, dead, decide, deep, deer, depend, desk, destroy, develop, die, different, difficult, dinner, direction, dirty, discover, dish, do, dog, door, double, down, draw, dream, dress, drink, drive, drop, dry, duck, dust, duty, each, ear, early, earn, earth, east, easy, eat, education, effect, egg, eight, either, electric, elephant, else, empty, end, enemy, enjoy, enough, enter, equal, entrance, escape, even, evening, event, ever, every, everyone, exact, everybody, examination, example, except, excited, exercise, expect, expensive, explain, extremely, eye, face, fact, fail, fall, false, family, famous, far, farm, father, fast, fat, fault, fear, feed, feel, female, fever, few, fight, fill, film, find, fine, finger, finish, fire, first, fit, five, fix, flag, flat, float, floor, flower, fly, fold, food, fool, foot, football, for, force, foreign, forest, forget, forgive, fork, form, fox, four, free, freedom, freeze, fresh, friend, friendly, from, front, fruit, full, fun, funny, furniture, further, future, game, garden, gate, general, gentleman, get, gift, give, glad, glass, go, goat, god, gold, good, goodbye, grandfather, grandmother, grass, grave, great, green, grey, ground, group, grow, gun, hair, half, hall, hammer, hand, happen, happy, hard, hat, hate, have, he, head, healthy, hear, heavy, hello, help, heart, heaven, height, help, her, here, hers, hide, high, hill, him, his, hit, hobby, hold, hole, holiday, home, hope, horse, hospital, hot, hotel, house, how, hundred, hungry, hour, hurry, husband, hurt, I, ice, idea, if, important, in, increase, inside, into, introduce, iron, invite, is, island, it, its, jelly, job, join, 
juice, jump, just, keep, key, kill, kind, king, kitchen, knee, knife, knock, know, ladder, lady, lamp, land, large, last, late, lately, laugh, lazy, lead, leaf, learn, leave, leg, left, lend, length, less, lesson, let, letter, library, lie, life, light, like, lion, lip, list, listen, little, live, lock, lonely, long, look, lose, lot, love, low, lower, luck, machine, main, make, male, man, many, map, mark, market, marry, matter, may, me, meal, mean, measure, meat, medicine, meet, member, mention, method, middle, milk, million, mind, minute, miss, mistake, mix, model, modern, moment, money, monkey, month, moon, more, morning, most, mother, mountain, mouth, move, much, music, must, my, name, narrow, nation, nature, near, nearly, neck, need, needle, neighbour, neither, net, never, new, news, newspaper, next, nice, night, nine, no, noble, noise, none, nor, north, nose, not, nothing, notice, now, number, obey, object, ocean, of, off, offer, office, often, oil, old, on, one, only, open, opposite, or, orange, order, other, our, out, outside, over, own, page, pain, paint, pair, pan, paper, parent, park, part, partner, party, pass, past, path, pay, peace, pen, pencil, people, per, perfect, period, person, photograph, piano, pick, picture, piece, pig, pin, pink, place, plane, plant, plastic, plate, play, please, pleased, plenty, pocket, point, poison, police, polite, pool, poor, popular, position, possible, potato, pour, power, present, press, pretty, prevent, price, prince, prison, private, prize, probably, problem, produce, promise, proper, protect, provide, public, pull, punish, pupil, push, put, queen, question, quick, quiet, quite, radio, rain, raise, reach, read, ready, real, really, receive, record, red, remember, remind, remove, rent, repair, repeat, reply, report, rest, restaurant, result, return, rice, rich, ride, right, ring, rise, road, rob, rock, room, round, rubber, rude, rule, ruler, run, rush, sad, safe, sail, salt, same, sand, save, say, school, science, search, seat, second, see, seem, sell, send, sentence, serve, seven, several, sex, shade, shadow, shake, shape, share, sharp, she, sheep, sheet, shelf, shine, ship, shirt, shoe, shoot, shop, short, should, shoulder, shout, show, sick, side, signal, silence, silly, silver, similar, simple, single, since, sing, sink, sister, sit, six, size, skill, skin, skirt, sky, sleep, slip, slow, smoke, small, smell, smile, smoke, snow, so, soap, sock, soft, some, someone, something, sometimes, son, soon, sorry, sound, soup, south, space, speak, special, speed, spell, spend, spoon, sport, spread, spring, square, stamp, stand, star, start, station, stay, steal, steam, step, still, stomach, stone, stop, store, storm, story, strange, street, strong, structure, student, study, stupid, subject, substance, successful, such, sudden, sugar, suitable, summer, sun, sunny, support, sure, surprise, sweet, swim, sword, table, take, talk, tall, taste, taxi, tea, teach, team, tear, telephone, television, tell, ten, tennis, terrible, test, than, that, the, their, then, there, therefore, these, thick, thin, thing, think, third, this, though, threat, three, tidy, tie, title, to, today, toe, together, tomorrow, tonight, too, tool, tooth, top, total, touch, town, train, travel, tree, trouble, true, trust, two, twice, try, turn, type, uncle, under, understand, unit, until, up, use, useful, usual, usually, vegetable, very, village, voice, visit, wait, wake, walk, want, warm, wash, waste, watch, water, way, we, weak, wear, weather, wedding, week, weight, welcome, well, 
west, wet, what, wheel, when, where, which, while, white, who, why, wide, wife, wild, will, win, wind, window, wine, winter, wire, wise, wish, with, without, woman, wonder, word, work, world, worry, worst, write, wrong, year, yes, yesterday, yet, you, young, your, zero""".split(", ")])) -------------------------------------------------------------------------------- /features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cPickle 3 | import tables 4 | import os 5 | 6 | from text.story.util.dsutils import make_word_ds, make_phoneme_ds, histogram_phonemes, cstates_to_bigrams, DataSequence, makelsa, catmats, histogram_phonemes2, sliding_chunk_sum, modulate 7 | 8 | from text.models.semtax.Semtax import Semtax 9 | from text.movie.util.SemanticModel import SemanticModel 10 | 11 | 12 | mapdict = lambda d, fun: dict(zip(d.keys(), map(fun, d.values()))) 13 | 14 | class Features(object): 15 | def __init__(self, grids, trfiles, interp="rect", **kwargs): 16 | """Initializes a Features object that can be used to create feature-space 17 | representations of the stimulus with the given [grids] and [trfiles]. 18 | 19 | [interp] can be "rect" or "sinc". 20 | [kwargs] are passed to the interpolation function. 21 | """ 22 | self.grids = grids 23 | self.trfiles = trfiles 24 | 25 | self.interp = interp 26 | self.interpargs = kwargs 27 | 28 | ## Precache word sequences and phoneme sequences for later use 29 | self.wordseqs = make_word_ds(grids, trfiles) 30 | self.phonseqs = make_phoneme_ds(grids, trfiles) 31 | 32 | def downsample(self, dsdict): 33 | """Downsamples each DataSequence in [dsdict] using the settings specified in the 34 | initializer. 35 | """ 36 | return mapdict(dsdict, lambda h: h.chunksums(self.interp, 37 | **self.interpargs)) 38 | 39 | def perstory(self): 40 | """Simple model: a separate intercept regressor for each story. 41 | """ 42 | nstories = len(self.grids) 43 | storymats = dict() 44 | for ii,st in enumerate(sorted(self.grids.keys())): 45 | smat = np.zeros((len(self.wordseqs[st].tr_times), nstories)) 46 | smat[:,ii] = 1 47 | storymats[st] = smat 48 | return storymats 49 | 50 | def numwords(self): 51 | """Simple model: the number of words per TR. 52 | """ 53 | return mapdict(self.wordseqs, lambda s: np.atleast_2d(map(len, s.chunks())).T.astype(float)) 54 | 55 | def numphonemes(self): 56 | """Simple model: the number of phonemes per TR. 57 | """ 58 | return mapdict(self.phonseqs, lambda s: np.atleast_2d(map(len, s.chunks())).T.astype(float)) 59 | 60 | def phonemecounts(self, debug=False): 61 | """Number of times each phoneme appears per TR. 62 | """ 63 | phonhists = mapdict(self.phonseqs, histogram_phonemes2) 64 | if debug: 65 | return phonhists 66 | return self.downsample(phonhists) 67 | #return mapdict(phonhists, lambda h: h.chunksums()) 68 | 69 | def markov(self, log=False, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100", num=10, nC=20, debug=False): 70 | """Markov syntactic model. The [modeldir] and [num] will be passed to the 71 | function Semtax.load_from_dir. 72 | This function assumes that the first [nC] features are syntactic. 
73 | """ 74 | stmodel = Semtax.load_from_dir(modeldir, num) 75 | stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug 76 | stmodel.zphi[0] = stmodel.zphi[-1] 77 | 78 | sm = stmodel.to_SemanticModel(True) 79 | sm.data = sm.data[:nC] ## Limit to only syntactic part 80 | 81 | makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC], 82 | ds.split_inds, 83 | ds.data_times, 84 | ds.tr_times) 85 | rstimseqs = mapdict(self.wordseqs, makecs) 86 | if log: 87 | rstimseqs = mapdict(rstimseqs, lambda ds: DataSequence(np.log(ds.data+1e-10), 88 | ds.split_inds, 89 | ds.data_times, 90 | ds.tr_times)) 91 | 92 | if debug: 93 | return rstimseqs 94 | #return mapdict(rstimseqs, lambda s: s.chunksums()) 95 | return self.downsample(rstimseqs) 96 | 97 | def markov_bigrams(self, log=False, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100", 98 | num=10, nC=20): 99 | """Markov bigram syntactic model. The [modeldir] and [num] will be passed to the 100 | function Semtax.load_from_dir. 101 | This function assumes that the first [nC] features are syntactic. 102 | """ 103 | stmodel = Semtax.load_from_dir(modeldir, num) 104 | stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug 105 | stmodel.zphi[0] = stmodel.zphi[-1] 106 | 107 | sm = stmodel.to_SemanticModel(True) 108 | sm.data = sm.data[:nC] ## Limit to only syntactic part 109 | 110 | makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC], 111 | ds.split_inds, 112 | ds.data_times, 113 | ds.tr_times) 114 | rstimseqs = mapdict(self.wordseqs, makecs) 115 | bigramseqs = mapdict(rstimseqs, cstates_to_bigrams) 116 | if log: 117 | bigramseqs = mapdict(bigramseqs, lambda ds: DataSequence(np.log(ds.data+1e-10), 118 | ds.split_inds, 119 | ds.data_times, 120 | ds.tr_times)) 121 | 122 | #return mapdict(bigramseqs, lambda s: s.chunksums()) 123 | return self.downsample(bigramseqs) 124 | 125 | def markov_bigram_ics(self, modeldir="/auto/k8/huth/storydata/stories-semtax-wbooks-3+20_100", 126 | num=10, nC=20, icfile="/auto/k8/huth/storydata/transmat-ics-150-wbooks-2.hf5"): 127 | """Markov bigram IC syntactic model. The [modeldir] and [num] will be passed to the 128 | function Semtax.load_from_dir. 129 | This function assumes that the first [nC] features are syntactic. 130 | """ 131 | stmodel = Semtax.load_from_dir(modeldir, num) 132 | stmodel.cphi[0] = stmodel.cphi[-1] ## Fix vocab * bug 133 | stmodel.zphi[0] = stmodel.zphi[-1] 134 | 135 | sm = stmodel.to_SemanticModel(True) 136 | sm.data = sm.data[:nC] ## Limit to only syntactic part 137 | 138 | makecs = lambda ds: DataSequence(stmodel.infer_word_cstates(ds.data)[:,:nC], 139 | ds.split_inds, 140 | ds.data_times, 141 | ds.tr_times) 142 | rstimseqs = mapdict(self.wordseqs, makecs) 143 | bigramseqs = mapdict(rstimseqs, cstates_to_bigrams) 144 | logbigramseqs = mapdict(bigramseqs, lambda ds: DataSequence(np.log(ds.data+1e-10), 145 | ds.split_inds, 146 | ds.data_times, 147 | ds.tr_times)) 148 | 149 | bgics = tables.openFile(icfile).root.ics.read() 150 | projics = lambda ds: DataSequence(np.dot(bgics, sliding_chunk_sum(ds.data, 7).T).T, 151 | ds.split_inds, 152 | ds.data_times, 153 | ds.tr_times) 154 | bgicseqs = mapdict(logbigramseqs, projics) 155 | 156 | #return mapdict(bgicseqs, lambda s: s.chunksums()) 157 | return self.downsample(bgicseqs) 158 | 159 | def lsa(self, ndim, rectify, zsaxes=(1,), basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2", debug=False): 160 | """LSA semantic model. 
161 | """ 162 | vocab = cPickle.load(open(basepath+"-vocab")) 163 | lsasm = SemanticModel(None, None) 164 | lsasm.load_ascii_root(basepath+"-Vt", vocab) 165 | lsasm.data = lsasm.data[:ndim] 166 | 167 | for axis in zsaxes: 168 | lsasm.zscore(axis) 169 | 170 | if rectify: 171 | lsasm.rectify() 172 | 173 | lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm)) 174 | #return mapdict(lsastimseqs, lambda s: s.chunksums()) 175 | if debug: 176 | return lsastimseqs 177 | return self.downsample(lsastimseqs) 178 | 179 | @staticmethod 180 | def get_newlsa_model(ndim, rectify, entweight, entcutoff=5, basepath="/auto/k6/huth/lsamats6/"): 181 | """Returns a new LSA semantic model. 182 | """ 183 | entropyfilename = os.path.join(basepath, "globnorm_lsa1_1.npy") 184 | modelfilename = os.path.join(basepath, "evd1.hf5") 185 | 186 | entropy = np.load(entropyfilename) 187 | lsafile = tables.openFile(modelfilename) 188 | 189 | Q = lsafile.root.Q.read() 190 | vocab = lsafile.root.vocab.read() 191 | 192 | if entweight: 193 | lsasm = SemanticModel(Q[:,-ndim:].T * (np.clip(entropy, entcutoff, np.inf)**-1), vocab) 194 | else: 195 | lsasm = SemanticModel(Q[:,-ndim:].T, vocab) 196 | 197 | if rectify: 198 | lsasm.rectify() 199 | 200 | ## Store entropies in there as well 201 | lsasm.wordentropy = entropy 202 | 203 | lsafile.close() 204 | 205 | return lsasm 206 | 207 | def newlsa(self, ndim, rectify, entweight, entcutoff=5, basepath="/auto/k6/huth/lsamats6/", debug=False): 208 | """New LSA semantic model. 209 | """ 210 | lsasm = self.get_newlsa_model(ndim, rectify, entweight, entcutoff, basepath) 211 | lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm)) 212 | 213 | if debug: 214 | return lsastimseqs 215 | return self.downsample(lsastimseqs) 216 | 217 | def hal(self, wordset="verbs", zsaxes=(0,1), rectify=False, 218 | basepath="/auto/k8/huth/storydata/story+books+wiki+15w-densehal-mat", debug=False): 219 | """HAL semantic model (without dimensionality reduction). 220 | """ 221 | from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000 222 | haltf = tables.openFile(basepath+".hf5") 223 | halmat = np.array(haltf.root.halmat.read()) 224 | halvocab = cPickle.load(open(basepath+"-vocab")) 225 | 226 | ## Choose a wordset 227 | if wordset=="verbs": 228 | wordset = verb_set 229 | elif wordset=="cmuverbs": 230 | wordset = verb_set[:23] 231 | elif wordset=="english1000": 232 | wordset = english1000 233 | 234 | halsm = make_hal_sm(halmat, halvocab, wordset) 235 | 236 | for axis in zsaxes: 237 | halsm.zscore(axis) 238 | 239 | if rectify: 240 | halsm.rectify() 241 | 242 | halstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, halsm)) 243 | #return mapdict(halstimseqs, lambda s: s.chunksums()) 244 | if debug: 245 | return halstimseqs 246 | return self.downsample(halstimseqs) 247 | 248 | @staticmethod 249 | def get_co_model(wordset="english1000", zsaxes=(0,1), rectify=False, 250 | basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"): 251 | """Co-occurence-based semantic model (without dimensionality reduction). 
252 | """ 253 | from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000 254 | cotf = tables.openFile(basepath+".hf5") 255 | comat = np.array(cotf.root.mat.read()) 256 | covocab = cPickle.load(open(basepath+"-vocab")) 257 | 258 | ## Choose a wordset 259 | if wordset=="verbs": 260 | wordset = verb_set 261 | elif wordset=="cmuverbs": 262 | wordset = verb_set[:23] 263 | elif wordset=="english1000": 264 | wordset = english1000 265 | elif wordset=="story": 266 | wordset = [[w] for w in cPickle.load(open("/auto/k1/huth/text/story/storyvocab_2013.pickle"))] 267 | 268 | cosm = make_hal_sm(comat, covocab, wordset) 269 | 270 | for axis in zsaxes: 271 | cosm.zscore(axis) 272 | 273 | if rectify: 274 | cosm.rectify() 275 | 276 | return cosm 277 | 278 | 279 | def co(self, wordset="english1000", zsaxes=(0,1), rectify=False, 280 | basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat", debug=False): 281 | """Co-occurence-based semantic model (without dimensionality reduction). 282 | """ 283 | cosm = self.get_co_model(wordset, zsaxes, rectify, basepath) 284 | costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, cosm)) 285 | #return mapdict(halstimseqs, lambda s: s.chunksums()) 286 | if debug: 287 | return costimseqs 288 | return self.downsample(costimseqs) 289 | 290 | @staticmethod 291 | def get_orthogonal_co_model(wordset="english1000", zsaxes=(0,1), rectify=False, 292 | basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat", 293 | debug=False): 294 | """Co-occurence-based semantic model with pre-whitening. 295 | """ 296 | cosm = Features.get_co_model(wordset, zsaxes, rectify, basepath) 297 | ## Orthogonalize cosm data 298 | from text.movie.util.util import make_delayed, save_table_file, eigprincomp 299 | coc, col = eigprincomp(cosm.data.T) 300 | ## Flip so that first value on each component is positive (makes result deterministic) 301 | fcoc = (coc.T * np.sign(coc[:,0])).T 302 | ## Make new orthogonal cosm 303 | ocosm = cosm.copy() 304 | ocosm.data = np.dot(fcoc, cosm.data) 305 | return ocosm 306 | 307 | def orthogonal_co(self, wordset="english1000", zsaxes=(0,1), rectify=False, 308 | basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat", 309 | debug=False): 310 | """Co-occurence-based semantic model with pre-whitening. 311 | """ 312 | ocosm = self.get_orthogonal_co_model(wordset, zsaxes, rectify, basepath) 313 | costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, ocosm)) 314 | #return mapdict(halstimseqs, lambda s: s.chunksums()) 315 | if debug: 316 | return costimseqs 317 | return self.downsample(costimseqs) 318 | 319 | def commonwords(self, num=100, basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2-vocab"): 320 | """Common word indicator model. Based on old LSA model fitting, used less data. 321 | """ 322 | vocab = cPickle.load(open(basepath)) 323 | counts = cPickle.load(open(basepath+"-Rcounts")) 324 | selwords = np.argsort(counts)[-num:] 325 | wmodel = SemanticModel(np.eye(num), list(np.array(vocab)[selwords])) 326 | 327 | wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel)) 328 | #return mapdict(wordstimseqs, lambda s: s.chunksums()) 329 | return self.downsample(wordstimseqs) 330 | 331 | def commonwords2(self, num=100, basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"): 332 | """Common word indicator model. Base on newer co model fitting, using more data. 
333 | """ 334 | cotf = tables.openFile(basepath+".hf5") 335 | counts = cotf.root.wordcounts.read() 336 | covocab = cPickle.load(open(basepath+"-vocab")) 337 | selwords = np.argsort(counts)[-num:] 338 | wmodel = SemanticModel(np.eye(num), list(np.array(covocab)[selwords])) 339 | 340 | wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel)) 341 | return self.downsample(wordstimseqs) 342 | 343 | def allwords(self): 344 | """All word indicator model. 345 | """ 346 | from text.textcore import Corpus 347 | corpus_file = "/auto/k5/huth/corpora/story/raw-transcripts/stories1.tar.gz" 348 | corpus = Corpus(corpus_file, split_documents=200) 349 | corpus_file1 = "/auto/k5/huth/corpora/story/raw-transcripts/stories2.tar.gz" 350 | corpus.append_corpus(corpus_file1) 351 | 352 | storyvocab = sorted(list(set(corpus.get_vocabulary()))) 353 | num = len(storyvocab) 354 | wmodel = SemanticModel(np.eye(num), list(np.array(storyvocab))) 355 | 356 | wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel)) 357 | #return mapdict(wordstimseqs, lambda s: s.chunksums()) 358 | return self.downsample(wordstimseqs) 359 | 360 | def nmflsa(self): 361 | """NMF LSA model based on newLSA. 362 | """ 363 | tf = tables.openFile("/auto/k6/huth/nmf-lsa.hf5") 364 | vocab = tf.root.vocab.read() 365 | data = tf.root.data.read() 366 | nmodel = SemanticModel(data, vocab) 367 | wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, nmodel)) 368 | #return mapdict(wordstimseqs, lambda s: s.chunksums()) 369 | return self.downsample(wordstimseqs) 370 | 371 | def surprisal(self, template="/auto/k5/huth/story-surprisal/%s.npy", prob=False, debug=False): 372 | """Word surprisal model. 373 | """ 374 | ## Load surprisal for each story 375 | sseqs = dict() 376 | for story,wseq in self.wordseqs.iteritems(): 377 | surprisal = np.load(template%story) 378 | if prob: 379 | d = 1-np.atleast_2d(surprisal).T 380 | else: 381 | d = -np.log2(np.atleast_2d(surprisal).T) 382 | d[np.isinf(d)] = 100 383 | sseq = DataSequence(d, 384 | wseq.split_inds, 385 | wseq.data_times, 386 | wseq.tr_times) 387 | sseqs[story] = sseq 388 | 389 | if debug: 390 | return sseqs 391 | else: 392 | return self.downsample(sseqs) 393 | 394 | def sphal(self, halargs, spargs, debug=False): 395 | """HAL model modulated by surprisal. 
396 | """ 397 | halargs["debug"] = True 398 | halseqs = self.hal(**halargs) 399 | spargs["debug"] = True 400 | spargs["prob"] = True 401 | spseqs = self.surprisal(**spargs) 402 | 403 | modhal = dict([(st, modulate(ds, spseqs[st].data[:,0])) for (st,ds) in halseqs.items()]) 404 | 405 | if debug: 406 | return modhal 407 | return self.downsample(modhal) 408 | 409 | @classmethod 410 | def _get_word2vec_model(cls, modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin", 411 | norm=False): 412 | from gensim.models.word2vec import Word2Vec 413 | model = Word2Vec.load_word2vec_format(modelfile, binary=True) 414 | usevocab = set(cPickle.load(open("/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat-vocab"))) 415 | vocab, vocinds = zip(*[(w, model.vocab[w].index) for w in usevocab if w in model.vocab]) 416 | #w2v_usevocab = [(w,val.index) for w,val in w2v.vocab.items() if w in usevocab] 417 | #srtvocab = [w for w,voc in sorted(w2v.vocab.items(), key=lambda item:item[1].index)] 418 | #srtvocab,srtinds = zip(*sorted(w2v_usevocab, key=lambda item:item[1])) 419 | if norm: 420 | data = model.syn0norm[list(vocinds)] 421 | else: 422 | data = model.syn0[list(vocinds)] 423 | 424 | w2vsm = SemanticModel(data.T, vocab) 425 | return w2vsm 426 | 427 | @classmethod 428 | def get_word2vec_model(cls, *args, **kwargs): 429 | if "_w2v_cache" not in dir(cls): 430 | cls._w2v_cache = cls._get_word2vec_model(*args, **kwargs) 431 | return cls._w2v_cache 432 | 433 | def word2vec(self, modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin", norm=False): 434 | """GenSim / word2vec model. 435 | """ 436 | model = self.get_word2vec_model(modelfile, norm) 437 | #modeldims = model["test"].shape[0] 438 | #model.data = np.zeros((modeldims,)) 439 | w2vstims = mapdict(self.wordseqs, lambda ds: makelsa(ds, model)) 440 | return self.downsample(w2vstims) 441 | 442 | def emoratings(self, subjects=("ah", "ds", "jg", "wh", "ml"), smoothing=1.0): 443 | from text.story.emotions import util 444 | storyemolevels = util.load_story_ratings(subjects, self.grids) 445 | return util.story_interp_grids(subjects, self.grids, self.trfiles, 446 | storyemolevels, [smoothing]) 447 | -------------------------------------------------------------------------------- /interpdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import logging 3 | 4 | logger = logging.getLogger("text.regression.interpdata") 5 | 6 | def interpdata(data, oldtime, newtime): 7 | """Interpolates the columns of [data] to find the values at [newtime], given that the current 8 | values are at [oldtime]. [oldtime] must have the same number of elements as [data] has rows. 9 | """ 10 | ## Check input sizes ## 11 | if not len(oldtime) == data.shape[0]: 12 | raise IndexError("oldtime must have same number of elements as data has rows.") 13 | 14 | ## Set up matrix to hold output ## 15 | newdata = np.empty((len(newtime), data.shape[1])) 16 | 17 | ## Interpolate each column of data ## 18 | for ci in range(data.shape[1]): 19 | if (ci%100) == 0: 20 | logger.info("Interpolating column %d/%d.." % (ci+1, data.shape[1])) 21 | 22 | newdata[:,ci] = np.interp(newtime, oldtime, data[:,ci]) 23 | 24 | ## Return interpolated data ## 25 | return newdata 26 | 27 | def sincinterp1D(data, oldtime, newtime, cutoff_mult=1.0, window=1): 28 | """Interpolates the one-dimensional signal [data] at the times given by [newtime], assuming 29 | that each sample in [data] was collected at the corresponding time in [oldtime]. 
Clearly, 30 | [oldtime] and [data] must have the same length, but [newtime] can have any length. 31 | 32 | This function will assume that the time points in [newtime] are evenly spaced and will use 33 | that frequency multipled by [cutoff_mult] as the cutoff frequency of the sinc filter. 34 | 35 | The sinc function will be computed with [window] lobes. With [window]=1, this will 36 | effectively compute the Lanczos filter. 37 | 38 | This is a very simplistic filtering algorithm, so will take O(N*M) time, where N is the 39 | length of [oldtime] and M is the length of [newtime]. 40 | 41 | This filter is non-causal. 42 | """ 43 | ## Find the cutoff frequency ## 44 | cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult 45 | print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes." % (cutoff, window)) 46 | 47 | ## Construct new signal ## 48 | newdata = np.zeros((len(newtime),1)) 49 | for ndi in range(len(newtime)): 50 | for di in range(len(oldtime)): 51 | newdata[ndi] += sincfun(cutoff, newtime[ndi]-oldtime[di], window) * data[di] 52 | return newdata 53 | 54 | def sincinterp2D(data, oldtime, newtime, cutoff_mult=1.0, window=1, causal=False, renorm=True): 55 | """Interpolates the columns of [data], assuming that the i'th row of data corresponds to 56 | oldtime(i). A new matrix with the same number of columns and a number of rows given 57 | by the length of [newtime] is returned. If [causal], only past time points will be used 58 | to computed the present value, and future time points will be ignored. 59 | 60 | The time points in [newtime] are assumed to be evenly spaced, and their frequency will 61 | be used to calculate the low-pass cutoff of the sinc interpolation filter. 62 | 63 | [window] lobes of the sinc function will be used. [window] should be an integer. 64 | """ 65 | ## Find the cutoff frequency ## 66 | cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult 67 | print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes." % (cutoff, window)) 68 | 69 | ## Construct new signal ## 70 | # newdata = np.zeros((len(newtime), data.shape[1])) 71 | # for ndi in range(len(newtime)): 72 | # for di in range(len(oldtime)): 73 | # newdata[ndi,:] += sincfun(cutoff, newtime[ndi]-oldtime[di], window, causal) * data[di,:] 74 | 75 | ## Build up sinc matrix ## 76 | sincmat = np.zeros((len(newtime), len(oldtime))) 77 | for ndi in range(len(newtime)): 78 | sincmat[ndi,:] = sincfun(cutoff, newtime[ndi]-oldtime, window, causal, renorm) 79 | 80 | ## Construct new signal by multiplying the sinc matrix by the data ## 81 | newdata = np.dot(sincmat, data) 82 | 83 | return newdata 84 | 85 | def lanczosinterp2D(data, oldtime, newtime, window=3, cutoff_mult=1.0, rectify=False): 86 | """Interpolates the columns of [data], assuming that the i'th row of data corresponds to 87 | oldtime(i). A new matrix with the same number of columns and a number of rows given 88 | by the length of [newtime] is returned. 89 | 90 | The time points in [newtime] are assumed to be evenly spaced, and their frequency will 91 | be used to calculate the low-pass cutoff of the interpolation filter. 92 | 93 | [window] lobes of the sinc function will be used. [window] should be an integer. 94 | """ 95 | ## Find the cutoff frequency ## 96 | cutoff = 1/np.mean(np.diff(newtime)) * cutoff_mult 97 | print ("Doing lanczos interpolation with cutoff=%0.3f and %d lobes." 
% (cutoff, window)) 98 | 99 | ## Build up sinc matrix ## 100 | sincmat = np.zeros((len(newtime), len(oldtime))) 101 | for ndi in range(len(newtime)): 102 | sincmat[ndi,:] = lanczosfun(cutoff, newtime[ndi]-oldtime, window) 103 | 104 | if rectify: 105 | newdata = np.hstack([np.dot(sincmat, np.clip(data, -np.inf, 0)), 106 | np.dot(sincmat, np.clip(data, 0, np.inf))]) 107 | else: 108 | ## Construct new signal by multiplying the sinc matrix by the data ## 109 | newdata = np.dot(sincmat, data) 110 | 111 | return newdata 112 | 113 | def sincupinterp2D(data, oldtime, newtimes, cutoff, window=1): 114 | """Uses sinc interpolation to upsample the columns of [data], assuming that the i'th 115 | row of data comes from oldtime[i]. A new matrix with the same number of columns 116 | and a number of rows given by the length of [newtime] is returned. 117 | 118 | The times points in [oldtime] are assumed to be evenly spaced, and their frequency 119 | will be used to calculate the low-pass cutoff of the sinc interpolation filter. 120 | 121 | [window] lobes of the sinc function will be used. [window] should be an integer. 122 | Setting [window] to 1 yields a Lanczos filter. 123 | """ 124 | #cutoff = 1/np.mean(np.diff(oldtime)) 125 | print ("Doing sinc interpolation with cutoff=%0.3f and %d lobes."%(cutoff, window)) 126 | 127 | sincmat = np.zeros((len(newtimes), len(oldtime))) 128 | for ndi in range(len(newtimes)): 129 | sincmat[ndi,:] = sincfun(cutoff, newtimes[ndi]-oldtime, window, False) 130 | 131 | newdata = np.dot(sincmat, data) 132 | return newdata 133 | 134 | def sincfun(B, t, window=np.inf, causal=False, renorm=True): 135 | """Compute the sinc function with some cutoff frequency [B] at some time [t]. 136 | [t] can be a scalar or any shaped numpy array. 137 | If given a [window], only the lowest-order [window] lobes of the sinc function 138 | will be non-zero. 139 | If [causal], only past values (i.e. t<0) will have non-zero weights. 140 | """ 141 | val = 2*B*np.sin(2*np.pi*B*t)/(2*np.pi*B*t+1e-20) 142 | if t.shape: 143 | val[np.abs(t)>window/(2*B)] = 0 144 | if causal: 145 | val[t<0] = 0 146 | if not np.sum(val)==0.0 and renorm: 147 | val = val/np.sum(val) 148 | elif np.abs(t)>window/(2*B): 149 | val = 0 150 | if causal and t<0: 151 | val = 0 152 | return val 153 | 154 | def lanczosfun(cutoff, t, window=3): 155 | """Compute the lanczos function with some cutoff frequency [B] at some time [t]. 156 | [t] can be a scalar or any shaped numpy array. 157 | If given a [window], only the lowest-order [window] lobes of the sinc function 158 | will be non-zero. 159 | """ 160 | t = t * cutoff 161 | val = window * np.sin(np.pi*t) * np.sin(np.pi*t/window) / (np.pi**2 * t**2) 162 | val[t==0] = 1.0 163 | val[np.abs(t)>window] = 0.0 164 | return val# / (val.sum() + 1e-10) 165 | 166 | def expinterp2D(data, oldtime, newtime, theta): 167 | intmat = np.zeros((len(newtime), len(oldtime))) 168 | for ndi in range(len(newtime)): 169 | intmat[ndi,:] = expfun(theta, newtime[ndi]-oldtime) 170 | 171 | ## Construct new signal by multiplying the sinc matrix by the data ## 172 | newdata = np.dot(intmat, data) 173 | return newdata 174 | 175 | def expfun(theta, t): 176 | """Computes an exponential weighting function for interpolation. 
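    The weights are causal (zero for t < 0) and are normalized to sum to one when
    the sum is nonzero.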
177 | """ 178 | val = np.exp(-t*theta) 179 | val[t<0] = 0.0 180 | if not np.sum(val)==0.0: 181 | val = val/np.sum(val) 182 | return val 183 | 184 | def gabor_xfm(data, oldtimes, newtimes, freqs, sigma): 185 | sinvals = np.vstack([np.sin(oldtimes*f*2*np.pi) for f in freqs]) 186 | cosvals = np.vstack([np.cos(oldtimes*f*2*np.pi) for f in freqs]) 187 | outvals = np.zeros((len(newtimes), len(freqs)), dtype=np.complex128) 188 | for ti,t in enumerate(newtimes): 189 | ## Build gaussian function 190 | gaussvals = np.exp(-0.5*(oldtimes-t)**2/(2*sigma**2))*data 191 | ## Take product with sin/cos vals 192 | sprod = np.dot(sinvals, gaussvals) 193 | cprod = np.dot(cosvals, gaussvals) 194 | ## Store the output 195 | outvals[ti,:] = cprod + 1j*sprod 196 | 197 | return outvals 198 | 199 | def gabor_xfm2D(ddata, oldtimes, newtimes, freqs, sigma): 200 | return np.vstack([gabor_xfm(d, oldtimes, newtimes, freqs, sigma).T for d in ddata]) 201 | 202 | def test_interp(**kwargs): 203 | """Tests sincinterp2D passing it the given [kwargs] and interpolating known signals 204 | between the two time domains. 205 | """ 206 | oldtime = np.linspace(0, 10, 100) 207 | newtime = np.linspace(0, 10, 49) 208 | data = np.zeros((4, 100)) 209 | ## The first row has a single nonzero value 210 | data[0,50] = 1.0 211 | ## The second row has a few nonzero values in a row 212 | data[1,45:55] = 1.0 213 | ## The third row has a few nonzero values separated by zeros 214 | data[2,40:45] = 1.0 215 | data[2,55:60] = 1.0 216 | ## The fourth row has different values 217 | data[3,40:45] = 1.0 218 | data[3,55:60] = 2.0 219 | 220 | ## Interpolate the data 221 | interpdata = sincinterp2D(data.T, oldtime, newtime, **kwargs).T 222 | 223 | ## Plot the results 224 | from matplotlib.pyplot import figure, show 225 | fig = figure() 226 | for d in range(4): 227 | ax = fig.add_subplot(4,1,d+1) 228 | ax.plot(newtime, interpdata[d,:], 'go-') 229 | ax.plot(oldtime, data[d,:], 'bo-') 230 | 231 | #ax.tight() 232 | show() 233 | return newtime, interpdata 234 | -------------------------------------------------------------------------------- /npp.py: -------------------------------------------------------------------------------- 1 | """This module contains one line functions that should, by all rights, by in numpy. 2 | """ 3 | import numpy as np 4 | 5 | ## Demean -- remove the mean from each column 6 | demean = lambda v: v-v.mean(0) 7 | demean.__doc__ = """Removes the mean from each column of [v].""" 8 | dm = demean 9 | 10 | ## Z-score -- z-score each column 11 | zscore = lambda v: (v-v.mean(0))/v.std(0) 12 | zscore.__doc__ = """Z-scores (standardizes) each column of [v].""" 13 | zs = zscore 14 | 15 | ## Rescale -- make each column have unit variance 16 | rescale = lambda v: v/v.std(0) 17 | rescale.__doc__ = """Rescales each column of [v] to have unit variance.""" 18 | rs = rescale 19 | 20 | ## Matrix corr -- find correlation between each column of c1 and the corresponding column of c2 21 | mcorr = lambda c1,c2: (zs(c1)*zs(c2)).mean(0) 22 | mcorr.__doc__ = """Matrix correlation. Find the correlation between each column of [c1] and the corresponding column of [c2].""" 23 | 24 | ## Cross corr -- find corr. between each row of c1 and EACH row of c2 25 | xcorr = lambda c1,c2: np.dot(zs(c1.T).T,zs(c2.T)) / (c1.shape[1]) 26 | xcorr.__doc__ = """Cross-column correlation. 
Finds the correlation between each row of [c1] and each row of [c2].""" 27 | -------------------------------------------------------------------------------- /ridge.py: -------------------------------------------------------------------------------- 1 | #import scipy 2 | from functools import reduce 3 | import numpy as np 4 | import logging 5 | from utils import mult_diag, counter 6 | import random 7 | import itertools as itools 8 | 9 | zs = lambda v: (v-v.mean(0))/v.std(0) ## z-score function 10 | 11 | 12 | def ridge_corr(Rstim, Pstim, Rresp, Presp, alphas, normalpha=False, dtype=np.single, corrmin=0.2, 13 | singcutoff=1e-10, use_corr=True, logger=logging.getLogger("ridge_corr")): 14 | """Uses ridge regression to find a linear transformation of [Rstim] that approximates [Rresp]. 15 | Then tests by comparing the transformation of [Pstim] to [Presp]. This procedure is repeated 16 | for each regularization parameter alpha in [alphas]. The correlation between each prediction and 17 | each response for each alpha is returned. Note that the regression weights are NOT returned. 18 | 19 | Parameters 20 | ---------- 21 | Rstim : array_like, shape (TR, N) 22 | Training stimuli with TR time points and N features. Each feature should be Z-scored across time. 23 | Pstim : array_like, shape (TP, N) 24 | Test stimuli with TP time points and N features. Each feature should be Z-scored across time. 25 | Rresp : array_like, shape (TR, M) 26 | Training responses with TR time points and M responses (voxels, neurons, what-have-you). 27 | Each response should be Z-scored across time. 28 | Presp : array_like, shape (TP, M) 29 | Test responses with TP time points and M responses. 30 | alphas : list or array_like, shape (A,) 31 | Ridge parameters to be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well. 32 | normalpha : boolean 33 | Whether ridge parameters should be normalized by the Frobenius norm of Rstim. Good for 34 | comparing models with different numbers of parameters. 35 | dtype : np.dtype 36 | All data will be cast as this dtype for computation. np.single is used by default for memory 37 | efficiency. 38 | corrmin : float in [0..1] 39 | Purely for display purposes. After each alpha is tested, the number of responses with correlation 40 | greater than corrmin minus the number of responses with correlation less than negative corrmin 41 | will be printed. For long-running regressions this vague metric of non-centered skewness can 42 | give you a rough sense of how well the model is working before it's done. 43 | singcutoff : float 44 | The first step in ridge regression is computing the singular value decomposition (SVD) of the 45 | stimulus Rstim. If Rstim is not full rank, some singular values will be approximately equal 46 | to zero and the corresponding singular vectors will be noise. These singular values/vectors 47 | should be removed both for speed (the fewer multiplications the better!) and accuracy. Any 48 | singular values less than singcutoff will be removed. 49 | use_corr : boolean 50 | If True, this function will use correlation as its metric of model fit. If False, this function 51 | will instead use variance explained (R-squared) as its metric of model fit. For ridge regression 52 | this can make a big difference -- highly regularized solutions will have very small norms and 53 | will thus explain very little variance while still leading to high correlations, as correlation 54 | is scale-free while R**2 is not. 
55 | 56 | Returns 57 | ------- 58 | Rcorrs : array_like, shape (A, M) 59 | The correlation between each predicted response and each column of Presp for each alpha. 60 | 61 | """ 62 | ## Calculate SVD of stimulus matrix 63 | logger.info("Doing SVD...") 64 | try: 65 | U,S,Vh = np.linalg.svd(Rstim, full_matrices=False) 66 | except np.linalg.LinAlgError as e: 67 | logger.info("NORMAL SVD FAILED, trying more robust dgesvd..") 68 | from text.regression.svd_dgesvd import svd_dgesvd 69 | U,S,Vh = svd_dgesvd(Rstim, full_matrices=False) 70 | 71 | ## Truncate tiny singular values for speed 72 | origsize = S.shape[0] 73 | ngoodS = np.sum(S>singcutoff) 74 | nbad = origsize-ngoodS 75 | U = U[:,:ngoodS] 76 | S = S[:ngoodS] 77 | Vh = Vh[:ngoodS] 78 | logger.info("Dropped %d tiny singular values.. (U is now %s)"%(nbad, str(U.shape))) 79 | 80 | ## Normalize alpha by the Frobenius norm 81 | #frob = np.sqrt((S**2).sum()) ## Frobenius! 82 | frob = S[0] 83 | #frob = S.sum() 84 | logger.info("Training stimulus has Frobenius norm: %0.03f"%frob) 85 | if normalpha: 86 | nalphas = alphas * frob 87 | else: 88 | nalphas = alphas 89 | 90 | ## Precompute some products for speed 91 | UR = np.dot(U.T, Rresp) ## Precompute this matrix product for speed 92 | PVh = np.dot(Pstim, Vh.T) ## Precompute this matrix product for speed 93 | 94 | #Prespnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) ## Precompute test response norms 95 | zPresp = zs(Presp) 96 | Prespvar = Presp.var(0) 97 | Rcorrs = [] ## Holds training correlations for each alpha 98 | for na, a in zip(nalphas, alphas): 99 | #D = np.diag(S/(S**2+a**2)) ## Reweight singular vectors by the ridge parameter 100 | D = S/(S**2+na**2) ## Reweight singular vectors by the (normalized?) ridge parameter 101 | 102 | pred = np.dot(mult_diag(D, PVh, left=False), UR) ## Best (1.75 seconds to prediction in test) 103 | # pred = np.dot(mult_diag(D, np.dot(Pstim, Vh.T), left=False), UR) ## Better (2.0 seconds to prediction in test) 104 | 105 | # pvhd = reduce(np.dot, [Pstim, Vh.T, D]) ## Pretty good (2.4 seconds to prediction in test) 106 | # pred = np.dot(pvhd, UR) 107 | 108 | # wt = reduce(np.dot, [Vh.T, D, UR]).astype(dtype) ## Bad (14.2 seconds to prediction in test) 109 | # wt = reduce(np.dot, [Vh.T, D, U.T, Rresp]).astype(dtype) ## Worst 110 | # pred = np.dot(Pstim, wt) ## Predict test responses 111 | 112 | if use_corr: 113 | #prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) ## Compute predicted test response norms 114 | #Rcorr = np.array([np.corrcoef(Presp[:,ii], pred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]) ## Slowly compute correlations 115 | #Rcorr = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*Prespnorms) ## Efficiently compute correlations 116 | Rcorr = (zPresp*zs(pred)).mean(0) 117 | else: 118 | ## Compute variance explained 119 | resvar = (Presp-pred).var(0) 120 | Rcorr = np.clip(1-(resvar/Prespvar), 0, 1) 121 | 122 | Rcorr[np.isnan(Rcorr)] = 0 123 | Rcorrs.append(Rcorr) 124 | 125 | log_template = "Training: alpha=%0.3f, mean corr=%0.5f, max corr=%0.5f, over-under(%0.2f)=%d" 126 | log_msg = log_template % (a, 127 | np.mean(Rcorr), 128 | np.max(Rcorr), 129 | corrmin, 130 | (Rcorr>corrmin).sum()-(-Rcorr>corrmin).sum()) 131 | if logger is not None: 132 | logger.info(log_msg) 133 | else: 134 | print (log_msg) 135 | 136 | return Rcorrs 137 | 138 | 139 | def bootstrap_ridge(Rstim, Rresp, Pstim, Presp, alphas, nboots, chunklen, nchunks, dtype=np.single, 140 | corrmin=0.2, joined=None, singcutoff=1e-10, normalpha=False, 
single_alpha=False, 141 | use_corr=True, logger=logging.getLogger("ridge_corr")): 142 | """Uses ridge regression with a bootstrapped held-out set to get optimal alpha values for each response. 143 | [nchunks] random chunks of length [chunklen] will be taken from [Rstim] and [Rresp] for each regression 144 | run. [nboots] total regression runs will be performed. The best alpha value for each response will be 145 | averaged across the bootstraps to estimate the best alpha for that response. 146 | 147 | If [joined] is given, it should be a list of lists where the STRFs for all the voxels in each sublist 148 | will be given the same regularization parameter (the one that is the best on average). 149 | 150 | Parameters 151 | ---------- 152 | Rstim : array_like, shape (TR, N) 153 | Training stimuli with TR time points and N features. Each feature should be Z-scored across time. 154 | Rresp : array_like, shape (TR, M) 155 | Training responses with TR time points and M different responses (voxels, neurons, what-have-you). 156 | Each response should be Z-scored across time. 157 | Pstim : array_like, shape (TP, N) 158 | Test stimuli with TP time points and N features. Each feature should be Z-scored across time. 159 | Presp : array_like, shape (TP, M) 160 | Test responses with TP time points and M different responses. Each response should be Z-scored across 161 | time. 162 | alphas : list or array_like, shape (A,) 163 | Ridge parameters that will be tested. Should probably be log-spaced. np.logspace(0, 3, 20) works well. 164 | nboots : int 165 | The number of bootstrap samples to run. 15 to 30 works well. 166 | chunklen : int 167 | On each sample, the training data is broken into chunks of this length. This should be a few times 168 | longer than your delay/STRF. e.g. for a STRF with 3 delays, I use chunks of length 10. 169 | nchunks : int 170 | The number of training chunks held out to test ridge parameters for each bootstrap sample. The product 171 | of nchunks and chunklen is the total number of training samples held out for each sample, and this 172 | product should be about 20 percent of the total length of the training data. 173 | dtype : np.dtype 174 | All data will be cast as this dtype for computation. np.single is used by default for memory efficiency, 175 | as using np.double will thrash most machines on a big problem. If you want to do regression on 176 | complex variables, this should be changed to np.complex128. 177 | corrmin : float in [0..1] 178 | Purely for display purposes. After each alpha is tested for each bootstrap sample, the number of 179 | responses with correlation greater than this value will be printed. For long-running regressions this 180 | can give a rough sense of how well the model works before it's done. 181 | joined : None or list of array_like indices 182 | If you want the STRFs for two (or more) responses to be directly comparable, you need to ensure that 183 | the regularization parameter that they use is the same. To do that, supply a list of the response sets 184 | that should use the same ridge parameter here. For example, if you have four responses, joined could 185 | be [np.array([0,1]), np.array([2,3])], in which case responses 0 and 1 will use the same ridge parameter 186 | (which will be parameter that is best on average for those two), and likewise for responses 2 and 3. 187 | singcutoff : float 188 | The first step in ridge regression is computing the singular value decomposition (SVD) of the 189 | stimulus Rstim. 
If Rstim is not full rank, some singular values will be approximately equal 190 | to zero and the corresponding singular vectors will be noise. These singular values/vectors 191 | should be removed both for speed (the fewer multiplications the better!) and accuracy. Any 192 | singular values less than singcutoff will be removed. 193 | normalpha : boolean 194 | Whether ridge parameters (alphas) should be normalized by the Frobenius norm of Rstim. Good for rigorously 195 | comparing models with different numbers of parameters. 196 | single_alpha : boolean 197 | Whether to use a single alpha for all responses. Good for identification/decoding. 198 | use_corr : boolean 199 | If True, this function will use correlation as its metric of model fit. If False, this function 200 | will instead use variance explained (R-squared) as its metric of model fit. For ridge regression 201 | this can make a big difference -- highly regularized solutions will have very small norms and 202 | will thus explain very little variance while still leading to high correlations, as correlation 203 | is scale-free while R**2 is not. 204 | 205 | Returns 206 | ------- 207 | wt : array_like, shape (N, M) 208 | Regression weights for N features and M responses. 209 | corrs : array_like, shape (M,) 210 | Validation set correlations. Predicted responses for the validation set are obtained using the regression 211 | weights: pred = np.dot(Pstim, wt), and then the correlation between each predicted response and each 212 | column in Presp is found. 213 | alphas : array_like, shape (M,) 214 | The regularization coefficient (alpha) selected for each voxel using bootstrap cross-validation. 215 | bootstrap_corrs : array_like, shape (A, M, B) 216 | Correlation between predicted and actual responses on randomly held out portions of the training set, 217 | for each of A alphas, M voxels, and B bootstrap samples. 218 | valinds : array_like, shape (TH, B) 219 | The indices of the training data that were used as "validation" for each bootstrap sample. 220 | """ 221 | nresp, nvox = Rresp.shape 222 | bestalphas = np.zeros((nboots, nvox)) ## Will hold the best alphas for each voxel 223 | valinds = [] ## Will hold the indices into the validation data for each bootstrap 224 | 225 | Rcmats = [] 226 | for bi in counter(range(nboots), countevery=1, total=nboots): 227 | logger.info("Selecting held-out test set..") 228 | allinds = range(nresp) 229 | indchunks = list(zip(*[iter(allinds)]*chunklen)) 230 | random.shuffle(indchunks) 231 | heldinds = list(itools.chain(*indchunks[:nchunks])) 232 | notheldinds = list(set(allinds)-set(heldinds)) 233 | valinds.append(heldinds) 234 | 235 | RRstim = Rstim[notheldinds,:] 236 | PRstim = Rstim[heldinds,:] 237 | RRresp = Rresp[notheldinds,:] 238 | PRresp = Rresp[heldinds,:] 239 | 240 | ## Run ridge regression using this test set 241 | Rcmat = ridge_corr(RRstim, PRstim, RRresp, PRresp, alphas, 242 | dtype=dtype, corrmin=corrmin, singcutoff=singcutoff, 243 | normalpha=normalpha, use_corr=use_corr) 244 | 245 | Rcmats.append(Rcmat) 246 | 247 | ## Find weights for each voxel 248 | try: 249 | U,S,Vh = np.linalg.svd(Rstim, full_matrices=False) 250 | except np.linalg.LinAlgError as e: 251 | logger.info("NORMAL SVD FAILED, trying more robust dgesvd..") 252 | from text.regression.svd_dgesvd import svd_dgesvd 253 | U,S,Vh = svd_dgesvd(Rstim, full_matrices=False) 254 | 255 | ## Normalize alpha by the Frobenius norm 256 | #frob = np.sqrt((S**2).sum()) ## Frobenius! 
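    ## Note that S[0] is the largest singular value of Rstim (its spectral norm), not the
    ## true Frobenius norm sqrt((S**2).sum()). Normalizing the alphas by it makes the
    ## effective regularization invariant to rescaling Rstim, since S/nalpha is unchanged
    ## when both are scaled by the same factor.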
257 |     frob = S[0]
258 |     #frob = S.sum()
259 |     logger.info("Total training stimulus has Frobenius norm: %0.03f"%frob)
260 |     if normalpha:
261 |         nalphas = alphas * frob
262 |     else:
263 |         nalphas = alphas
264 | 
265 |     allRcorrs = np.dstack(Rcmats)
266 |     if not single_alpha:
267 |         logger.info("Finding best alpha for each response..")
268 |         if joined is None:
269 |             ## Find best alpha for each voxel
270 |             meanbootcorrs = allRcorrs.mean(2)
271 |             bestalphainds = np.argmax(meanbootcorrs, 0)
272 |             valphas = nalphas[bestalphainds]
273 |         else:
274 |             ## Find best alpha for each group of voxels
275 |             valphas = np.zeros((nvox,))
276 |             for jl in joined:
277 |                 jcorrs = allRcorrs[:,jl,:].mean(1).mean(1) ## Mean across voxels in the set, then mean across bootstraps
278 |                 bestalpha = np.argmax(jcorrs)
279 |                 valphas[jl] = nalphas[bestalpha]
280 |     else:
281 |         logger.info("Finding single best alpha..")
282 |         meanbootcorr = allRcorrs.mean(2).mean(1)
283 |         bestalphaind = np.argmax(meanbootcorr)
284 |         bestalpha = alphas[bestalphaind]
285 |         valphas = np.array([bestalpha]*nvox)
286 |         logger.info("Best alpha = %0.3f"%bestalpha)
287 | 
288 |     logger.info("Computing weights for each response using entire training set..")
289 |     UR = np.dot(U.T, np.nan_to_num(Rresp))
290 |     pred = np.zeros(Presp.shape)
291 |     wt = np.zeros((Rstim.shape[1], Rresp.shape[1]))
292 |     for ai,alpha in enumerate(nalphas):
293 |         selvox = np.nonzero(valphas==alpha)[0]
294 |         awt = reduce(np.dot, [Vh.T, np.diag(S/(S**2+alpha**2)), UR[:,selvox]])
295 |         pred[:,selvox] = np.dot(Pstim, awt)
296 |         wt[:,selvox] = awt
297 | 
298 |     ## Find test correlations
299 |     nnpred = np.nan_to_num(pred)
300 |     corrs = np.nan_to_num(np.array([np.corrcoef(Presp[:,ii], nnpred[:,ii].ravel())[0,1] for ii in range(Presp.shape[1])]))
301 | 
302 |     return wt, corrs, valphas, allRcorrs, valinds
303 | 
-------------------------------------------------------------------------------- /stimulus_utils.py: --------------------------------------------------------------------------------
1 | from textgrid import TextGrid
2 | import os
3 | import numpy as np
4 | from collections import defaultdict
5 | 
6 | def load_grid(story, grid_dir="data/grids"):
7 |     """Loads the TextGrid for the given [story] from the directory [grid_dir].
8 |     The first file that starts with [story] will be loaded, so if there are
9 |     multiple versions of a grid for a story, beware.
10 |     """
11 |     gridfile = [os.path.join(grid_dir, gf) for gf in os.listdir(grid_dir) if gf.startswith(story)][0]
12 |     return TextGrid(open(gridfile).read())
13 | 
14 | def load_grids_for_stories(stories):
15 |     """Loads grids for the given [stories] and puts them in a dictionary.
16 |     """
17 |     return dict([(st, load_grid(st)) for st in stories])
18 | 
19 | def load_5tier_grids_for_stories(stories, rootdir):
20 |     grids = dict()
21 |     for story in stories:
22 |         storydir = os.path.join(rootdir, [sd for sd in os.listdir(rootdir) if sd.startswith(story)][0])
23 |         storyfile = os.path.join(storydir, [sf for sf in os.listdir(storydir) if sf.endswith("TextGrid")][0])
24 |         grids[story] = TextGrid(open(storyfile).read())
25 |     return grids
26 | 
27 | 
28 | class TRFile(object):
29 |     def __init__(self, trfilename, expectedtr=2.0045):
30 |         """Loads data from [trfilename], which should be the output of the stimulus presentation code.
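        A minimal usage sketch (the report path here is hypothetical):
            trf = TRFile("data/trfiles/story1.report")
            reltimes = trf.get_reltriggertimes()  ## trigger times relative to sound onset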
31 | """ 32 | self.trtimes = [] 33 | self.soundstarttime = -1 34 | self.soundstoptime = -1 35 | self.otherlabels = [] 36 | self.expectedtr = expectedtr 37 | 38 | if trfilename is not None: 39 | self.load_from_file(trfilename) 40 | 41 | 42 | def load_from_file(self, trfilename): 43 | """Loads TR data from report with given [trfilename]. 44 | """ 45 | ## Read the report file and populate the datastructure 46 | for ll in open(trfilename): 47 | timestr = ll.split()[0] 48 | label = " ".join(ll.split()[1:]) 49 | time = float(timestr) 50 | 51 | if label in ("init-trigger", "trigger"): 52 | self.trtimes.append(time) 53 | 54 | elif label=="sound-start": 55 | self.soundstarttime = time 56 | 57 | elif label=="sound-stop": 58 | self.soundstoptime = time 59 | 60 | else: 61 | self.otherlabels.append((time, label)) 62 | 63 | ## Fix weird TR times 64 | itrtimes = np.diff(self.trtimes) 65 | badtrtimes = np.nonzero(itrtimes>(itrtimes.mean()*1.5))[0] 66 | newtrs = [] 67 | for btr in badtrtimes: 68 | ## Insert new TR where it was missing.. 69 | newtrtime = self.trtimes[btr]+self.expectedtr 70 | newtrs.append((newtrtime,btr)) 71 | 72 | for ntr,btr in newtrs: 73 | self.trtimes.insert(btr+1, ntr) 74 | 75 | def simulate(self, ntrs): 76 | """Simulates [ntrs] TRs that occur at the expected TR. 77 | """ 78 | self.trtimes = list(np.arange(ntrs)*self.expectedtr) 79 | 80 | def get_reltriggertimes(self): 81 | """Returns the times of all trigger events relative to the sound. 82 | """ 83 | return np.array(self.trtimes)-self.soundstarttime 84 | 85 | @property 86 | def avgtr(self): 87 | """Returns the average TR for this run. 88 | """ 89 | return np.diff(self.trtimes).mean() 90 | 91 | def load_generic_trfiles(stories, root="data/trfiles"): 92 | """Loads a dictionary of generic TRFiles (i.e. not specifically from the session 93 | in which the data was collected.. this should be fine) for the given stories. 94 | """ 95 | trdict = dict() 96 | 97 | for story in stories: 98 | try: 99 | trf = TRFile(os.path.join(root, "%s.report"%story)) 100 | trdict[story] = [trf] 101 | except Exception as e: 102 | print (e) 103 | 104 | return trdict 105 | -------------------------------------------------------------------------------- /textgrid.py: -------------------------------------------------------------------------------- 1 | # Natural Language Toolkit: TextGrid analysis 2 | # 3 | # Copyright (C) 2001-2011 NLTK Project 4 | # Author: Margaret Mitchell 5 | # Steven Bird (revisions) 6 | # URL: 7 | # For license information, see LICENSE.TXT 8 | # 9 | 10 | """ 11 | Tools for reading TextGrid files, the format used by Praat. 12 | 13 | Module contents 14 | =============== 15 | 16 | The textgrid corpus reader provides 4 data items and 1 function 17 | for each textgrid file. For each tier in the file, the reader 18 | provides 10 data items and 2 functions. 19 | 20 | For the full textgrid file: 21 | 22 | - size 23 | The number of tiers in the file. 24 | 25 | - xmin 26 | First marked time of the file. 27 | 28 | - xmax 29 | Last marked time of the file. 30 | 31 | - t_time 32 | xmax - xmin. 33 | 34 | - text_type 35 | The style of TextGrid format: 36 | - ooTextFile: Organized by tier. 37 | - ChronTextFile: Organized by time. 38 | - OldooTextFile: Similar to ooTextFile. 39 | 40 | - to_chron() 41 | Convert given file to a ChronTextFile format. 42 | 43 | - to_oo() 44 | Convert given file to an ooTextFile format. 45 | 46 | For each tier: 47 | 48 | - text_type 49 | The style of TextGrid format, as above. 
50 | 51 | - classid 52 | The style of transcription on this tier: 53 | - IntervalTier: Transcription is marked as intervals. 54 | - TextTier: Transcription is marked as single points. 55 | 56 | - nameid 57 | The name of the tier. 58 | 59 | - xmin 60 | First marked time of the tier. 61 | 62 | - xmax 63 | Last marked time of the tier. 64 | 65 | - size 66 | Number of entries in the tier. 67 | 68 | - transcript 69 | The raw transcript for the tier. 70 | 71 | - simple_transcript 72 | The transcript formatted as a list of tuples: (time1, time2, utterance). 73 | 74 | - tier_info 75 | List of (classid, nameid, xmin, xmax, size, transcript). 76 | 77 | - min_max() 78 | A tuple of (xmin, xmax). 79 | 80 | - time(non_speech_marker) 81 | Returns the utterance time of a given tier. 82 | Excludes entries that begin with a non-speech marker. 83 | 84 | """ 85 | 86 | # needs more cleanup, subclassing, epydoc docstrings 87 | 88 | import sys 89 | import re 90 | 91 | TEXTTIER = "TextTier" 92 | INTERVALTIER = "IntervalTier" 93 | 94 | OOTEXTFILE = re.compile(r"""(?x) 95 | xmin\ =\ (.*)[\r\n]+ 96 | xmax\ =\ (.*)[\r\n]+ 97 | [\s\S]+?size\ =\ (.*)[\r\n]+ 98 | """) 99 | 100 | CHRONTEXTFILE = re.compile(r"""(?x) 101 | [\r\n]+(\S+)\ 102 | (\S+)\ +!\ Time\ domain.\ *[\r\n]+ 103 | (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+" 104 | """) 105 | 106 | OLDOOTEXTFILE = re.compile(r"""(?x) 107 | [\r\n]+(\S+) 108 | [\r\n]+(\S+) 109 | [\r\n]+.+[\r\n]+(\S+) 110 | """) 111 | 112 | 113 | 114 | ################################################################# 115 | # TextGrid Class 116 | ################################################################# 117 | 118 | class TextGrid(object): 119 | """ 120 | Class to manipulate the TextGrid format used by Praat. 121 | Separates each tier within this file into its own Tier 122 | object. Each TextGrid object has 123 | a number of tiers (size), xmin, xmax, a text type to help 124 | with the different styles of TextGrid format, and tiers with their 125 | own attributes. 126 | """ 127 | 128 | def __init__(self, read_file): 129 | """ 130 | Takes open read file as input, initializes attributes 131 | of the TextGrid file. 132 | @type read_file: An open TextGrid file, mode "r". 133 | @param size: Number of tiers. 134 | @param xmin: xmin. 135 | @param xmax: xmax. 136 | @param t_time: Total time of TextGrid file. 137 | @param text_type: TextGrid format. 138 | @type tiers: A list of tier objects. 139 | """ 140 | 141 | self.read_file = read_file 142 | self.size = 0 143 | self.xmin = 0 144 | self.xmax = 0 145 | self.t_time = 0 146 | self.text_type = self._check_type() 147 | self.tiers = self._find_tiers() 148 | 149 | def __iter__(self): 150 | for tier in self.tiers: 151 | yield tier 152 | 153 | def next(self): 154 | if self.idx == (self.size - 1): 155 | raise StopIteration 156 | self.idx += 1 157 | return self.tiers[self.idx] 158 | 159 | @staticmethod 160 | def load(file): 161 | """ 162 | @param file: a file in TextGrid format 163 | """ 164 | 165 | return TextGrid(open(file).read()) 166 | 167 | def _load_tiers(self, header): 168 | """ 169 | Iterates over each tier and grabs tier information. 
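        For ChronTextFile input, the entries belonging to each tier are regrouped by
        their leading tier number; for the other formats, the file is simply split at
        each tier header.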
170 | """ 171 | 172 | tiers = [] 173 | if self.text_type == "ChronTextFile": 174 | m = re.compile(header) 175 | tier_headers = m.findall(self.read_file) 176 | tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\"" 177 | for i in range(0, self.size): 178 | tier_info = [tier_headers[i]] + \ 179 | re.findall(str(i + 1) + tier_re, self.read_file) 180 | tier_info = "\n".join(tier_info) 181 | tiers.append(Tier(tier_info, self.text_type, self.t_time)) 182 | return tiers 183 | 184 | tier_re = header + "[\s\S]+?(?=" + header + "|$$)" 185 | m = re.compile(tier_re) 186 | tier_iter = m.finditer(self.read_file) 187 | for iterator in tier_iter: 188 | (begin, end) = iterator.span() 189 | tier_info = self.read_file[begin:end] 190 | tiers.append(Tier(tier_info, self.text_type, self.t_time)) 191 | return tiers 192 | 193 | def _check_type(self): 194 | """ 195 | Figures out the TextGrid format. 196 | """ 197 | 198 | m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file) 199 | try: 200 | type_id = m.group(1).strip() 201 | except AttributeError: 202 | raise TypeError("Cannot read file -- try TextGrid.load()") 203 | xmin = m.group(4) 204 | if type_id == "File type = \"ooTextFile\"": 205 | if "xmin" not in xmin: 206 | text_type = "OldooTextFile" 207 | else: 208 | text_type = "ooTextFile" 209 | elif type_id == "\"Praat chronological TextGrid text file\"": 210 | text_type = "ChronTextFile" 211 | else: 212 | raise TypeError("Unknown format '(%s)'", (type_id)) 213 | return text_type 214 | 215 | def _find_tiers(self): 216 | """ 217 | Splits the textgrid file into substrings corresponding to tiers. 218 | """ 219 | 220 | if self.text_type == "ooTextFile": 221 | m = OOTEXTFILE 222 | header = " +item \[" 223 | elif self.text_type == "ChronTextFile": 224 | m = CHRONTEXTFILE 225 | header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*" 226 | elif self.text_type == "OldooTextFile": 227 | m = OLDOOTEXTFILE 228 | header = "\".*\"[\r\n]+\".*\"" 229 | 230 | file_info = m.findall(self.read_file)[0] 231 | self.xmin = float(file_info[0]) 232 | self.xmax = float(file_info[1]) 233 | self.t_time = self.xmax - self.xmin 234 | self.size = int(file_info[2]) 235 | tiers = self._load_tiers(header) 236 | return tiers 237 | 238 | def to_chron(self): 239 | """ 240 | @return: String in Chronological TextGrid file format. 241 | """ 242 | 243 | chron_file = "" 244 | chron_file += "\"Praat chronological TextGrid text file\"\n" 245 | chron_file += str(self.xmin) + " " + str(self.xmax) 246 | chron_file += " ! Time domain.\n" 247 | chron_file += str(self.size) + " ! Number of tiers.\n" 248 | for tier in self.tiers: 249 | idx = (self.tiers.index(tier)) + 1 250 | tier_header = "\"" + tier.classid + "\" \"" \ 251 | + tier.nameid + "\" " + str(tier.xmin) \ 252 | + " " + str(tier.xmax) 253 | chron_file += tier_header + "\n" 254 | transcript = tier.simple_transcript 255 | for (xmin, xmax, utt) in transcript: 256 | chron_file += str(idx) + " " + str(xmin) 257 | chron_file += " " + str(xmax) +"\n" 258 | chron_file += "\"" + utt + "\"\n" 259 | return chron_file 260 | 261 | def to_oo(self): 262 | """ 263 | @return: A string in OoTextGrid file format. 264 | """ 265 | 266 | oo_file = "" 267 | oo_file += "File type = \"ooTextFile\"\n" 268 | oo_file += "Object class = \"TextGrid\"\n\n" 269 | oo_file += "xmin = ", self.xmin, "\n" 270 | oo_file += "xmax = ", self.xmax, "\n" 271 | oo_file += "tiers? 
\n" 272 | oo_file += "size = ", self.size, "\n" 273 | oo_file += "item []:\n" 274 | for i in range(len(self.tiers)): 275 | oo_file += "%4s%s [%s]" % ("", "item", i + 1) 276 | _curr_tier = self.tiers[i] 277 | for (x, y) in _curr_tier.header: 278 | oo_file += "%8s%s = \"%s\"" % ("", x, y) 279 | if _curr_tier.classid != TEXTTIER: 280 | for (xmin, xmax, text) in _curr_tier.simple_transcript: 281 | oo_file += "%12s%s = %s" % ("", "xmin", xmin) 282 | oo_file += "%12s%s = %s" % ("", "xmax", xmax) 283 | oo_file += "%12s%s = \"%s\"" % ("", "text", text) 284 | else: 285 | for (time, mark) in _curr_tier.simple_transcript: 286 | oo_file += "%12s%s = %s" % ("", "time", time) 287 | oo_file += "%12s%s = %s" % ("", "mark", mark) 288 | return oo_file 289 | 290 | 291 | ################################################################# 292 | # Tier Class 293 | ################################################################# 294 | 295 | class Tier(object): 296 | """ 297 | A container for each tier. 298 | """ 299 | 300 | def __init__(self, tier, text_type, t_time): 301 | """ 302 | Initializes attributes of the tier: class, name, xmin, xmax 303 | size, transcript, total time. 304 | Utilizes text_type to guide how to parse the file. 305 | @type tier: a tier object; single item in the TextGrid list. 306 | @param text_type: TextGrid format 307 | @param t_time: Total time of TextGrid file. 308 | @param classid: Type of tier (point or interval). 309 | @param nameid: Name of tier. 310 | @param xmin: xmin of the tier. 311 | @param xmax: xmax of the tier. 312 | @param size: Number of entries in the tier 313 | @param transcript: The raw transcript for the tier. 314 | """ 315 | 316 | self.tier = tier 317 | self.text_type = text_type 318 | self.t_time = t_time 319 | self.classid = "" 320 | self.nameid = "" 321 | self.xmin = 0 322 | self.xmax = 0 323 | self.size = 0 324 | self.transcript = "" 325 | self.tier_info = "" 326 | self._make_info() 327 | self.simple_transcript = self.make_simple_transcript() 328 | if self.classid != TEXTTIER: 329 | self.mark_type = "intervals" 330 | else: 331 | self.mark_type = "points" 332 | self.header = [("class", self.classid), ("name", self.nameid), \ 333 | ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)] 334 | 335 | def __iter__(self): 336 | return self 337 | 338 | def _make_info(self): 339 | """ 340 | Figures out most attributes of the tier object: 341 | class, name, xmin, xmax, transcript. 342 | """ 343 | 344 | trans = "([\S\s]*)" 345 | if self.text_type == "ChronTextFile": 346 | classid = "\"(.*)\" +" 347 | nameid = "\"(.*)\" +" 348 | xmin = "(\d+\.?\d*) +" 349 | xmax = "(\d+\.?\d*) *[\r\n]+" 350 | # No size values are given in the Chronological Text File format. 
351 | self.size = None 352 | size = "" 353 | elif self.text_type == "ooTextFile": 354 | classid = " +class = \"(.*)\" *[\r\n]+" 355 | nameid = " +name = \"(.*)\" *[\r\n]+" 356 | xmin = " +xmin = (\d+\.?\d*) *[\r\n]+" 357 | xmax = " +xmax = (\d+\.?\d*) *[\r\n]+" 358 | size = " +\S+: size = (\d+) *[\r\n]+" 359 | elif self.text_type == "OldooTextFile": 360 | classid = "\"(.*)\" *[\r\n]+" 361 | nameid = "\"(.*)\" *[\r\n]+" 362 | xmin = "(\d+\.?\d*) *[\r\n]+" 363 | xmax = "(\d+\.?\d*) *[\r\n]+" 364 | size = "(\d+) *[\r\n]+" 365 | m = re.compile(classid + nameid + xmin + xmax + size + trans) 366 | self.tier_info = m.findall(self.tier)[0] 367 | self.classid = self.tier_info[0] 368 | self.nameid = self.tier_info[1] 369 | self.xmin = float(self.tier_info[2]) 370 | self.xmax = float(self.tier_info[3]) 371 | if self.size != None: 372 | self.size = int(self.tier_info[4]) 373 | self.transcript = self.tier_info[-1] 374 | 375 | def make_simple_transcript(self): 376 | """ 377 | @return: Transcript of the tier, in form [(start_time end_time label)] 378 | """ 379 | 380 | if self.text_type == "ChronTextFile": 381 | trans_head = "" 382 | trans_xmin = " (\S+)" 383 | trans_xmax = " (\S+)[\r\n]+" 384 | trans_text = "\"([\S\s]*?)\"" 385 | elif self.text_type == "ooTextFile": 386 | trans_head = " +\S+ \[\d+\]: *[\r\n]+" 387 | trans_xmin = " +\S+ = (\S+) *[\r\n]+" 388 | trans_xmax = " +\S+ = (\S+) *[\r\n]+" 389 | trans_text = " +\S+ = \"([^\"]*?)\"" 390 | elif self.text_type == "OldooTextFile": 391 | trans_head = "" 392 | trans_xmin = "(.*)[\r\n]+" 393 | trans_xmax = "(.*)[\r\n]+" 394 | trans_text = "\"([\S\s]*?)\"" 395 | if self.classid == TEXTTIER: 396 | trans_xmin = "" 397 | trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text) 398 | self.simple_transcript = trans_m.findall(self.transcript) 399 | return self.simple_transcript 400 | 401 | def transcript(self): 402 | """ 403 | @return: Transcript of the tier, as it appears in the file. 404 | """ 405 | 406 | return self.transcript 407 | 408 | def time(self, non_speech_char="."): 409 | """ 410 | @return: Utterance time of a given tier. 411 | Screens out entries that begin with a non-speech marker. 412 | """ 413 | 414 | total = 0.0 415 | if self.classid != TEXTTIER: 416 | for (time1, time2, utt) in self.simple_transcript: 417 | utt = utt.strip() 418 | if utt and not utt[0] == ".": 419 | total += (float(time2) - float(time1)) 420 | return total 421 | 422 | def tier_name(self): 423 | """ 424 | @return: Tier name of a given tier. 425 | """ 426 | 427 | return self.nameid 428 | 429 | def classid(self): 430 | """ 431 | @return: Type of transcription on tier. 432 | """ 433 | 434 | return self.classid 435 | 436 | def min_max(self): 437 | """ 438 | @return: (xmin, xmax) tuple for a given tier. 439 | """ 440 | 441 | return (self.xmin, self.xmax) 442 | 443 | def __repr__(self): 444 | return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time) 445 | 446 | def __str__(self): 447 | return self.__repr__() + "\n " + "\n ".join(" ".join(row) for row in self.simple_transcript) 448 | 449 | def demo_TextGrid(demo_data): 450 | print ("** Demo of the TextGrid class. **") 451 | 452 | fid = TextGrid(demo_data) 453 | print ("Tiers:", fid.size) 454 | 455 | for i, tier in enumerate(fid): 456 | print ("\n***") 457 | print ("Tier:", i + 1) 458 | print (tier) 459 | 460 | def demo(): 461 | # Each demo demonstrates different TextGrid formats. 
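    # demo_data1 parses as ooTextFile (tier-organized), demo_data2 as OldooTextFile
    # (bare values), and demo_data3 as ChronTextFile (time-organized).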
462 | print ("Format 1") 463 | demo_TextGrid(demo_data1) 464 | print ("\nFormat 2") 465 | demo_TextGrid(demo_data2) 466 | print ("\nFormat 3") 467 | demo_TextGrid(demo_data3) 468 | 469 | 470 | demo_data1 = """File type = "ooTextFile" 471 | Object class = "TextGrid" 472 | 473 | xmin = 0 474 | xmax = 2045.144149659864 475 | tiers? 476 | size = 3 477 | item []: 478 | item [1]: 479 | class = "IntervalTier" 480 | name = "utterances" 481 | xmin = 0 482 | xmax = 2045.144149659864 483 | intervals: size = 5 484 | intervals [1]: 485 | xmin = 0 486 | xmax = 2041.4217474125382 487 | text = "" 488 | intervals [2]: 489 | xmin = 2041.4217474125382 490 | xmax = 2041.968276643991 491 | text = "this" 492 | intervals [3]: 493 | xmin = 2041.968276643991 494 | xmax = 2042.5281632653062 495 | text = "is" 496 | intervals [4]: 497 | xmin = 2042.5281632653062 498 | xmax = 2044.0487352585324 499 | text = "a" 500 | intervals [5]: 501 | xmin = 2044.0487352585324 502 | xmax = 2045.144149659864 503 | text = "demo" 504 | item [2]: 505 | class = "TextTier" 506 | name = "notes" 507 | xmin = 0 508 | xmax = 2045.144149659864 509 | points: size = 3 510 | points [1]: 511 | time = 2041.4217474125382 512 | mark = ".begin_demo" 513 | points [2]: 514 | time = 2043.8338291031832 515 | mark = "voice gets quiet here" 516 | points [3]: 517 | time = 2045.144149659864 518 | mark = ".end_demo" 519 | item [3]: 520 | class = "IntervalTier" 521 | name = "phones" 522 | xmin = 0 523 | xmax = 2045.144149659864 524 | intervals: size = 12 525 | intervals [1]: 526 | xmin = 0 527 | xmax = 2041.4217474125382 528 | text = "" 529 | intervals [2]: 530 | xmin = 2041.4217474125382 531 | xmax = 2041.5438290324326 532 | text = "D" 533 | intervals [3]: 534 | xmin = 2041.5438290324326 535 | xmax = 2041.7321032910372 536 | text = "I" 537 | intervals [4]: 538 | xmin = 2041.7321032910372 539 | xmax = 2041.968276643991 540 | text = "s" 541 | intervals [5]: 542 | xmin = 2041.968276643991 543 | xmax = 2042.232189031843 544 | text = "I" 545 | intervals [6]: 546 | xmin = 2042.232189031843 547 | xmax = 2042.5281632653062 548 | text = "z" 549 | intervals [7]: 550 | xmin = 2042.5281632653062 551 | xmax = 2044.0487352585324 552 | text = "eI" 553 | intervals [8]: 554 | xmin = 2044.0487352585324 555 | xmax = 2044.2487352585324 556 | text = "dc" 557 | intervals [9]: 558 | xmin = 2044.2487352585324 559 | xmax = 2044.3102321849011 560 | text = "d" 561 | intervals [10]: 562 | xmin = 2044.3102321849011 563 | xmax = 2044.5748932104329 564 | text = "E" 565 | intervals [11]: 566 | xmin = 2044.5748932104329 567 | xmax = 2044.8329108578437 568 | text = "m" 569 | intervals [12]: 570 | xmin = 2044.8329108578437 571 | xmax = 2045.144149659864 572 | text = "oU" 573 | """ 574 | 575 | demo_data2 = """File type = "ooTextFile" 576 | Object class = "TextGrid" 577 | 578 | 0 579 | 2.8 580 | 581 | 2 582 | "IntervalTier" 583 | "utterances" 584 | 0 585 | 2.8 586 | 3 587 | 0 588 | 1.6229213249309031 589 | "" 590 | 1.6229213249309031 591 | 2.341428074708195 592 | "demo" 593 | 2.341428074708195 594 | 2.8 595 | "" 596 | "IntervalTier" 597 | "phones" 598 | 0 599 | 2.8 600 | 6 601 | 0 602 | 1.6229213249309031 603 | "" 604 | 1.6229213249309031 605 | 1.6428291382019483 606 | "dc" 607 | 1.6428291382019483 608 | 1.65372183721983721 609 | "d" 610 | 1.65372183721983721 611 | 1.94372874328943728 612 | "E" 613 | 1.94372874328943728 614 | 2.13821938291038210 615 | "m" 616 | 2.13821938291038210 617 | 2.341428074708195 618 | "oU" 619 | 2.341428074708195 620 | 2.8 621 | "" 622 | """ 623 | 624 | demo_data3 = 
""""Praat chronological TextGrid text file" 625 | 0 2.8 ! Time domain. 626 | 2 ! Number of tiers. 627 | "IntervalTier" "utterances" 0 2.8 628 | "IntervalTier" "utterances" 0 2.8 629 | 1 0 1.6229213249309031 630 | "" 631 | 2 0 1.6229213249309031 632 | "" 633 | 2 1.6229213249309031 1.6428291382019483 634 | "dc" 635 | 2 1.6428291382019483 1.65372183721983721 636 | "d" 637 | 2 1.65372183721983721 1.94372874328943728 638 | "E" 639 | 2 1.94372874328943728 2.13821938291038210 640 | "m" 641 | 2 2.13821938291038210 2.341428074708195 642 | "oU" 643 | 1 1.6229213249309031 2.341428074708195 644 | "demo" 645 | 1 2.341428074708195 2.8 646 | "" 647 | 2 2.341428074708195 2.8 648 | "" 649 | """ 650 | 651 | if __name__ == "__main__": 652 | demo() 653 | 654 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tables 3 | #from matplotlib.pyplot import figure, show 4 | import scipy.linalg 5 | 6 | def make_delayed(stim, delays, circpad=False): 7 | """Creates non-interpolated concatenated delayed versions of [stim] with the given [delays] 8 | (in samples). 9 | 10 | If [circpad], instead of being padded with zeros, [stim] will be circularly shifted. 11 | """ 12 | nt,ndim = stim.shape 13 | dstims = [] 14 | for di,d in enumerate(delays): 15 | dstim = np.zeros((nt, ndim)) 16 | if d<0: ## negative delay 17 | dstim[:d,:] = stim[-d:,:] 18 | if circpad: 19 | dstim[d:,:] = stim[:-d,:] 20 | elif d>0: 21 | dstim[d:,:] = stim[:-d,:] 22 | if circpad: 23 | dstim[:d,:] = stim[-d:,:] 24 | else: ## d==0 25 | dstim = stim.copy() 26 | dstims.append(dstim) 27 | return np.hstack(dstims) 28 | 29 | def best_corr_vec(wvec, vocab, SU, n=10): 30 | """Returns the [n] words from [vocab] most similar to the given [wvec], where each word is represented 31 | as a row in [SU]. Similarity is computed using correlation.""" 32 | wvec = wvec - np.mean(wvec) 33 | nwords = len(vocab) 34 | corrs = np.nan_to_num([np.corrcoef(wvec, SU[wi,:]-np.mean(SU[wi,:]))[1,0] for wi in range(nwords-1)]) 35 | scorrs = np.argsort(corrs) 36 | words = list(reversed([(corrs[i],vocab[i]) for i in scorrs[-n:]])) 37 | return words 38 | 39 | def get_word_prob(): 40 | """Returns the probabilities of all the words in the mechanical turk video labels. 41 | """ 42 | import constants as c 43 | import cPickle 44 | data = cPickle.load(open(c.datafile)) # Read in the words from the labels 45 | wordcount = dict() 46 | totalcount = 0 47 | for label in data: 48 | for word in label: 49 | totalcount += 1 50 | if word in wordcount: 51 | wordcount[word] += 1 52 | else: 53 | wordcount[word] = 1 54 | 55 | wordprob = dict([(word, float(wc)/totalcount) for word, wc in wordcount.items()]) 56 | return wordprob 57 | 58 | def best_prob_vec(wvec, vocab, space, wordprobs): 59 | """Orders the words by correlation with the given [wvec], but also weights the correlations by the prior 60 | probability of the word appearing in the mechanical turk video labels. 
61 | """ 62 | words = best_corr_vec(wvec, vocab, space, n=len(vocab)) ## get correlations for all words 63 | ## weight correlations by the prior probability of the word in the labels 64 | weightwords = [] 65 | for wcorr,word in words: 66 | if word in wordprobs: 67 | weightwords.append((wordprobs[word]*wcorr, word)) 68 | 69 | return sorted(weightwords, key=lambda ww: ww[0]) 70 | 71 | def find_best_words(vectors, vocab, wordspace, actual, display=True, num=15): 72 | cwords = [] 73 | for si in range(len(vectors)): 74 | cw = best_corr_vec(vectors[si], vocab, wordspace, n=num) 75 | cwords.append(cw) 76 | if display: 77 | print ("Closest words to scene %d:" % si) 78 | print ([b[1] for b in cw]) 79 | print ("Actual words:") 80 | print (actual[si]) 81 | print ("") 82 | return cwords 83 | 84 | def find_best_stims_for_word(wordvector, decstims, n): 85 | """Returns a list of the indexes of the [n] stimuli in [decstims] (should be decoded stimuli) 86 | that lie closest to the vector [wordvector], which should be taken from the same space as the 87 | stimuli. 88 | """ 89 | scorrs = np.array([np.corrcoef(wordvector, ds)[0,1] for ds in decstims]) 90 | scorrs[np.isnan(scorrs)] = -1 91 | return np.argsort(scorrs)[-n:][::-1] 92 | 93 | def princomp(x, use_dgesvd=False): 94 | """Does principal components analysis on [x]. 95 | Returns coefficients, scores and latent variable values. 96 | Translated from MATLAB princomp function. Unlike the matlab princomp function, however, the 97 | rows of the returned value 'coeff' are the principal components, not the columns. 98 | """ 99 | 100 | n,p = x.shape 101 | #cx = x-np.tile(x.mean(0), (n,1)) ## column-centered x 102 | cx = x-x.mean(0) 103 | r = np.min([n-1,p]) ## maximum possible rank of cx 104 | 105 | if use_dgesvd: 106 | from svd_dgesvd import svd_dgesvd 107 | U,sigma,coeff = svd_dgesvd(cx, full_matrices=False) 108 | else: 109 | U,sigma,coeff = np.linalg.svd(cx, full_matrices=False) 110 | 111 | sigma = np.diag(sigma) 112 | score = np.dot(cx, coeff.T) 113 | sigma = sigma/np.sqrt(n-1) 114 | 115 | latent = sigma**2 116 | 117 | return coeff, score, latent 118 | 119 | def eigprincomp(x, npcs=None, norm=False, weights=None): 120 | """Does principal components analysis on [x]. 121 | Returns coefficients (eigenvectors) and eigenvalues. 122 | If given, only the [npcs] greatest eigenvectors/values will be returned. 123 | If given, the covariance matrix will be computed using [weights] on the samples. 124 | """ 125 | n,p = x.shape 126 | #cx = x-np.tile(x.mean(0), (n,1)) ## column-centered x 127 | cx = x-x.mean(0) 128 | r = np.min([n-1,p]) ## maximum possible rank of cx 129 | 130 | xcov = np.cov(cx.T) 131 | if norm: 132 | xcov /= n 133 | 134 | if npcs is not None: 135 | latent,coeff = scipy.linalg.eigh(xcov, eigvals=(p-npcs,p-1)) 136 | else: 137 | latent,coeff = np.linalg.eigh(xcov) 138 | 139 | ## Transpose coeff, reverse its rows 140 | return coeff.T[::-1], latent[::-1] 141 | 142 | def weighted_cov(x, weights=None): 143 | """If given [weights], the covariance will be computed using those weights on the samples. 144 | Otherwise the simple covariance will be returned. 
145 | """ 146 | if weights is None: 147 | return np.cov(x) 148 | else: 149 | w = weights/weights.sum() ## Normalize the weights 150 | dmx = (x.T-(w*x).sum(1)).T ## Subtract the WEIGHTED mean 151 | wfact = 1/(1-(w**2).sum()) ## Compute the weighting factor 152 | return wfact*np.dot(w*dmx, dmx.T.conj()) ## Take the weighted inner product 153 | 154 | def test_weighted_cov(): 155 | """Runs a test on the weighted_cov function, creating a dataset for which the covariance is known 156 | for two different populations, and weights are used to reproduce the individual covariances. 157 | """ 158 | T = 1000 ## number of time points 159 | N = 100 ## A signals 160 | M = 100 ## B signals 161 | snr = 5 ## signal to noise ratio 162 | 163 | ## Create the two datasets 164 | siga = np.random.rand(T) 165 | noisea = np.random.rand(T, N) 166 | respa = (noisea.T+snr*siga).T 167 | 168 | sigb = np.random.rand(T) 169 | noiseb = np.random.rand(T, M) 170 | respb = (noiseb.T+snr*sigb).T 171 | 172 | ## Compute self-covariance matrixes 173 | cova = np.cov(respa) 174 | covb = np.cov(respb) 175 | 176 | ## Compute the full covariance matrix 177 | allresp = np.hstack([respa, respb]) 178 | fullcov = np.cov(allresp) 179 | 180 | ## Make weights that will recover individual covariances 181 | wta = np.ones([N+M,]) 182 | wta[N:] = 0 183 | 184 | wtb = np.ones([N+M,]) 185 | wtb[:N] = 0 186 | 187 | recova = weighted_cov(allresp, wta) 188 | recovb = weighted_cov(allresp, wtb) 189 | 190 | return locals() 191 | 192 | def fixPCs(orig, new): 193 | """Finds and fixes sign-flips in PCs by finding the coefficient with the greatest 194 | magnitude in the [orig] PCs, then negating the [new] PCs if that coefficient has 195 | a different sign. 196 | """ 197 | flipped = [] 198 | for o,n in zip(orig, new): 199 | maxind = np.abs(o).argmax() 200 | if o[maxind]*n[maxind]>0: 201 | ## Same sign, no need to flip 202 | flipped.append(n) 203 | else: 204 | ## Different sign, flip 205 | flipped.append(-n) 206 | 207 | return np.vstack(flipped) 208 | 209 | 210 | def plot_model_comparison(corrs1, corrs2, name1, name2, thresh=0.35): 211 | fig = figure(figsize=(8,8)) 212 | ax = fig.add_subplot(1,1,1) 213 | 214 | good1 = corrs1>thresh 215 | good2 = corrs2>thresh 216 | better1 = corrs1>corrs2 217 | #both = np.logical_and(good1, good2) 218 | neither = np.logical_not(np.logical_or(good1, good2)) 219 | only1 = np.logical_and(good1, better1) 220 | only2 = np.logical_and(good2, np.logical_not(better1)) 221 | 222 | ptalpha = 0.3 223 | ax.plot(corrs1[neither], corrs2[neither], 'ko', alpha=ptalpha) 224 | #ax.plot(corrs1[both], corrs2[both], 'go', alpha=ptalpha) 225 | ax.plot(corrs1[only1], corrs2[only1], 'ro', alpha=ptalpha) 226 | ax.plot(corrs1[only2], corrs2[only2], 'bo', alpha=ptalpha) 227 | 228 | lims = [-0.5, 1.0] 229 | 230 | ax.plot([thresh, thresh], [lims[0], thresh], 'r-') 231 | ax.plot([lims[0], thresh], [thresh,thresh], 'b-') 232 | 233 | ax.text(lims[0]+0.05, thresh, "$n=%d$"%np.sum(good2), horizontalalignment="left", verticalalignment="bottom") 234 | ax.text(thresh, lims[0]+0.05, "$n=%d$"%np.sum(good1), horizontalalignment="left", verticalalignment="bottom") 235 | 236 | ax.plot(lims, lims, '-', color="gray") 237 | ax.set_xlim(lims) 238 | ax.set_ylim(lims) 239 | ax.set_xlabel(name1) 240 | ax.set_ylabel(name2) 241 | 242 | show() 243 | return fig 244 | 245 | import matplotlib.colors 246 | bwr = matplotlib.colors.LinearSegmentedColormap.from_list("bwr", ((0.0, 0.0, 1.0), (1.0, 1.0, 1.0), (1.0, 0.0, 0.0))) 247 | bkr = 
matplotlib.colors.LinearSegmentedColormap.from_list("bkr", ((0.0, 0.0, 1.0), (0.0, 0.0, 0.0), (1.0, 0.0, 0.0))) 248 | bgr = matplotlib.colors.LinearSegmentedColormap.from_list("bgr", ((0.0, 0.0, 1.0), (0.5, 0.5, 0.5), (1.0, 0.0, 0.0))) 249 | 250 | def plot_model_comparison2(corrFile1, corrFile2, name1, name2, thresh=0.35): 251 | fig = figure(figsize=(9,10)) 252 | #ax = fig.add_subplot(3,1,[1,2], aspect="equal") 253 | ax = fig.add_axes([0.25, 0.4, 0.6, 0.5], aspect="equal") 254 | 255 | corrs1 = tables.openFile(corrFile1).root.semcorr.read() 256 | corrs2 = tables.openFile(corrFile2).root.semcorr.read() 257 | maxcorr = np.clip(np.vstack([corrs1, corrs2]).max(0), 0, thresh)/thresh 258 | corrdiff = (corrs1-corrs2) + 0.5 259 | colors = (bgr(corrdiff).T*maxcorr).T 260 | colors[:,3] = 1.0 ## Don't scale alpha 261 | 262 | ptalpha = 0.8 263 | ax.scatter(corrs1, corrs2, s=10, c=colors, alpha=ptalpha, edgecolors="none") 264 | lims = [-0.5, 1.0] 265 | 266 | ax.plot([thresh, thresh], [lims[0], thresh], color="gray") 267 | ax.plot([lims[0], thresh], [thresh,thresh], color="gray") 268 | 269 | good1 = corrs1>thresh 270 | good2 = corrs2>thresh 271 | ax.text(lims[0]+0.05, thresh, "$n=%d$"%np.sum(good2), horizontalalignment="left", verticalalignment="bottom") 272 | ax.text(thresh, lims[0]+0.05, "$n=%d$"%np.sum(good1), horizontalalignment="left", verticalalignment="bottom") 273 | 274 | ax.plot(lims, lims, '-', color="gray") 275 | ax.set_xlim(lims) 276 | ax.set_ylim(lims) 277 | ax.set_xlabel(name1+" model") 278 | ax.set_ylabel(name2+" model") 279 | 280 | fig.canvas.draw() 281 | show() 282 | ## Add over-under comparison 283 | #ax_left = ax.get_window_extent()._bbox.x0 284 | #ax_right = ax.get_window_extent()._bbox.x1 285 | #ax_width = ax_right-ax_left 286 | #print ax_left, ax_right 287 | #ax2 = fig.add_axes([ax_left, 0.1, ax_width, 0.2]) 288 | ax2 = fig.add_axes([0.25, 0.1, 0.6, 0.25])#, sharex=ax) 289 | #ax2 = fig.add_subplot(3, 1, 3) 290 | #plot_model_overunder_comparison(corrs1, corrs2, name1, name2, thresh=thresh, ax=ax2) 291 | plot_model_histogram_comparison(corrs1, corrs2, name1, name2, thresh=thresh, ax=ax2) 292 | 293 | fig.suptitle("Model comparison: %s vs. %s"%(name1, name2)) 294 | show() 295 | return fig 296 | 297 | 298 | def plot_model_overunder_comparison(corrs1, corrs2, name1, name2, thresh=0.35, ax=None): 299 | """Plots over-under difference between two models. 300 | """ 301 | if ax is None: 302 | fig = figure(figsize=(8,8)) 303 | ax = fig.add_subplot(1,1,1) 304 | 305 | maxcorr = max(corrs1.max(), corrs2.max()) 306 | vals = np.linspace(0, maxcorr, 500) 307 | overunder = lambda c: np.array([np.sum(c>v)-np.sum(c<-v) for v in vals]) 308 | 309 | ou1 = overunder(corrs1) 310 | ou2 = overunder(corrs2) 311 | 312 | oud = ou2-ou1 313 | 314 | ax.fill_between(vals, 0, np.clip(oud, 0, 1e9), facecolor="blue") 315 | ax.fill_between(vals, 0, np.clip(oud, -1e9, 0), facecolor="red") 316 | 317 | yl = np.max(np.abs(np.array(ax.get_ylim()))) 318 | ax.plot([thresh, thresh], [-yl, yl], '-', color="gray") 319 | ax.set_ylim(-yl, yl) 320 | ax.set_xlim(0, maxcorr) 321 | ax.set_xlabel("Voxel correlation") 322 | ax.set_ylabel("%s better %s better"%(name1, name2)) 323 | 324 | show() 325 | return ax 326 | 327 | def plot_model_histogram_comparison(corrs1, corrs2, name1, name2, thresh=0.35, ax=None): 328 | """Plots over-under difference between two models. 
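    Unlike plot_model_overunder_comparison, which sweeps 500 evenly spaced correlation
    thresholds, this version computes the over-under counts from a fixed 100-bin
    histogram and draws them as bars.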
329 | """ 330 | if ax is None: 331 | fig = figure(figsize=(8,8)) 332 | ax = fig.add_subplot(1,1,1) 333 | 334 | maxcorr = max(corrs1.max(), corrs2.max()) 335 | nbins = 100 336 | hist1 = np.histogram(corrs1, nbins, range=(-1,1)) 337 | hist2 = np.histogram(corrs2, nbins, range=(-1,1)) 338 | 339 | ouhist1 = hist1[0][nbins/2:]-hist1[0][:nbins/2][::-1] 340 | ouhist2 = hist2[0][nbins/2:]-hist2[0][:nbins/2][::-1] 341 | 342 | oud = ouhist2-ouhist1 343 | bwidth = 2.0/nbins 344 | barlefts = hist1[1][nbins/2:-1] 345 | 346 | #ax.fill_between(vals, 0, np.clip(oud, 0, 1e9), facecolor="blue") 347 | #ax.fill_between(vals, 0, np.clip(oud, -1e9, 0), facecolor="red") 348 | 349 | ax.bar(barlefts, np.clip(oud, 0, 1e9), bwidth, facecolor="blue") 350 | ax.bar(barlefts, np.clip(oud, -1e9, 0), bwidth, facecolor="red") 351 | 352 | yl = np.max(np.abs(np.array(ax.get_ylim()))) 353 | ax.plot([thresh, thresh], [-yl, yl], '-', color="gray") 354 | ax.set_ylim(-yl, yl) 355 | ax.set_xlim(0, maxcorr) 356 | ax.set_xlabel("Voxel correlation") 357 | ax.set_ylabel("%s better %s better"%(name1, name2)) 358 | 359 | show() 360 | return ax 361 | 362 | 363 | def plot_model_comparison_rois(corrs1, corrs2, name1, name2, roivoxels, roinames, thresh=0.35): 364 | """Plots model correlation comparisons per ROI. 365 | """ 366 | fig = figure() 367 | ptalpha = 0.3 368 | 369 | for ri in range(len(roinames)): 370 | ax = fig.add_subplot(4, 4, ri+1) 371 | ax.plot(corrs1[roivoxels[ri]], corrs2[roivoxels[ri]], 'bo', alpha=ptalpha) 372 | lims = [-0.3, 1.0] 373 | ax.plot(lims, lims, '-', color="gray") 374 | ax.set_xlim(lims) 375 | ax.set_ylim(lims) 376 | ax.set_title(roinames[ri]) 377 | 378 | show() 379 | return fig 380 | 381 | def save_table_file(filename, filedict): 382 | """Saves the variables in [filedict] in a hdf5 table file at [filename]. 383 | """ 384 | hf = tables.openFile(filename, mode="w", title="save_file") 385 | for vname, var in filedict.items(): 386 | hf.createArray("/", vname, var) 387 | hf.close() 388 | 389 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | #import scipy.stats 4 | import random 5 | import sys 6 | import os 7 | 8 | def zscore(mat, return_unzvals=False): 9 | """Z-scores the rows of [mat] by subtracting off the mean and dividing 10 | by the standard deviation. 11 | If [return_unzvals] is True, a matrix will be returned that can be used 12 | to return the z-scored values to their original state. 13 | """ 14 | zmat = np.empty(mat.shape, mat.dtype) 15 | unzvals = np.zeros((zmat.shape[0], 2), mat.dtype) 16 | for ri in range(mat.shape[0]): 17 | unzvals[ri,0] = np.std(mat[ri,:]) 18 | unzvals[ri,1] = np.mean(mat[ri,:]) 19 | zmat[ri,:] = (mat[ri,:]-unzvals[ri,1]) / (1e-10+unzvals[ri,0]) 20 | 21 | if return_unzvals: 22 | return zmat, unzvals 23 | 24 | return zmat 25 | 26 | def center(mat, return_uncvals=False): 27 | """Centers the rows of [mat] by subtracting off the mean, but doesn't 28 | divide by the SD. 29 | Can be undone like zscore. 
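    A quick sketch of the round trip (names as defined in this module):
        cmat, uncvals = center(mat, return_uncvals=True)
        orig = unzscore(cmat, uncvals)  # recovers mat (up to the 1e-10 epsilon in
                                        # unzscore), since the stored scale column is all ones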
30 | """ 31 | cmat = np.empty(mat.shape) 32 | uncvals = np.ones((mat.shape[0], 2)) 33 | for ri in range(mat.shape[0]): 34 | uncvals[ri,1] = np.mean(mat[ri,:]) 35 | cmat[ri,:] = mat[ri,:]-uncvals[ri,1] 36 | 37 | if return_uncvals: 38 | return cmat, uncvals 39 | 40 | return cmat 41 | 42 | def unzscore(mat, unzvals): 43 | """Un-Z-scores the rows of [mat] by multiplying by unzvals[:,0] (the standard deviations) 44 | and then adding unzvals[:,1] (the row means). 45 | """ 46 | unzmat = np.empty(mat.shape) 47 | for ri in range(mat.shape[0]): 48 | unzmat[ri,:] = mat[ri,:]*(1e-10+unzvals[ri,0])+unzvals[ri,1] 49 | return unzmat 50 | 51 | def ridge(A, b, alpha): 52 | """Performs ridge regression, estimating x in Ax=b with a regularization 53 | parameter of alpha. 54 | With $G=\alpha I(m_A)$, this function returns $W$ with: 55 | $W=(A^TA+G^TG)^{-1}A^Tb^T$ 56 | Tantamount to minimizing $||Ax-b||+||\alpha I||$. 57 | """ 58 | G = np.matrix(np.identity(A.shape[1]) * alpha) 59 | return np.dot(np.dot(np.linalg.inv(np.dot(A.T,A) + np.dot(G.T,G)), A.T), b.T) 60 | 61 | def model_voxels(Rstim, Pstim, Rresp, Presp, alpha): 62 | """Use ridge regression with regularization parameter [alpha] to model [Rresp] 63 | using [Rstim]. Correlation coefficients on the test set ([Presp] and [Pstim]) 64 | will be returned for each voxel, as well as the linear weights. 65 | """ 66 | print ("Z-scoring stimuli (with a flip)... (or not)") 67 | #zRstim = zscore(Rstim.T).T 68 | #zPstim = zscore(Pstim.T).T 69 | 70 | Rresp[np.isnan(Rresp)] = 0.0 71 | Presp[np.isnan(Presp)] = 0.0 72 | 73 | print ("Running ridge regression...") 74 | rwts = ridge(Rstim, Rresp.T, alpha) 75 | print ("Finding correlations...") 76 | pred = np.dot(Pstim, rwts) 77 | prednorms = np.apply_along_axis(np.linalg.norm, 0, pred) 78 | respnorms = np.apply_along_axis(np.linalg.norm, 0, Presp) 79 | correlations = np.array(np.sum(np.multiply(Presp, pred), 0)).squeeze()/(prednorms*respnorms) 80 | 81 | print ("Max correlation: %0.3f" % np.max(correlations)) 82 | print ("Skewness: %0.3f" % scipy.stats.skew(correlations)) 83 | return np.array(correlations), rwts 84 | 85 | def model_voxels_old(Rstim, Pstim, Rresp, Presp, alpha): 86 | """Use ridge regression with regularization parameter [alpha] to model [Rresp] 87 | using [Rstim]. Correlation coefficients on the test set ([Presp] and [Pstim]) 88 | will be returned for each voxel, as well as the linear weights. 
89 | """ 90 | print ("Z-scoring stimuli (with a flip)...") 91 | #zRstim = zscore(Rstim.T).T 92 | #zPstim = zscore(Pstim.T).T 93 | 94 | Rresp[np.isnan(Rresp)] = 0.0 95 | Presp[np.isnan(Presp)] = 0.0 96 | 97 | print ("Running ridge regression...") 98 | rwts = ridge(Rstim, Rresp.T, alpha) 99 | print ("Finding correlations...") 100 | correlations = [] 101 | for vi in range(Presp.shape[1]): 102 | rcorr = np.corrcoef(Presp[:,vi].T,np.array((np.matrix(Pstim) * np.matrix(rwts[:,vi]))).T)[0,1] 103 | correlations.append(rcorr) 104 | 105 | print ("Max correlation: %0.3f" % np.max(correlations)) 106 | print ("Skewness: %0.3f" % scipy.stats.skew(correlations)) 107 | return np.array(correlations), rwts 108 | 109 | def gaussianize(vec): 110 | """Uses a look-up table to force the values in [vec] to be gaussian.""" 111 | ranks = np.argsort(np.argsort(vec)) 112 | cranks = (ranks+1).astype(float)/(ranks.max()+2) 113 | vals = scipy.stats.norm.isf(1-cranks) 114 | zvals = vals/vals.std() 115 | return zvals 116 | 117 | def gaussianize_mat(mat): 118 | """Gaussianizes each column of [mat].""" 119 | gmat = np.empty(mat.shape) 120 | for ri in range(mat.shape[1]): 121 | gmat[:,ri] = gaussianize(mat[:,ri]) 122 | return gmat 123 | 124 | def make_delayed(stim, delays, circpad=False): 125 | """Creates non-interpolated concatenated delayed versions of [stim] with the given [delays] 126 | (in samples). 127 | 128 | If [circpad], instead of being padded with zeros, [stim] will be circularly shifted. 129 | """ 130 | nt,ndim = stim.shape 131 | dstims = [] 132 | for di,d in enumerate(delays): 133 | dstim = np.zeros((nt, ndim)) 134 | if d<0: ## negative delay 135 | dstim[:d,:] = stim[-d:,:] 136 | if circpad: 137 | dstim[d:,:] = stim[:-d,:] 138 | elif d>0: 139 | dstim[d:,:] = stim[:-d,:] 140 | if circpad: 141 | dstim[:d,:] = stim[-d:,:] 142 | else: ## d==0 143 | dstim = stim.copy() 144 | dstims.append(dstim) 145 | return np.hstack(dstims) 146 | 147 | def mult_diag(d, mtx, left=True): 148 | """Multiply a full matrix by a diagonal matrix. 149 | This function should always be faster than dot. 150 | 151 | Input: 152 | d -- 1D (N,) array (contains the diagonal elements) 153 | mtx -- 2D (N,N) array 154 | 155 | Output: 156 | mult_diag(d, mts, left=True) == dot(diag(d), mtx) 157 | mult_diag(d, mts, left=False) == dot(mtx, diag(d)) 158 | 159 | By Pietro Berkes 160 | From http://mail.scipy.org/pipermail/numpy-discussion/2007-March/026807.html 161 | """ 162 | if left: 163 | return (d*mtx.T).T 164 | else: 165 | return d*mtx 166 | 167 | import time 168 | import logging 169 | def counter(iterable, countevery=100, total=None, logger=logging.getLogger("counter")): 170 | """Logs a status and timing update to [logger] every [countevery] draws from [iterable]. 171 | If [total] is given, log messages will include the estimated time remaining. 
172 | """ 173 | start_time = time.time() 174 | 175 | ## Check if the iterable has a __len__ function, use it if no total length is supplied 176 | if total is None: 177 | if hasattr(iterable, "__len__"): 178 | total = len(iterable) 179 | 180 | for count, thing in enumerate(iterable): 181 | yield thing 182 | 183 | if not count%countevery: 184 | current_time = time.time() 185 | rate = float(count+1)/(current_time-start_time) 186 | 187 | if rate>1: ## more than 1 item/second 188 | ratestr = "%0.2f items/second"%rate 189 | else: ## less than 1 item/second 190 | ratestr = "%0.2f seconds/item"%(rate**-1) 191 | 192 | if total is not None: 193 | remitems = total-(count+1) 194 | remtime = remitems/rate 195 | timestr = ", %s remaining" % time.strftime('%H:%M:%S', time.gmtime(remtime)) 196 | itemstr = "%d/%d"%(count+1, total) 197 | else: 198 | timestr = "" 199 | itemstr = "%d"%(count+1) 200 | 201 | formatted_str = "%s items complete (%s%s)"%(itemstr,ratestr,timestr) 202 | if logger is None: 203 | print (formatted_str) 204 | else: 205 | logger.info(formatted_str) 206 | 207 | 208 | def wait_for_disk(dir, maxtime=0.2, retrytime=10.0, maxtries=100): 209 | """Waits to continue until disk is not slammed. 210 | """ 211 | for trynum in range(maxtries): 212 | stime = time.time() 213 | os.listdir(dir) 214 | lstime = time.time() - stime 215 | if lstime < maxtime: 216 | print ("Disk access is quick (%0.3f seconds to ls), continuing.." % lstime) 217 | return 218 | else: 219 | print ("Disk access is slow (%0.3f seconds to ls), waiting more.." % lstime) 220 | time.sleep(retrytime) 221 | 222 | print ("Disk access is slow but fuck it, I'm starting anyway..") 223 | --------------------------------------------------------------------------------