├── .gitignore ├── README.md ├── coco_proc.py ├── config.py ├── gen ├── __init__.py ├── coco_dev_reference0 ├── coco_dev_reference1 ├── coco_dev_reference2 ├── coco_dev_reference3 ├── coco_dev_reference4 ├── coco_reference0 ├── coco_reference1 ├── coco_reference2 ├── coco_reference3 ├── coco_reference4 └── multi-bleu.perl ├── lm ├── __init__.py ├── mlbl.py └── mlblf.py ├── models └── __init__.py ├── tester.py ├── trainer.py └── utils ├── __init__.py ├── lm_tools.py └── svd_tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # multimodal-neural-language-models 2 | 3 | A bare-bones NumPy implementation of "Multimodal Neural Language Models" (Kiros et al, ICML 2014), containing additive and multiplicative log-bilinear image caption generators. These models differ from most other image caption generators in that they do not use recurrent neural networks. 4 | 5 | This code may be useful to you if you're looking for a simple, bare-bones image caption generator that can be trained on the CPU. It may also be useful for teaching purposes. This code was used as part of an assignment for the undergraduate neural networks class at the University of Toronto. 6 | 7 | On MSCOCO using VGG19 features, a single model can achieve BLEU4 score of 25. An ensemble can achieve near 27. For comparison, a "Show and Tell" LSTM with the same features achieves a score of 27.x. The state of the art is currently around 34. Thus these models are quite far from the current state of the art. I am releasing this code for completeness as part of my PhD thesis. 8 | 9 | ## Visualization 10 | 11 | Here are [results](http://www.cs.toronto.edu/~rkiros/bayescapgen.html) on 1000 images using an ensemble of additive log-bilinear models trained using this code. 12 | 13 | ## Dependencies 14 | 15 | This code is written in python. 
To use it you will need: 16 | 17 | * Python 2.7 18 | * A recent version of [NumPy](http://www.numpy.org/) and [SciPy](http://www.scipy.org/) 19 | 20 | ## Quickstart for Toronto users 21 | 22 | To train an additive log-bilinear model with the default settings, open IPython and run the following: 23 | 24 | import coco_proc, trainer 25 | z, zd, zt = coco_proc.process(context=5) 26 | trainer.trainer(z, zd) 27 | 28 | this will store trained models in the models directory and periodically compute BLEU using the Perl code and reference captions in the gen directory. All the hyperparameters settings can be tuned in trainer.py. Links to MSCOCO data are in config.py. 29 | 30 | ## Getting started 31 | 32 | You will first need to download the pre-processed MSCOCO data. All necessary files can be downloaded by running: 33 | 34 | wget http://www.cs.toronto.edu/~rkiros/data/mnlm.zip 35 | 36 | After unpacking, open config.py and set the paths accordingly. Then you can proceed to the quickstart instructions. All training settings can be found in trainer.py. Testing trained models is done with tester.py. The lm directory contains classes for the additive and multiplicative log-bilinear models. Helper functions, such as beam search, is found in the utils directory. 37 | 38 | ## Reference 39 | 40 | If you found this code useful, please cite the following paper: 41 | 42 | Ryan Kiros, Ruslan Salakhutdinov, Richard S. Zemel. **"Multimodal Neural Language Models."** *ICML (2014).* 43 | 44 | @inproceedings{kiros2014multimodal, 45 | title={Multimodal Neural Language Models.}, 46 | author={Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S}, 47 | booktitle={ICML}, 48 | volume={14}, 49 | pages={595--603}, 50 | year={2014} 51 | } 52 | 53 | ## License 54 | 55 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) 56 | 57 | 58 | -------------------------------------------------------------------------------- /coco_proc.py: -------------------------------------------------------------------------------- 1 | # Pre-process MSCOCO 2 | 3 | import config 4 | import numpy as np 5 | import os 6 | import json 7 | import re 8 | from utils import lm_tools 9 | from collections import Counter 10 | from collections import defaultdict 11 | from scipy.sparse import lil_matrix, sparsetools, csr_matrix 12 | from numpy.random import RandomState 13 | 14 | def process(context=5): 15 | """ 16 | Main process function 17 | """ 18 | # Load images 19 | print 'Loading images...' 20 | (trainIM, devIM, testIM) = load_features_npy() 21 | 22 | # Load sentences 23 | print 'Loading sentences...' 24 | d = load_sentences() 25 | 26 | # Load image ids 27 | print 'Loading image ids...' 28 | (dx_train, dx_dev) = image_ids() 29 | 30 | # Load splits 31 | print 'Loading splits...' 32 | (train_sp, dev_sp, test_sp) = load_splits() 33 | 34 | # Load captions 35 | print 'Loading captions...' 36 | train = construct_captions(d, train_sp) 37 | dev = construct_captions(d, dev_sp) 38 | test = construct_captions(d, test_sp) 39 | 40 | # Tokenize 41 | (train_tokens, topwords) = tokenize(train, context=context) 42 | dev_tokens = tokenize(dev, context=context, topwords=topwords)[0] 43 | test_tokens = tokenize(test, context=context, topwords=topwords)[0] 44 | 45 | # Index words and create vocabulary 46 | print 'Creating vocabulary...' 47 | (word_dict, index_dict) = index_words(train_tokens + dev_tokens) 48 | 49 | # Compute n-grams 50 | print 'Computing n-grams...' 
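# Each padded token list is cut into sliding windows of length context+1 by
# lm_tools.get_ngrams: the first `context` entries of a window are the
# conditioning words and the final entry is the word to be predicted.
# Illustrative example (not taken from the data): with context=5, a caption
# tokenized as five start-padding tokens followed by ['a', 'dog', 'runs'] and
# one end token yields the 6-grams (pad, pad, pad, pad, pad, 'a'),
# (pad, pad, pad, pad, 'a', 'dog'), and so on, one per target word.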
51 | train_ngrams = lm_tools.get_ngrams(train_tokens, context=context) 52 | dev_ngrams = lm_tools.get_ngrams(dev_tokens, context=context) 53 | test_ngrams = lm_tools.get_ngrams(test_tokens, context=context) 54 | 55 | # Compute sparse label matrix 56 | print 'Computing labels...' 57 | train_labels = compute_labels(train_ngrams, word_dict, context=context) 58 | dev_labels = compute_labels(dev_ngrams, word_dict, context=context) 59 | 60 | # Compute model instances 61 | print 'Computing model instances...' 62 | (train_instances, train_index) = lm_tools.model_inputs(train_ngrams, word_dict, 63 | context=context, include_last=False, include_index=True) 64 | (dev_instances, dev_index) = lm_tools.model_inputs(dev_ngrams, word_dict, 65 | context=context, include_last=False, include_index=True) 66 | (test_instances, test_index) = lm_tools.model_inputs(test_ngrams, word_dict, 67 | context=context, include_last=False, include_index=True) 68 | 69 | # Save everything into dictionaries 70 | print 'Packing up...' 71 | z = {} 72 | z['text'] = train 73 | z['tokens'] = train_tokens 74 | z['word_dict'] = word_dict 75 | z['index_dict'] = index_dict 76 | z['ngrams'] = train_ngrams 77 | z['labels'] = train_labels 78 | z['instances'] = train_instances 79 | z['IM'] = trainIM 80 | z['index'] = train_index 81 | z['context'] = context 82 | 83 | zd = {} 84 | zd['text'] = dev 85 | zd['tokens'] = dev_tokens 86 | zd['ngrams'] = dev_ngrams 87 | zd['labels'] = dev_labels 88 | zd['instances'] = dev_instances 89 | zd['IM'] = devIM 90 | zd['index'] = dev_index 91 | zd['context'] = context 92 | 93 | zt = {} 94 | zt['text'] = test 95 | zt['tokens'] = test_tokens 96 | zt['ngrams'] = test_ngrams 97 | zt['instances'] = test_instances 98 | zt['IM'] = testIM 99 | zt['index'] = test_index 100 | zt['context'] = context 101 | 102 | return (z, zd, zt) 103 | 104 | def load_json(): 105 | """ 106 | Load the JSON annotations 107 | """ 108 | # Load the training sentences 109 | f = open(config.paths['sentences_train2014']) 110 | train_data = json.load(f) 111 | f.close() 112 | 113 | # Load the validation sentences 114 | f = open(config.paths['sentences_val2014']) 115 | val_data = json.load(f) 116 | f.close() 117 | 118 | return (train_data, val_data) 119 | 120 | def uniq(seq): 121 | seen = set() 122 | seen_add = seen.add 123 | return [ x for x in seq if x not in seen and not seen_add(x)] 124 | 125 | def load_features_npy(): 126 | """ 127 | Load directly from numpy files 128 | """ 129 | trainIM = np.load(config.paths['train']) 130 | devIM = np.load(config.paths['dev']) 131 | testIM = np.load(config.paths['test']) 132 | return (trainIM, devIM, testIM) 133 | 134 | def load_splits(): 135 | """ 136 | Load train/dev/test splits 137 | """ 138 | (train, dev, test) = ([], [], []) 139 | f = open(config.paths['coco_train']) 140 | for line in f: 141 | train.append(int(line.strip()[:-4][-12:])) 142 | f.close() 143 | f = open(config.paths['coco_val']) 144 | for line in f: 145 | dev.append(int(line.strip()[:-4][-12:])) 146 | f.close() 147 | f = open(config.paths['coco_test']) 148 | for line in f: 149 | test.append(int(line.strip()[:-4][-12:])) 150 | f.close() 151 | return (train, dev, test) 152 | 153 | def image_ids(): 154 | """ 155 | Return a dictionary mapping image features to their IDs 156 | """ 157 | dx_train = {} 158 | dx_dev = {} 159 | count = 0 160 | (train_data, val_data) = load_json() 161 | 162 | # Part-1: COCO training data 163 | tr = [] 164 | for x in train_data['images']: 165 | tr.append(x['id']) 166 | tr = sorted(tr) 167 | for i, x in 
enumerate(tr): 168 | dx_train[x] = count 169 | count += 1 170 | 171 | # Part-2: COCO validation data 172 | count = 0 173 | va = [] 174 | for x in val_data['images']: 175 | va.append(x['id']) 176 | va = sorted(va) 177 | for i, x in enumerate(va): 178 | dx_dev[x] = count 179 | count += 1 180 | 181 | return (dx_train, dx_dev) 182 | 183 | def load_sentences(): 184 | """ 185 | Return a dictionary of image ids to sentences 186 | """ 187 | (train_data, val_data) = load_json() 188 | 189 | # Populate the dictionary 190 | d = defaultdict(list) 191 | for x in train_data['sentences']: 192 | image_id = x['image_id'] 193 | sentence = x['sentence'] 194 | d[image_id].append(sentence) 195 | for x in val_data['sentences']: 196 | image_id = x['image_id'] 197 | sentence = x['sentence'] 198 | d[image_id].append(sentence) 199 | return d 200 | 201 | def construct_captions(d, ids): 202 | """ 203 | Construct captions for entries in ids 204 | """ 205 | X = [] 206 | for x in ids: 207 | captions = d[x] 208 | for s in captions[:5]: 209 | X.append(s) 210 | return X 211 | 212 | def word_tokenize(text): 213 | """ 214 | Perform word tokenization (from NLTK) 215 | """ 216 | CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"), 217 | re.compile(r"(?i)\b(d)('ye)\b"), 218 | re.compile(r"(?i)\b(gim)(me)\b"), 219 | re.compile(r"(?i)\b(gon)(na)\b"), 220 | re.compile(r"(?i)\b(got)(ta)\b"), 221 | re.compile(r"(?i)\b(lem)(me)\b"), 222 | re.compile(r"(?i)\b(mor)('n)\b"), 223 | re.compile(r"(?i)\b(wan)(na) ")] 224 | CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"), 225 | re.compile(r"(?i) ('t)(was)\b")] 226 | 227 | #starting quotes 228 | text = re.sub(r'^\"', r'``', text) 229 | text = re.sub(r'(``)', r' \1 ', text) 230 | text = re.sub(r'([ (\[{<])"', r'\1 `` ', text) 231 | 232 | #punctuation 233 | text = re.sub(r'([:,])([^\d])', r' \1 \2', text) 234 | text = re.sub(r'\.\.\.', r' ... ', text) 235 | text = re.sub(r'[;@#$%&]', r' \g<0> ', text) 236 | text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text) 237 | text = re.sub(r'[?!]', r' \g<0> ', text) 238 | text = re.sub(r"([^'])' ", r"\1 ' ", text) 239 | 240 | #parens, brackets, etc. 
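# (each bracket or brace matched below is padded with spaces so that it
# becomes its own token)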
241 | text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text) 242 | text = re.sub(r'--', r' -- ', text) 243 | 244 | #add extra space to make things easier 245 | text = " " + text + " " 246 | 247 | #ending quotes 248 | text = re.sub(r'"', " '' ", text) 249 | text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text) 250 | 251 | text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text) 252 | text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", 253 | text) 254 | 255 | for regexp in CONTRACTIONS2: 256 | text = regexp.sub(r' \1 \2 ', text) 257 | for regexp in CONTRACTIONS3: 258 | text = regexp.sub(r' \1 \2 ', text) 259 | 260 | return text.split() 261 | 262 | def tokenize(X, context=5, start='', end='', topwords=None): 263 | """ 264 | Tokenize each of the captions 265 | """ 266 | tokens = [word_tokenize(x) for x in X] 267 | if topwords == None: 268 | word_counts = get_counts(tokens) 269 | topwords = [w for w in word_counts.keys() if word_counts[w] >= 5] 270 | topwords += ['unk'] 271 | d = defaultdict(lambda : 0) 272 | for w in topwords: 273 | d[w] = 1 274 | tokens = [[w if d[w] > 0 else 'unk' for w in t] for t in tokens] 275 | for i, x in enumerate(tokens): 276 | tokens[i] = [start] * context + x + [end] 277 | return (tokens, topwords) 278 | 279 | def get_counts(tokens): 280 | """ 281 | Compute a dictionary of counts from tokens 282 | """ 283 | flat_tokens = [item for sublist in tokens for item in sublist] 284 | word_counts = Counter(flat_tokens) 285 | return word_counts 286 | 287 | def index_words(tokens): 288 | """ 289 | Compute dictionaries for indexing words 290 | """ 291 | flat_tokens = [item for sublist in tokens for item in sublist] 292 | word_dict = {} 293 | for i, w in enumerate(list(set(flat_tokens))): 294 | word_dict[w] = i 295 | index_dict = dict((v,k) for k, v in word_dict.iteritems()) 296 | return (word_dict, index_dict) 297 | 298 | def compute_labels(ngrams, word_dict, context=5): 299 | """ 300 | Create matrix of word occurences (labels for the model) 301 | """ 302 | ngrams_count = [len(x) for x in ngrams] 303 | uniq_ngrams = uniq([item[:-1] for sublist in ngrams for item in sublist]) 304 | count = 0 305 | train_dict = {} 306 | for w in uniq_ngrams: 307 | train_dict[w] = count 308 | count = count + 1 309 | 310 | labels = lil_matrix((sum(ngrams_count), len(word_dict.keys()))) 311 | train_ngrams_flat = [item for sublist in ngrams for item in sublist] 312 | labels_dict = defaultdict(int) 313 | col_dict = defaultdict(list) 314 | 315 | for w in train_ngrams_flat: 316 | row_ind = train_dict[w[:context]] 317 | col_ind = word_dict[w[-1]] 318 | labels_dict[(row_ind, col_ind)] += 1 319 | col_dict[row_ind] = list(set(col_dict[row_ind] + [col_ind])) 320 | 321 | count = 0 322 | for x in ngrams: 323 | for w in x: 324 | row_ind = train_dict[w[:context]] 325 | inds = col_dict[(row_ind)] 326 | labels[count, word_dict[w[-1]]] = 1 327 | count = count + 1 328 | 329 | labels_un = labels.tocsr() 330 | return labels_un 331 | 332 | 333 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dataset configuration 3 | """ 4 | #-----------------------------------------------------------------------------# 5 | # Paths to MSCOCO 6 | #-----------------------------------------------------------------------------# 7 | paths = dict() 8 | 9 | # JSON annotations 10 | paths['sentences_train2014'] = '/ais/gobi3/u/rkiros/coco/annotations/sentences_train2014.json' 11 | 
paths['sentences_val2014'] = '/ais/gobi3/u/rkiros/coco/annotations/sentences_val2014.json' 12 | 13 | # VGG19 features 14 | paths['train'] = '/ais/gobi3/u/rkiros/coco/splits/train.npy' 15 | paths['dev'] = '/ais/gobi3/u/rkiros/coco/splits/dev.npy' 16 | paths['test'] = '/ais/gobi3/u/rkiros/coco/splits/test.npy' 17 | 18 | # Data splits 19 | paths['coco_train'] = '/ais/gobi3/u/rkiros/coco/coco_train.txt' 20 | paths['coco_val'] = '/ais/gobi3/u/rkiros/coco/coco_val.txt' 21 | paths['coco_test'] = '/ais/gobi3/u/rkiros/coco/coco_test.txt' 22 | 23 | # Word embeddings 24 | paths['embedding'] = '/ais/gobi3/u/rkiros/iaprtc12/embeddings-scaled.EMBEDDING_SIZE=100.txt' 25 | 26 | 27 | -------------------------------------------------------------------------------- /gen/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | 5 | -------------------------------------------------------------------------------- /gen/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | # $Id$ 4 | use strict; 5 | 6 | my $lowercase = 0; 7 | if ($ARGV[0] eq "-lc") { 8 | $lowercase = 1; 9 | shift; 10 | } 11 | 12 | my $stem = $ARGV[0]; 13 | if (!defined $stem) { 14 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 15 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 16 | exit(1); 17 | } 18 | 19 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 20 | 21 | my @REF; 22 | my $ref=0; 23 | while(-e "$stem$ref") { 24 | &add_to_ref("$stem$ref",\@REF); 25 | $ref++; 26 | } 27 | &add_to_ref($stem,\@REF) if -e $stem; 28 | die("ERROR: could not find reference file $stem") unless scalar @REF; 29 | 30 | sub add_to_ref { 31 | my ($file,$REF) = @_; 32 | my $s=0; 33 | open(REF,$file) or die "Can't read $file"; 34 | while() { 35 | chop; 36 | push @{$$REF[$s++]}, $_; 37 | } 38 | close(REF); 39 | } 40 | 41 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 42 | my $s=0; 43 | while() { 44 | chop; 45 | $_ = lc if $lowercase; 46 | my @WORD = split; 47 | my %REF_NGRAM = (); 48 | my $length_translation_this_sentence = scalar(@WORD); 49 | my ($closest_diff,$closest_length) = (9999,9999); 50 | foreach my $reference (@{$REF[$s]}) { 51 | # print "$s $_ <=> $reference\n"; 52 | $reference = lc($reference) if $lowercase; 53 | my @WORD = split(' ',$reference); 54 | my $length = scalar(@WORD); 55 | my $diff = abs($length_translation_this_sentence-$length); 56 | if ($diff < $closest_diff) { 57 | $closest_diff = $diff; 58 | $closest_length = $length; 59 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 60 | } elsif ($diff == $closest_diff) { 61 | $closest_length = $length if $length < $closest_length; 62 | # from two references with the same closeness to me 63 | # take the *shorter* into account, not the "first" one. 64 | } 65 | for(my $n=1;$n<=4;$n++) { 66 | my %REF_NGRAM_N = (); 67 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 68 | my $ngram = "$n"; 69 | for(my $w=0;$w<$n;$w++) { 70 | $ngram .= " ".$WORD[$start+$w]; 71 | } 72 | $REF_NGRAM_N{$ngram}++; 73 | } 74 | foreach my $ngram (keys %REF_NGRAM_N) { 75 | if (!defined($REF_NGRAM{$ngram}) || 76 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 77 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 78 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 79 | } 80 | } 81 | } 82 | } 83 | $length_translation += $length_translation_this_sentence; 84 | $length_reference += $closest_length; 85 | for(my $n=1;$n<=4;$n++) { 86 | my %T_NGRAM = (); 87 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 88 | my $ngram = "$n"; 89 | for(my $w=0;$w<$n;$w++) { 90 | $ngram .= " ".$WORD[$start+$w]; 91 | } 92 | $T_NGRAM{$ngram}++; 93 | } 94 | foreach my $ngram (keys %T_NGRAM) { 95 | $ngram =~ /^(\d+) /; 96 | my $n = $1; 97 | # my $corr = 0; 98 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 99 | $TOTAL[$n] += $T_NGRAM{$ngram}; 100 | if (defined($REF_NGRAM{$ngram})) { 101 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 102 | $CORRECT[$n] += $T_NGRAM{$ngram}; 103 | # $corr = $T_NGRAM{$ngram}; 104 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 105 | } 106 | else { 107 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 108 | # $corr = $REF_NGRAM{$ngram}; 109 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 110 | } 111 | } 112 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 113 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 114 | } 115 | } 116 | $s++; 117 | } 118 | my $brevity_penalty = 1; 119 | my $bleu = 0; 120 | 121 | my @bleu=(); 122 | 123 | for(my $n=1;$n<=4;$n++) { 124 | if (defined ($TOTAL[$n])){ 125 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 126 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 127 | }else{ 128 | $bleu[$n]=0; 129 | } 130 | } 131 | 132 | if ($length_reference==0){ 133 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 134 | exit(1); 135 | } 136 | 137 | #if ($length_translation<$length_reference) { 138 | # $brevity_penalty = exp(1-$length_reference/$length_translation); 139 | #} 140 | 141 | #$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 142 | # my_log( $bleu[2] ) + 143 | # my_log( $bleu[3] ) + 144 | # my_log( $bleu[4] ) ) / 4) ; 145 | 146 | my $bleu_1 = $brevity_penalty * exp((my_log( $bleu[1] ))); 147 | 148 | my $bleu_2 = $brevity_penalty * exp((my_log( $bleu[1] ) + 149 | my_log( $bleu[2] ) ) / 2) ; 150 | 151 | my $bleu_3 = $brevity_penalty * exp((my_log( $bleu[1] ) + 152 | my_log( $bleu[2] ) + 153 | my_log( $bleu[3] ) ) / 3) ; 154 | 155 | my $bleu_4 = $brevity_penalty * exp((my_log( $bleu[1] ) + 156 | my_log( $bleu[2] ) + 157 | my_log( $bleu[3] ) + 158 | my_log( $bleu[4] ) ) / 4) ; 159 | 160 | printf "BLEU = %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu_1, 162 | 100*$bleu_2, 163 | 100*$bleu_3, 164 | 100*$bleu_4, 165 | $brevity_penalty, 166 | $length_translation / $length_reference, 167 | $length_translation, 168 | $length_reference; 169 | 170 | sub my_log { 171 | return -9999999999 unless $_[0]; 172 | return log($_[0]); 173 | } 174 | 175 | 176 | -------------------------------------------------------------------------------- /lm/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | 5 | -------------------------------------------------------------------------------- /lm/mlbl.py: -------------------------------------------------------------------------------- 1 | # Additive multimodal log-bilinear model 2 | 3 | import numpy as np 4 | import sys 5 | from utils import lm_tools 6 | from scipy.optimize import check_grad 7 | from scipy.sparse import vstack 8 | from numpy.random import RandomState 9 | import time 10 | 11 | class MLBL(object): 12 | """ 13 | Multimodal Log-bilinear language model trained using SGD 14 | """ 15 | def __init__(self, 16 | name='lbl', 17 | loc='models/mlbl.pkl', 18 | seed=1234, 19 | dropout=0.0, 20 | k=5, 21 | V=10364, 22 | K=50, 23 | D=10, 24 | h=3, 25 | context=5, 26 | batchsize=20, 27 | maxepoch=10, 28 | eta_t=0.2, 29 | gamma_r=0.0, 30 | gamma_c=0.0, 31 | f=0.995, 32 | p_i=0.5, 33 | p_f=0.5, 34 | T=20, 35 | verbose=1): 36 | """ 37 | name: name of the network 38 | loc: location to save model files 39 | seed: random seed 40 | dropout: probability of dropout 41 | k: validation interval before stopping 42 | V: vocabulary size 43 | K: embedding dimensionality 44 | D: dimensionality of the image features 45 | h: intermediate layer dimensionality 46 | context: word context length 47 | batchsize: size of the minibatches 48 | maxepoch: max number of training epochs 49 | eta_t: learning rate 50 | gamma_r: weight decay for representations 51 | gamma_c: weight decay for contexts 52 | f: learning rate decay 53 | p_i: initial momentum 
54 | p_f: final momentum 55 | T: number of epochs until p_f is reached (linearly) 56 | verbose: display progress 57 | """ 58 | self.name = name 59 | self.loc = loc 60 | self.dropout = dropout 61 | self.seed = seed 62 | self.k = k 63 | self.V = V 64 | self.K = K 65 | self.D = D 66 | self.h = h 67 | self.context = context 68 | self.batchsize = batchsize 69 | self.maxepoch = maxepoch 70 | self.eta_t = eta_t 71 | self.gamma_r = gamma_r 72 | self.gamma_c = gamma_c 73 | self.f = f 74 | self.p_i = p_i 75 | self.p_f = p_f 76 | self.T = T 77 | self.verbose = verbose 78 | self.p_t = (1 - (1 / T)) * p_i + (1 / T) * p_f 79 | 80 | def init_params(self, embed_map, count_dict): 81 | """ 82 | Initializes embeddings and context matricies 83 | """ 84 | prng = RandomState(self.seed) 85 | 86 | # Pre-trained word embedding matrix 87 | if embed_map != None: 88 | R = np.zeros((self.K, self.V)) 89 | for i in range(self.V): 90 | word = count_dict[i] 91 | if word in embed_map: 92 | R[:,i] = embed_map[word] 93 | else: 94 | R[:,i] = embed_map['*UNKNOWN*'] 95 | else: 96 | r = np.sqrt(6) / np.sqrt(self.K + self.V + 1) 97 | R = prng.rand(self.K, self.V) * 2 * r - r 98 | bw = np.zeros((1, self.V)) 99 | 100 | # Context 101 | C = 0.01 * prng.randn(self.context, self.K, self.K) 102 | 103 | # Image context 104 | M = 0.01 * prng.randn(self.h, self.K) 105 | 106 | # Hidden layer 107 | r = np.sqrt(6) / np.sqrt(self.D + self.h + 1) 108 | J = prng.rand(self.D, self.h) * 2 * r - r 109 | bj = np.zeros((1, self.h)) 110 | 111 | # Initial deltas used for SGD 112 | deltaR = np.zeros(np.shape(R)) 113 | deltaC = np.zeros(np.shape(C)) 114 | deltaB = np.zeros(np.shape(bw)) 115 | deltaM = np.zeros(np.shape(M)) 116 | deltaJ = np.zeros(np.shape(J)) 117 | deltaBj = np.zeros(np.shape(bj)) 118 | 119 | # Pack up 120 | self.R = R 121 | self.C = C 122 | self.bw = bw 123 | self.M = M 124 | self.J = J 125 | self.bj = bj 126 | self.deltaR = deltaR 127 | self.deltaC = deltaC 128 | self.deltaB = deltaB 129 | self.deltaM = deltaM 130 | self.deltaJ = deltaJ 131 | self.deltaBj = deltaBj 132 | 133 | def forward(self, X, Im, test=True): 134 | """ 135 | Feed-forward pass through the model 136 | """ 137 | batchsize = X.shape[0] 138 | 139 | # Forwardprop images 140 | IF = np.dot(Im, self.J) + self.bj 141 | IF = IF * (IF > 0) 142 | 143 | # Dropout (if applicable) 144 | if self.dropout > 0 and not test: 145 | dropmask = np.random.rand(batchsize, self.h) > self.dropout 146 | IF = IF * dropmask 147 | 148 | # Obtain word features 149 | tmp = self.R[:,X.flatten()].flatten(order='F').reshape((batchsize, self.K * self.context)) 150 | words = np.zeros((batchsize, self.K, self.context)) 151 | for i in range(batchsize): 152 | words[i] = tmp[i].reshape((self.K, self.context), order='F') 153 | 154 | # Compute the hidden layer (predicted next word representation) 155 | acts = np.zeros((batchsize, self.K)) 156 | for i in range(self.context): 157 | acts += np.dot(words[:,:,i], self.C[i]) 158 | if test: 159 | acts += np.dot(IF, self.M * (1 - self.dropout)) 160 | else: 161 | acts += np.dot(IF, self.M) 162 | 163 | # Compute softmax 164 | preds = np.dot(acts, self.R) + self.bw 165 | preds = np.exp(preds - preds.max(1).reshape(batchsize, 1)) 166 | preds /= preds.sum(1).reshape(batchsize, 1) 167 | 168 | return (words, acts, IF, preds) 169 | 170 | def objective(self, Y, preds): 171 | """ 172 | Compute the objective function 173 | """ 174 | batchsize = Y.shape[0] 175 | 176 | # Cross-entropy 177 | C = -np.sum(Y.multiply(np.log(preds + 1e-20))) / batchsize 178 | return C 179 | 180 | def 
backward(self, Y, preds, IF, acts, words, X, Im): 181 | """ 182 | Backward pass through the network 183 | """ 184 | batchsize = preds.shape[0] 185 | 186 | # Compute part of df/dR 187 | Y = np.array(Y.todense()) 188 | Ix = (preds - Y) / batchsize 189 | dR = np.dot(acts.T, Ix) 190 | db = np.sum(Ix, 0) 191 | 192 | # Compute df/dC and word inputs for df/dR 193 | Ix = np.dot(Ix, self.R.T) 194 | dC = np.zeros(np.shape(self.C)) 195 | for i in range(self.context): 196 | dC[i] = np.dot(words[:,:,i].T, Ix) 197 | delta = np.dot(Ix, self.C[i].T) 198 | for j in range(X.shape[0]): 199 | dR[:,X[j,i]] += delta.T[:,j] 200 | 201 | # Compute df/dM 202 | dM = np.dot(IF.T, Ix) 203 | 204 | # Compute df/dJ 205 | Ix = np.multiply(np.dot(Ix, self.M.T), (IF > 0)) 206 | dJ = np.dot(Im.T, Ix) 207 | dBj = np.sum(Ix, 0) 208 | 209 | # Weight decay terms 210 | dR += self.gamma_r * self.R 211 | dC += self.gamma_c * self.C 212 | dM += self.gamma_c * self.M 213 | dJ += self.gamma_c * self.J 214 | 215 | # Pack 216 | self.dR = dR 217 | self.dM = dM 218 | self.db = db 219 | self.dC = dC 220 | self.dJ = dJ 221 | self.dBj = dBj 222 | 223 | def update_params(self, X): 224 | """ 225 | Update the network parameters using the computed gradients 226 | """ 227 | batchsize = X.shape[0] 228 | self.deltaC = self.p_t * self.deltaC - \ 229 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dC 230 | self.deltaR = self.p_t * self.deltaR - \ 231 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dR 232 | self.deltaB = self.p_t * self.deltaB - \ 233 | (1 - self.p_t) * (self.eta_t / batchsize) * self.db 234 | self.deltaM = self.p_t * self.deltaM - \ 235 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dM 236 | self.deltaJ = self.p_t * self.deltaJ - \ 237 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dJ 238 | self.deltaBj = self.p_t * self.deltaBj - \ 239 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dBj 240 | 241 | self.C += self.deltaC 242 | self.R += self.deltaR 243 | self.bw += self.deltaB 244 | self.M += self.deltaM 245 | self.J += self.deltaJ 246 | self.bj += self.deltaBj 247 | 248 | def update_hyperparams(self): 249 | """ 250 | Updates the learning rate and momentum schedules 251 | """ 252 | self.eta_t *= self.f 253 | if self.step < self.T: 254 | self.p_t = (1 - ((self.step + 1) / self.T)) * self.p_i + \ 255 | ((self.step + 1) / self.T) * self.p_f 256 | else: 257 | self.p_t = self.p_f 258 | 259 | def compute_obj(self, X, Im, Y): 260 | """ 261 | Perform a forward pass and compute the objective 262 | """ 263 | preds = self.forward(X, Im)[-1] 264 | obj = self.objective(Y, preds) 265 | return obj 266 | 267 | def train(self, X, indX, XY, V, indV, VY, IM, VIM, count_dict, word_dict, embed_map, prog): 268 | """ 269 | Trains the LBL 270 | """ 271 | self.start = self.seed 272 | self.init_params(embed_map, count_dict) 273 | self.step = 0 274 | inds = np.arange(len(X)) 275 | numbatches = len(inds) / self.batchsize 276 | tic = time.time() 277 | bleu = [0.0]*4 278 | best = 0.0 279 | scores = '/'.join([str(b) for b in bleu]) 280 | patience = 10 281 | count = 0 282 | done = False 283 | 284 | # Main loop 285 | lm_tools.display_phase(1) 286 | for epoch in range(self.maxepoch): 287 | if done: 288 | break 289 | self.epoch = epoch 290 | prng = RandomState(self.seed + epoch + 1) 291 | prng.shuffle(inds) 292 | for minibatch in range(numbatches): 293 | 294 | batchX = X[inds[minibatch::numbatches]] 295 | batchY = XY[inds[minibatch::numbatches]] 296 | batchindX = indX[inds[minibatch::numbatches]].astype(int).flatten() 297 | batchindX = 
np.floor(batchindX/5).astype(int) 298 | batchIm = IM[batchindX] 299 | 300 | (words, acts, IF, preds) = self.forward(batchX, batchIm, test=False) 301 | self.backward(batchY, preds, IF, acts, words, batchX, batchIm) 302 | self.update_params(batchX) 303 | if np.sum(np.isnan(acts)) > 0: 304 | print 'NaNs... breaking out' 305 | done = True 306 | break 307 | 308 | # Print out progress 309 | if np.mod(minibatch * self.batchsize, prog['_details']) == 0 and minibatch > 0: 310 | print "epoch: %d, pts: %d, time: %.2f" % (epoch, minibatch * self.batchsize, (time.time()-tic)/60) 311 | if np.mod(minibatch * self.batchsize, prog['_samples']) == 0 and minibatch > 0: 312 | print "best: %s" % (scores) 313 | print '\nSamples:' 314 | lm_tools.generate_and_show(self, word_dict, count_dict, VIM, k=3) 315 | print ' ' 316 | if np.mod(minibatch * self.batchsize, prog['_update']) == 0 and minibatch > 0: 317 | self.update_hyperparams() 318 | self.step += 1 319 | print "learning rate: %.4f, momentum: %.4f" % (self.eta_t, self.p_t) 320 | 321 | # Compute BLEU 322 | if np.mod(minibatch * self.batchsize, prog['_bleu']) == 0 and minibatch > 0: 323 | bleu = lm_tools.compute_bleu(self, word_dict, count_dict, VIM, prog, k=3) 324 | if bleu[-1] >= best: 325 | count = 0 326 | best = bleu[-1] 327 | scores = '/'.join([str(b) for b in bleu]) 328 | print scores + '\n' 329 | lm_tools.save_model(self, self.loc) 330 | else: 331 | count += 1 332 | if count == patience: 333 | done = True 334 | break 335 | 336 | self.update_hyperparams() 337 | self.step += 1 338 | return best 339 | 340 | def main(): 341 | pass 342 | 343 | if __name__ == '__main__': 344 | main() 345 | 346 | 347 | -------------------------------------------------------------------------------- /lm/mlblf.py: -------------------------------------------------------------------------------- 1 | # Multiplicative multimodal log-bilinear model 2 | 3 | import numpy as np 4 | import sys 5 | from utils import lm_tools 6 | from utils import svd_tools 7 | from scipy.optimize import check_grad 8 | from scipy.sparse import vstack 9 | from numpy.random import RandomState 10 | import time 11 | 12 | class MLBLF(object): 13 | """ 14 | Factored 3-way Multimodal Log-bilinear language model trained using SGD 15 | """ 16 | def __init__(self, 17 | name='lbl', 18 | loc='models/mlblf.pkl', 19 | seed=1234, 20 | dropout=0.0, 21 | k=5, 22 | V=10364, 23 | K=50, 24 | D=10, 25 | h=3, 26 | factors=10, 27 | context=5, 28 | batchsize=20, 29 | maxepoch=10, 30 | eta_t=0.2, 31 | gamma_r=0.0, 32 | gamma_c=0.0, 33 | f=0.995, 34 | p_i=0.5, 35 | p_f=0.5, 36 | T=20, 37 | verbose=1): 38 | """ 39 | name: name of the network 40 | loc: location to save model files 41 | seed: random seed 42 | dropout: probability of dropout 43 | k: validation interval before stopping 44 | V: vocabulary size 45 | K: embedding dimensionality 46 | D: dimensionality of the image features 47 | h: intermediate layer dimensionality 48 | factors: number of factors 49 | context: word context length 50 | batchsize: size of the minibatches 51 | maxepoch: max number of training epochs 52 | eta_t: learning rate 53 | gamma_r: weight decay for representations 54 | gamma_c: weight decay for contexts 55 | f: learning rate decay 56 | p_i: initial momentum 57 | p_f: final momentum 58 | T: number of epochs until p_f is reached (linearly) 59 | verbose: display progress 60 | """ 61 | self.name = name 62 | self.loc = loc 63 | self.dropout = dropout 64 | self.seed = seed 65 | self.k = k 66 | self.V = V 67 | self.K = K 68 | self.D = D 69 | self.h = h 70 | 
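# Rank of the factored three-way interaction: init_params() splits the
# K x V word representation matrix R into Wfx (K x factors) and
# Whf (factors x V) with a truncated SVD, and forward() gates the factor
# activations with the image pathway through Wfv (h x factors).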
self.factors = factors 71 | self.context = context 72 | self.batchsize = batchsize 73 | self.maxepoch = maxepoch 74 | self.eta_t = eta_t 75 | self.gamma_r = gamma_r 76 | self.gamma_c = gamma_c 77 | self.f = f 78 | self.p_i = p_i 79 | self.p_f = p_f 80 | self.T = T 81 | self.verbose = verbose 82 | self.p_t = (1 - (1 / T)) * p_i + (1 / T) * p_f 83 | 84 | def init_params(self, embed_map, count_dict): 85 | """ 86 | Initializes embeddings and context matricies 87 | """ 88 | prng = RandomState(self.seed) 89 | 90 | # Pre-trained word embedding matrix 91 | if embed_map != None: 92 | R = np.zeros((self.K, self.V)) 93 | for i in range(self.V): 94 | word = count_dict[i] 95 | if word in embed_map: 96 | R[:,i] = embed_map[word] 97 | else: 98 | R[:,i] = embed_map['*UNKNOWN*'] 99 | else: 100 | r = np.sqrt(6) / np.sqrt(self.K + self.V + 1) 101 | R = prng.rand(self.K, self.V) * 2 * r - r 102 | bw = np.zeros((1, self.V)) 103 | 104 | # Context 105 | C = 0.01 * prng.randn(self.context, self.K, self.K) 106 | 107 | # Image context 108 | M = 0.01 * prng.randn(self.h, self.K) 109 | 110 | # Hidden layer 111 | r = np.sqrt(6) / np.sqrt(self.D + self.h + 1) 112 | J = prng.rand(self.D, self.h) * 2 * r - r 113 | bj = np.zeros((1, self.h)) 114 | 115 | # Decomposition matricies 116 | Wfx, Whf = svd_tools.svd(R, n_components=self.factors, transpose='false') 117 | Wfv = 0.01 * prng.randn(self.h, self.factors) 118 | 119 | # Initial deltas used for SGD 120 | deltaC = np.zeros(np.shape(C)) 121 | deltaB = np.zeros(np.shape(bw)) 122 | deltaM = np.zeros(np.shape(M)) 123 | deltaJ = np.zeros(np.shape(J)) 124 | deltaBj = np.zeros(np.shape(bj)) 125 | deltaWfx = np.zeros(np.shape(Wfx)) 126 | deltaWhf = np.zeros(np.shape(Whf)) 127 | deltaWfv = np.zeros(np.shape(Wfv)) 128 | 129 | # Pack up 130 | self.C = C 131 | self.bw = bw 132 | self.M = M 133 | self.J = J 134 | self.bj = bj 135 | self.Wfx = Wfx 136 | self.Whf = Whf 137 | self.Wfv = Wfv 138 | self.deltaC = deltaC 139 | self.deltaB = deltaB 140 | self.deltaM = deltaM 141 | self.deltaJ = deltaJ 142 | self.deltaBj = deltaBj 143 | self.deltaWfx = deltaWfx 144 | self.deltaWhf = deltaWhf 145 | self.deltaWfv = deltaWfv 146 | 147 | def forward(self, X, Im, test=True): 148 | """ 149 | Feed-forward pass through the model 150 | """ 151 | batchsize = X.shape[0] 152 | 153 | # Forwardprop images 154 | IF = np.dot(Im, self.J) + self.bj 155 | IF = np.multiply(IF, (IF > 0)) 156 | 157 | # Dropout (if applicable) 158 | if self.dropout > 0 and not test: 159 | dropmask = np.random.rand(batchsize, self.h) > self.dropout 160 | IF = IF * dropmask 161 | 162 | # Obtain word features 163 | R = np.dot(self.Wfx, self.Whf) 164 | tmp = R[:,X.flatten()].flatten(order='F').reshape((batchsize, self.K * self.context)) 165 | words = np.zeros((batchsize, self.K, self.context)) 166 | for i in range(batchsize): 167 | words[i] = tmp[i].reshape((self.K, self.context), order='F') 168 | 169 | # Compute the hidden layer (predicted next word representation) 170 | acts = np.zeros((batchsize, self.K)) 171 | for i in range(self.context): 172 | acts += np.dot(words[:,:,i], self.C[i]) 173 | if test: 174 | acts += np.dot(IF, self.M * (1 - self.dropout)) 175 | else: 176 | acts += np.dot(IF, self.M) 177 | 178 | # Multiplicative interaction 179 | if test: 180 | F = np.multiply(np.dot(acts, self.Wfx), np.dot(IF, self.Wfv * (1 - self.dropout))) 181 | else: 182 | F = np.multiply(np.dot(acts, self.Wfx), np.dot(IF, self.Wfv)) 183 | 184 | # Compute softmax 185 | preds = np.dot(F, self.Whf) + self.bw 186 | preds = np.exp(preds - 
preds.max(1).reshape(batchsize, 1)) 187 | preds /= preds.sum(1).reshape(batchsize, 1) 188 | 189 | return (words, acts, IF, F, preds) 190 | 191 | def objective(self, Y, preds): 192 | """ 193 | Compute the objective function 194 | """ 195 | batchsize = Y.shape[0] 196 | 197 | # Cross-entropy 198 | C = -np.sum(Y.multiply(np.log(preds + 1e-20))) / batchsize 199 | return C 200 | 201 | def backward(self, Y, preds, F, IF, acts, words, X, Im): 202 | """ 203 | Backward pass through the network 204 | """ 205 | batchsize = preds.shape[0] 206 | 207 | # Compute part of df/dWhf 208 | Y = np.array(Y.todense()) 209 | Ix = (preds - Y) / batchsize 210 | dWhf = np.dot(F.T, Ix) 211 | db = np.sum(Ix, 0) 212 | 213 | # Compute df/Wfv and part of df/Wfx 214 | Ix = np.dot(Ix, self.Whf.T) 215 | dWfv = np.dot(IF.T, np.multiply(Ix, np.dot(acts, self.Wfx))) 216 | dWfx = np.dot(acts.T, np.multiply(Ix, np.dot(IF, self.Wfv))) 217 | 218 | # Compute df/dC and word inputs for df/dR 219 | Ix_word = np.dot(np.multiply(Ix, np.dot(IF, self.Wfv)), self.Wfx.T) 220 | dC = np.zeros(np.shape(self.C)) 221 | dR = np.zeros((self.K, self.V)) 222 | for i in range(self.context): 223 | dC[i] = np.dot(words[:,:,i].T, Ix_word) 224 | delta = np.dot(Ix_word, self.C[i].T) 225 | for j in range(X.shape[0]): 226 | dR[:, X[j,i]] += delta.T[:,j] 227 | dWfx += np.dot(dR, self.Whf.T) 228 | dWhf += np.dot(self.Wfx.T, dR) 229 | 230 | # Compute df/dM 231 | dM = np.dot(IF.T, Ix_word) 232 | 233 | # Compute df/dJ 234 | Ix = np.multiply(np.dot(np.multiply(Ix, np.dot(acts, self.Wfx)), self.Wfv.T), (IF > 0)) + \ 235 | np.multiply(np.dot(Ix_word, self.M.T), (IF > 0)) 236 | dJ = np.dot(Im.T, Ix) 237 | dBj = np.sum(Ix, 0) 238 | 239 | # Weight decay terms 240 | dWhf += self.gamma_r * self.Whf 241 | dWfv += self.gamma_r * self.Wfv 242 | dWfx += self.gamma_r * self.Wfx 243 | dC += self.gamma_c * self.C 244 | dM += self.gamma_c * self.M 245 | dJ += self.gamma_c * self.J 246 | 247 | # Pack 248 | self.db = db 249 | self.dC = dC 250 | self.dM = dM 251 | self.dJ = dJ 252 | self.dBj = dBj 253 | self.dWhf = dWhf 254 | self.dWfv = dWfv 255 | self.dWfx = dWfx 256 | 257 | def update_params(self, X): 258 | """ 259 | Update the network parameters using the computed gradients 260 | """ 261 | batchsize = X.shape[0] 262 | self.deltaC = self.p_t * self.deltaC - \ 263 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dC 264 | self.deltaB = self.p_t * self.deltaB - \ 265 | (1 - self.p_t) * (self.eta_t / batchsize) * self.db 266 | self.deltaM = self.p_t * self.deltaM - \ 267 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dM 268 | self.deltaJ = self.p_t * self.deltaJ - \ 269 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dJ 270 | self.deltaBj = self.p_t * self.deltaBj - \ 271 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dBj 272 | self.deltaWhf = self.p_t * self.deltaWhf - \ 273 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dWhf 274 | self.deltaWfv = self.p_t * self.deltaWfv - \ 275 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dWfv 276 | self.deltaWfx = self.p_t * self.deltaWfx - \ 277 | (1 - self.p_t) * (self.eta_t / batchsize) * self.dWfx 278 | 279 | self.C = self.C + self.deltaC 280 | self.bw = self.bw + self.deltaB 281 | self.M = self.M + self.deltaM 282 | self.J = self.J + self.deltaJ 283 | self.bj = self.bj + self.deltaBj 284 | self.Wfv = self.Wfv + self.deltaWfv 285 | self.Wfx = self.Wfx + self.deltaWfx 286 | self.Whf = self.Whf + self.deltaWhf 287 | 288 | def update_hyperparams(self): 289 | """ 290 | Updates the learning rate and momentum schedules 291 | 
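The learning rate is multiplied by the decay factor f after every update,
and the momentum p_t is annealed linearly from p_i to p_f over the first
T steps, after which it is held at p_f.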
""" 292 | self.eta_t *= self.f 293 | if self.step < self.T: 294 | self.p_t = (1 - ((self.step + 1) / self.T)) * self.p_i + \ 295 | ((self.step + 1) / self.T) * self.p_f 296 | else: 297 | self.p_t = self.p_f 298 | 299 | def compute_obj(self, X, Im, Y): 300 | """ 301 | Perform a forward pass and compute the objective 302 | """ 303 | preds = self.forward(X, Im)[-1] 304 | obj = self.objective(Y, preds) 305 | return obj 306 | 307 | def train(self, X, indX, XY, V, indV, VY, IM, VIM, count_dict, word_dict, embed_map, prog): 308 | """ 309 | Trains the LBL 310 | """ 311 | self.start = self.seed 312 | self.init_params(embed_map, count_dict) 313 | self.step = 0 314 | inds = np.arange(len(X)) 315 | numbatches = len(inds) / self.batchsize 316 | tic = time.time() 317 | bleu = [0.0]*4 318 | best = 0.0 319 | scores = '/'.join([str(b) for b in bleu]) 320 | patience = 10 321 | count = 0 322 | done = False 323 | 324 | # Main loop 325 | lm_tools.display_phase(1) 326 | for epoch in range(self.maxepoch): 327 | if done: 328 | break 329 | self.epoch = epoch 330 | prng = RandomState(self.seed + epoch + 1) 331 | prng.shuffle(inds) 332 | for minibatch in range(numbatches): 333 | 334 | batchX = X[inds[minibatch::numbatches]] 335 | batchY = XY[inds[minibatch::numbatches]] 336 | batchindX = indX[inds[minibatch::numbatches]].astype(int).flatten() 337 | batchindX = np.floor(batchindX/5).astype(int) 338 | batchIm = IM[batchindX] 339 | 340 | (words, acts, IF, F, preds) = self.forward(batchX, batchIm, test=False) 341 | self.backward(batchY, preds, F, IF, acts, words, batchX, batchIm) 342 | self.update_params(batchX) 343 | if np.sum(np.isnan(acts)) > 0: 344 | print 'NaNs... breaking out' 345 | done = True 346 | break 347 | 348 | # Print out progress 349 | if np.mod(minibatch * self.batchsize, prog['_details']) == 0 and minibatch > 0: 350 | print "epoch: %d, pts: %d, time: %.2f" % (epoch, minibatch * self.batchsize, (time.time()-tic)/60) 351 | if np.mod(minibatch * self.batchsize, prog['_samples']) == 0 and minibatch > 0: 352 | print "best: %s" % (scores) 353 | print '\nSamples:' 354 | lm_tools.generate_and_show(self, word_dict, count_dict, VIM, k=3) 355 | print ' ' 356 | if np.mod(minibatch * self.batchsize, prog['_update']) == 0 and minibatch > 0: 357 | self.update_hyperparams() 358 | self.step += 1 359 | print "learning rate: %.4f, momentum: %.4f" % (self.eta_t, self.p_t) 360 | 361 | # Compute BLEU 362 | if np.mod(minibatch * self.batchsize, prog['_bleu']) == 0 and minibatch > 0: 363 | bleu = lm_tools.compute_bleu(self, word_dict, count_dict, VIM, prog, k=3) 364 | if bleu[-1] >= best: 365 | count = 0 366 | best = bleu[-1] 367 | scores = '/'.join([str(b) for b in bleu]) 368 | print scores + '\n' 369 | lm_tools.save_model(self, self.loc) 370 | else: 371 | count += 1 372 | if count == patience: 373 | done = True 374 | break 375 | 376 | self.update_hyperparams() 377 | self.step += 1 378 | return best 379 | 380 | def main(): 381 | pass 382 | 383 | if __name__ == '__main__': 384 | main() 385 | 386 | 387 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('..') 3 | 4 | 5 | -------------------------------------------------------------------------------- /tester.py: -------------------------------------------------------------------------------- 1 | # Tester module 2 | 3 | from utils import lm_tools 4 | 5 | def tester(loc, z, zd, k=3, neval=500, evaldev=True): 6 | 
""" 7 | Trainer function for multimodal log-bilinear models 8 | loc: location of model to evaluate 9 | k: the beam width to use for inference 10 | neval: Number of images to evaluate 11 | evaldev: True if evaluating on dev set, False for test set 12 | """ 13 | prog = {} 14 | prog['_neval'] = neval 15 | prog['_evaldev'] = evaldev 16 | 17 | print 'Loading model...' 18 | net = lm_tools.load_model(loc) 19 | 20 | print 'Evaluating...' 21 | bleu = lm_tools.compute_bleu(net, z['word_dict'], z['index_dict'], zd['IM'], prog, k=k) 22 | print bleu 23 | 24 | 25 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | # Trainer module 2 | 3 | import numpy as np 4 | import copy 5 | import config 6 | from lm import mlbl, mlblf 7 | from utils import lm_tools 8 | from numpy.random import RandomState 9 | 10 | def trainer(z, zd): 11 | """ 12 | Trainer function for multimodal log-bilinear models 13 | 14 | Dictionary: 15 | 'model' ('add' or 'mul') int:[0,1], cat{add, mul} 16 | 'name' (name of the model, unique to each run) 17 | 'loc' (location to save) 18 | 'context' int:[3,25] 19 | 'learning_rate' float:[0.001, 10] 20 | 'momentum' float:[0, 0.9] 21 | 'batch_size' int:[20, 100] 22 | 'hidden_size' int:[100, 2000] 23 | 'dropout' float:[0, 0.7] 24 | 'word_decay' float:[1e-3, 1e-9] 25 | 'context_decay' float:[1e-3, 1e-9] 26 | 'factors' (mul model only!) int:[50,200], truncate by embedding_size 27 | """ 28 | d = {} 29 | d['model'] = 'add' 30 | d['name'] = 'testrun' 31 | d['loc'] = './models/' + d['model'] + '_' + d['name'] 32 | d['context'] = 5 33 | d['learning_rate'] = 0.43 34 | d['momentum'] = 0.23 35 | d['batch_size'] = 40 36 | d['maxepoch'] = 10 37 | d['hidden_size'] = 441 38 | d['dropout'] = 0.15 39 | d['word_decay'] = 3e-7 40 | d['context_decay'] = 1e-8 41 | d['factors'] = 50 42 | 43 | # Progress display and update times 44 | prog = {} 45 | prog['_details'] = 1000 # How often to display training details 46 | prog['_samples'] = 10000 # How often to display samples 47 | prog['_update'] = 100000 # How often to update learning rate schedule 48 | prog['_bleu'] = 1000000 # How often to compute BLEU 49 | prog['_neval'] = 500 # How many development images to evaluate 50 | prog['_evaldev'] = True # Use development set reference captions for eval 51 | 52 | print d['loc'] 53 | 54 | # Load the word embeddings 55 | embed_map = load_embeddings() 56 | 57 | # Unpack some stuff from the data 58 | train_ngrams = z['ngrams'] 59 | train_labels = z['labels'] 60 | train_instances = z['instances'] 61 | word_dict = z['word_dict'] 62 | index_dict = z['index_dict'] 63 | context = z['context'] 64 | vocabsize = len(z['word_dict']) 65 | trainIM = z['IM'] 66 | train_index = z['index'] 67 | 68 | dev_ngrams = zd['ngrams'] 69 | dev_labels = zd['labels'] 70 | dev_instances = zd['instances'] 71 | devIM = zd['IM'] 72 | dev_index = zd['index'] 73 | 74 | # Initialize the network 75 | if d['model'] == 'add': 76 | net = mlbl.MLBL(name=d['name'], 77 | loc=d['loc'], 78 | seed=1234, 79 | dropout=d['dropout'], 80 | V=vocabsize, 81 | K=100, 82 | D=trainIM.shape[1], 83 | h=d['hidden_size'], 84 | context=d['context'], 85 | batchsize=d['batch_size'], 86 | maxepoch=d['maxepoch'], 87 | eta_t=d['learning_rate'], 88 | gamma_r=d['word_decay'], 89 | gamma_c=d['context_decay'], 90 | f=0.99, 91 | p_i=d['momentum'], 92 | p_f=d['momentum'], 93 | T=20.0, 94 | verbose=1) 95 | elif d['model'] == 'mul': 96 | net = mlblf.MLBLF(name=d['name'], 97 | 
loc=d['loc'], 98 | seed=1234, 99 | dropout=d['dropout'], 100 | V=vocabsize, 101 | K=100, 102 | D=trainIM.shape[1], 103 | h=d['hidden_size'], 104 | factors=d['factors'], 105 | context=d['context'], 106 | batchsize=d['batch_size'], 107 | maxepoch=d['maxepoch'], 108 | eta_t=d['learning_rate'], 109 | gamma_r=d['word_decay'], 110 | gamma_c=d['context_decay'], 111 | f=0.99, 112 | p_i=d['momentum'], 113 | p_f=d['momentum'], 114 | T=20.0, 115 | verbose=1) 116 | 117 | # Train the network 118 | X = train_instances 119 | indX = train_index 120 | Y = train_labels 121 | V = dev_instances 122 | indV = dev_index 123 | VY = dev_labels 124 | 125 | best = net.train(X, indX, Y, V, indV, VY, trainIM, devIM, index_dict, word_dict, embed_map, prog) 126 | return best 127 | 128 | def load_embeddings(): 129 | """ 130 | Load in the embeddings 131 | """ 132 | embed_map = {} 133 | ap = open(config.paths['embedding'], 'r') 134 | for line in ap: 135 | entry = line.split(' ') 136 | key = entry[0] 137 | value = [float(x) for x in entry[1:]] 138 | embed_map[key] = value 139 | return embed_map 140 | 141 | 142 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /utils/lm_tools.py: -------------------------------------------------------------------------------- 1 | # Language model tools for LBL and MLBL 2 | 3 | import numpy as np 4 | import copy 5 | import random 6 | import sys 7 | import os 8 | import cPickle as pickle 9 | from collections import defaultdict 10 | from scipy.linalg import norm 11 | 12 | def save_model(net, loc): 13 | """ 14 | Save the network model to the specified directory 15 | """ 16 | output = open(loc, 'wb') 17 | pickle.dump(net, output) 18 | output.close() 19 | 20 | def load_model(loc): 21 | """ 22 | Load the network model from the specified directory 23 | """ 24 | inputs = open(loc, 'rb') 25 | net = pickle.load(inputs) 26 | inputs.close() 27 | return net 28 | 29 | def display_phase(phase): 30 | """ 31 | Print a message displaying the current training phase 32 | """ 33 | print "============================== Training phase %d ==============================" % (phase) 34 | 35 | def compute_ngrams(sequence, n): 36 | """ 37 | Return n-grams from the input sequence 38 | """ 39 | sequence = list(sequence) 40 | count = max(0, len(sequence) - n + 1) 41 | return [tuple(sequence[i:i+n]) for i in range(count)] 42 | 43 | def get_ngrams(X, context=5): 44 | """ 45 | Extract n-grams from each caption in X 46 | """ 47 | ngrams = [] 48 | for x in X: 49 | x_ngrams = compute_ngrams(x, context + 1) 50 | ngrams.append(x_ngrams) 51 | return ngrams 52 | 53 | def model_inputs(ngrams, word_dict, context=5, include_last=True, include_index=False): 54 | """ 55 | Maps ngrams to format used for the language model 56 | include_last=True for evaluation (LL, perplexity) 57 | Out of vocabulary words are mapped to 'unk' (unknown) token 58 | """ 59 | d = defaultdict(lambda : 0) 60 | for w in word_dict.keys(): 61 | d[w] = 1 62 | ngrams_count = [len(x) for x in ngrams] 63 | if include_last: 64 | instances = np.zeros((sum(ngrams_count), context + 1)) 65 | else: 66 | instances = np.zeros((sum(ngrams_count), context)) 67 | count = 0 68 | index = np.zeros((sum(ngrams_count), 1)) 69 | for i in range(len(ngrams)): 70 | for j in range(len(ngrams[i])): 71 | values = [word_dict[w] if d[w] > 0 else word_dict['unk'] 72 | 
for w in ngrams[i][j]] 73 | if include_last: 74 | instances[count] = values 75 | else: 76 | instances[count] = values[:-1] 77 | index[count] = i 78 | count = count + 1 79 | instances = instances.astype(int) 80 | if include_index: 81 | return (instances, index) 82 | else: 83 | return instances 84 | 85 | def compute_ll(net, instances, Im=None): 86 | """ 87 | Compute the log-likelihood of instances from net 88 | """ 89 | if Im != None: 90 | preds = net.forward(instances[:,:-1], Im)[-1] 91 | else: 92 | preds = net.forward(instances[:,:-1])[-1] 93 | ll = 0 94 | for i in range(preds.shape[0]): 95 | ll += np.log2(preds[i, instances[i, -1]] + 1e-20) 96 | return ll 97 | 98 | def perplexity(net, ngrams, word_dict, Im=None, context=5): 99 | """ 100 | Compute the perplexity of ngrams from net 101 | """ 102 | ll = 0 103 | N = 0 104 | for i, ng in enumerate(ngrams): 105 | instances = model_inputs([ng], word_dict, context=context) 106 | if Im != None: 107 | ll += compute_ll(net, instances, np.tile(Im[i], (len(ng), 1))) 108 | else: 109 | ll += compute_ll(net, instances) 110 | N += len(instances) 111 | return pow(2, (-1.0 / N) * ll) 112 | 113 | def weighted_sample(n_picks, weights): 114 | """ 115 | Sample from a distribution weighted by 'weights' 116 | """ 117 | t = np.cumsum(weights) 118 | s = np.sum(weights) 119 | return np.searchsorted(t, np.random.rand(n_picks) * s) 120 | 121 | def beam_search(net, word_dict, index_dict, num, Im, initial=None, k=2, N=1, lm=None, beta=0.0, rerank=False): 122 | """ 123 | Return a N-best list of generated captions from a beam width of k 124 | """ 125 | # Set the initialization 126 | if initial == None: 127 | initial = [''] * net.context 128 | inputs = np.array([word_dict[w] for w in initial]).reshape(1, net.context) 129 | 130 | # Initialize the beams 131 | beam_tokens = [] 132 | beam_inputs = [] 133 | beam_scores = [] 134 | for i in range(k): 135 | beam_scores.append([0]) 136 | beam_tokens.append([''] * net.context) 137 | beam_inputs.append(np.array([word_dict[w] for w in initial]).reshape(1, net.context)) 138 | 139 | # Start loop 140 | done = False 141 | count = 1 142 | while not done: 143 | 144 | # Special case when count = 1 145 | if count == 1: 146 | preds = net.forward(inputs[:,inputs.shape[1]-net.context:], [Im])[-1].flatten() 147 | preds = np.log(preds + 1e-20) 148 | argpreds = np.argsort(preds)[::-1] 149 | words = [index_dict[w] for w in argpreds][:k] 150 | scores = preds[argpreds][:k] 151 | for i in range(k): 152 | beam_tokens[i].append(words[i]) 153 | beam_inputs[i] = np.c_[beam_inputs[i], argpreds[i]] 154 | beam_scores[i].append(scores[i]) 155 | beam_scores[i] = beam_scores[i][1:] 156 | count += 1 157 | 158 | # Every other case 159 | if count > 1: 160 | 161 | # Loop over each beam 162 | candidate_tokens = [] 163 | candidate_scores = [] 164 | candidate_inputs = [] 165 | candidate_norm = [] 166 | for i in range(k): 167 | 168 | # Make predictions and sort 169 | preds = net.forward(beam_inputs[i][:,beam_inputs[i].shape[1]-net.context:], [Im])[-1].flatten() 170 | preds = np.log(preds + 1e-20) 171 | argpreds = np.argsort(preds)[::-1] 172 | words = [index_dict[w] for w in argpreds][:k] 173 | scores = preds[argpreds][:k] 174 | for j in range(k): 175 | 176 | # First deal with tokens 177 | tmp = copy.deepcopy(beam_tokens[i]) 178 | last_word = tmp[-1] 179 | if last_word != '': 180 | tmp.append(words[j]) 181 | candidate_tokens.append(tmp) 182 | candidate_norm.append(len(tmp) - net.context) 183 | elif j == 0: 184 | candidate_tokens.append(tmp) 185 | 
candidate_norm.append(len(tmp) - net.context) 186 | 187 | # Then scores 188 | tmp = copy.deepcopy(beam_scores[i]) 189 | if last_word != '': 190 | tmp.append(scores[j]) 191 | candidate_scores.append(tmp) 192 | elif j == 0: 193 | candidate_scores.append(tmp) 194 | 195 | # Then inputs 196 | tmp = copy.deepcopy(beam_inputs[i]) 197 | if last_word != '': 198 | tmp = np.c_[tmp, argpreds[j]] 199 | candidate_inputs.append(tmp) 200 | elif j == 0: 201 | candidate_inputs.append(tmp) 202 | 203 | # Now sort and rescore 204 | scores = [sum(w) for w in candidate_scores] 205 | for i in range(len(scores)): 206 | scores[i] /= candidate_norm[i] 207 | argscores = np.argsort(scores)[::-1][:k] 208 | 209 | # Reset the beams based on the scores 210 | for i in range(k): 211 | beam_tokens[i] = candidate_tokens[argscores[i]] 212 | beam_scores[i] = candidate_scores[argscores[i]] 213 | beam_inputs[i] = candidate_inputs[argscores[i]] 214 | 215 | # Shallow fusion (if applicable) 216 | if beta > 0: 217 | for i in range(k): 218 | # Need to reverse the conditionals for SRILM convention 219 | lmscore = beta * lm.logprob_strings(beam_tokens[i][-1], beam_tokens[i][:-1][::-1]) 220 | if lmscore == -np.inf: 221 | lmscore = 0.0 222 | beam_scores[i][-1] += lmscore 223 | 224 | # Check if all beams have produced tokens 225 | numends = 0 226 | for i in range(k): 227 | if beam_tokens[i][-1] == '': 228 | numends += 1 229 | if numends == k: 230 | done = True 231 | 232 | # If we've gone too long, also finish 233 | count += 1 234 | if count == num: 235 | done = True 236 | 237 | # Return the top-N beams 238 | topbeams = [b[net.context:-1] for b in beam_tokens[:k]] 239 | if rerank: 240 | scores = np.zeros(k) 241 | for i in range(k): 242 | scores[i] = lm.total_logprob_strings(topbeams[i][1:]) 243 | scores[i] /= len(topbeams[i][1:]) 244 | argscores = np.argsort(scores)[::-1] 245 | topbeams = [topbeams[i] for i in argscores] 246 | 247 | return topbeams 248 | 249 | def search(net, z, maxlen=50, im=None, init=None, k=2, N=1): 250 | """ 251 | Generate samples from the net using beam search 252 | """ 253 | captions = beam_search(net, z['word_dict'], z['index_dict'], num=maxlen, Im=im, initial=init, k=k, N=N) 254 | return captions 255 | 256 | def generate_and_show(net, word_dict, index_dict, IM, k=1, num=5): 257 | """ 258 | Generate and show results from the model 259 | """ 260 | inds = range(len(IM)) 261 | random.shuffle(inds) 262 | for i in inds[:num]: 263 | caption = beam_search(net, word_dict, index_dict, 50, IM[i], k=k, N=1)[0] 264 | print ' '.join(caption) 265 | 266 | def generate_and_save(net, z, IM, k=1, model='mlblf', dataset='coco', split='dev', extra='c10'): 267 | """ 268 | Generate and save results 269 | """ 270 | maxlen=50 271 | saveloc = model + '_' + dataset + '_' + split + '_' + extra + '_' + 'bw' + str(k) + '.txt' 272 | print saveloc 273 | captions = [] 274 | for i in range(0, len(IM), 5): 275 | c = search(net, z, maxlen, IM[i], k=k, N=1)[0] 276 | print (i, ' '.join(c)) 277 | captions.append(c) 278 | f = open(saveloc, 'wb') 279 | for c in captions: 280 | f.write(' '.join(c) + '\n') 281 | f.close() 282 | return captions 283 | 284 | def compute_bleu(net, word_dict, index_dict, IM, prog, k=1, maxlen=50, lm=None, beta=0.0, rerank=False): 285 | """ 286 | Compute BLEU 287 | """ 288 | print '\nComputing BLEU...' 
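# Captions for the first prog['_neval'] images are generated with beam
# search (width k), written one per line to a file under ./gen/, and scored
# with multi-bleu.perl against the coco_dev_reference* files
# (coco_reference* when prog['_evaldev'] is False). The cumulative BLEU-1
# to BLEU-4 scores are then parsed from the script's "BLEU = ..." line.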
289 |     saveloc = './gen/' + net.name + '_bleu_' + str(k) + '_offdev'
290 |     print saveloc
291 |     captions = []
292 |     for i in range(0, len(IM[:prog['_neval']]), 1):
293 |         c = beam_search(net, word_dict, index_dict, maxlen, IM[i], k=k, N=1, lm=lm, beta=beta, rerank=rerank)[0]
294 |         print (i, ' '.join(c))
295 |         captions.append(c)
296 |     f = open(saveloc, 'wb')
297 |     for c in captions:
298 |         f.write(' '.join(c) + '\n')
299 |     f.close()
300 |     if prog['_evaldev']:
301 |         os.system("./gen/multi-bleu.perl ./gen/coco_dev_reference < " + saveloc + ' > ' + saveloc + '_scores')
302 |     else:
303 |         os.system("./gen/multi-bleu.perl ./gen/coco_reference < " + saveloc + ' > ' + saveloc + '_scores')
304 |     f = open(saveloc + '_scores', 'rb')
305 |     bleu = f.readline()
306 |     f.close()
307 |     bleu = bleu[7:].split('/')
308 |     bleu[-1] = bleu[-1].split('(')[0]
309 |     bleu = [float(b) for b in bleu]
310 |     return bleu
311 |
312 |
313 |
314 |
--------------------------------------------------------------------------------
/utils/svd_tools.py:
--------------------------------------------------------------------------------
1 | # Helper functions for SVD (adapted from scikit-learn)
2 | import numpy as np
3 | import numbers
4 | from scipy import linalg
5 |
6 | def check_random_state(seed):
7 |     """Turn seed into a np.random.RandomState instance
8 |
9 |     If seed is None, return the RandomState singleton used by np.random.
10 |     If seed is an int, return a new RandomState instance seeded with seed.
11 |     If seed is already a RandomState instance, return it.
12 |     Otherwise raise ValueError.
13 |     """
14 |     if seed is None or seed is np.random:
15 |         return np.random.mtrand._rand
16 |     if isinstance(seed, (numbers.Integral, np.integer)):
17 |         return np.random.RandomState(seed)
18 |     if isinstance(seed, np.random.RandomState):
19 |         return seed
20 |     raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
21 |                      ' instance' % seed)
22 |
23 | def safe_sparse_dot(a, b, dense_output=False):
24 |     """
25 |     Dot product that handles the sparse matrix case correctly
26 |     """
27 |     from scipy import sparse
28 |     if sparse.issparse(a) or sparse.issparse(b):
29 |         ret = a * b
30 |         if dense_output and hasattr(ret, "toarray"):
31 |             ret = ret.toarray()
32 |         return ret
33 |     else:
34 |         return np.dot(a, b)
35 |
36 | def qr_economic(A, **kwargs):
37 |     """
38 |     Compat function for the QR-decomposition in economic mode
39 |     Scipy 0.9 changed the keyword econ=True to mode='economic'
40 |     """
41 |     import scipy.linalg
42 |     # trick: triangular solve was introduced in 0.9
43 |     if hasattr(scipy.linalg, 'solve_triangular'):
44 |         return scipy.linalg.qr(A, mode='economic', **kwargs)
45 |     else:
46 |         import warnings
47 |         with warnings.catch_warnings():
48 |             warnings.simplefilter("ignore", DeprecationWarning)
49 |             return scipy.linalg.qr(A, econ=True, **kwargs)
50 |
51 | def randomized_range_finder(A, size, n_iter, random_state=None, n_iterations=None):
52 |     """
53 |     Computes an orthonormal matrix whose range approximates the range of A.
54 |     """
55 |     random_state = check_random_state(random_state)
56 |     R = random_state.normal(size=(A.shape[1], size))
57 |     Y = safe_sparse_dot(A, R)
58 |     del R
59 |     for i in xrange(n_iter):
60 |         Y = safe_sparse_dot(A, safe_sparse_dot(A.T, Y))
61 |     Q, R = qr_economic(Y)
62 |     return Q
63 |
64 | def svd_flip(u, v):
65 |     """Sign correction to ensure deterministic output from SVD
66 |
67 |     Adjusts the columns of u and the rows of v such that the loadings in the
68 |     columns in u that are largest in absolute value are always positive.
69 |
70 |     Parameters
71 |     ----------
72 |     u, v: arrays
73 |         The output of `linalg.svd` or `sklearn.utils.extmath.randomized_svd`,
74 |         with matching inner dimensions so one can compute `np.dot(u * s, v)`.
75 |
76 |     Returns
77 |     -------
78 |     u_adjusted, v_adjusted: arrays with the same dimensions as the input.
79 |
80 |     """
81 |     max_abs_cols = np.argmax(np.abs(u), axis=0)
82 |     signs = np.sign(u[max_abs_cols, xrange(u.shape[1])])
83 |     u *= signs
84 |     v *= signs[:, np.newaxis]
85 |     return u, v
86 |
87 | def svd(M, n_components, n_oversamples=10, n_iter=5, transpose='auto', flip_sign=True, random_state=0, n_iterations=None):
88 |     """
89 |     Equivalent to scikit-learn's truncated (randomized) SVD
90 |     """
91 |     if n_components >= M.shape[1]:
92 |         raise ValueError("n_components must be < n_features;"
93 |                          " got %d >= %d" % (n_components, M.shape[1]))
94 |
95 |     random_state = check_random_state(random_state)
96 |     n_random = n_components + n_oversamples
97 |     n_samples, n_features = M.shape
98 |     if transpose == 'auto' and n_samples > n_features:
99 |         transpose = True
100 |     if transpose:
101 |         M = M.T
102 |     Q = randomized_range_finder(M, n_random, n_iter, random_state)
103 |     B = safe_sparse_dot(Q.T, M)
104 |     Uhat, s, V = linalg.svd(B, full_matrices=False)
105 |     del B
106 |     U = np.dot(Q, Uhat)
107 |     if flip_sign:
108 |         U, V = svd_flip(U, V)
109 |     if transpose:
110 |         U, Sigma, VT = V[:n_components, :].T, s[:n_components], U[:, :n_components].T
111 |     else:
112 |         U, Sigma, VT = U[:, :n_components], s[:n_components], V[:n_components, :]
113 |     Sigma = np.diag(Sigma)
114 |     return np.dot(U, Sigma.T), VT
115 |     #return U, np.dot(Sigma.T, VT)
116 |
117 |
118 |
--------------------------------------------------------------------------------
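
A minimal usage sketch for the helpers above, assuming `net` is a trained log-bilinear model from the `lm` package, `z` is a dictionary with `word_dict` and `index_dict` entries (the structure `search()` expects), and `im` is a VGG19 feature vector for a single image; the variable names here are illustrative, not part of the repository. Note that `beam_search` length-normalizes the summed log-probabilities, and can optionally mix in an external n-gram language model score via the `lm` and `beta` arguments.

    # Generate a caption with a beam width of 3 (Python 2, matching the codebase)
    from utils import lm_tools

    caption = lm_tools.search(net, z, maxlen=50, im=im, k=3)[0]
    print ' '.join(caption)

    # The randomized truncated SVD helper returns two factors whose product
    # is a rank-50 approximation of X
    import numpy as np
    from utils import svd_tools

    X = np.random.rand(1000, 300)
    US, Vt = svd_tools.svd(X, n_components=50)
    X_approx = np.dot(US, Vt)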