├── .gitignore ├── README.md ├── bin ├── run-spelling-convnet-exp01-convparams-final.sh ├── run-spelling-convnet-exp01-convparams.sh ├── run-spelling-convnet-exp02-embeddings.sh ├── run-spelling-convnet-exp03-inputs.sh ├── run-spelling-convnet-exp04-real-errors.sh ├── run-spelling-convnet-exp05-multiclass.sh ├── run-spelling-convnet-residual.sh ├── run-spelling-convnet.sh ├── run-spelling-correction-isolated-binary.sh ├── run-spelling-correction-isolated-multiclass.sh └── run-spelling-lstm.sh ├── contrasting_cases.py ├── modeling ├── __init__.py ├── autograd_examples.py ├── builders.py ├── callbacks.py ├── chainer_model.py ├── data.py ├── dataset.py ├── difference.py ├── fbeta.py ├── fbeta_predict.py ├── lasagne_model.py ├── layers.py ├── nonconvnet.py ├── outliers.py ├── parser.py ├── preprocess.py ├── residual.py ├── spelling.py └── utils.py ├── models ├── keras │ ├── attention │ │ ├── model.json │ │ └── model.py │ ├── preposition │ │ ├── convnet │ │ │ ├── 4e0ae5dc683611e5950afcaa149e39ea │ │ │ │ ├── model.py │ │ │ │ └── model_old_keras.py │ │ │ ├── model-word2vec.json │ │ │ ├── model.json │ │ │ ├── model.py │ │ │ ├── run-medium.sh │ │ │ ├── run-small.sh │ │ │ └── small │ │ │ │ └── find-best-filter-size │ │ │ │ ├── find-best.sh │ │ │ │ └── find-best.txt │ │ └── lstm │ │ │ ├── model.json │ │ │ └── model.py │ └── spelling │ │ ├── convnet │ │ ├── exp03-inputs │ │ │ └── op_transpose_n_ops_1_n_errors_per_word_3 │ │ │ │ └── analysis.py │ │ ├── model.json │ │ └── model.py │ │ ├── correction │ │ └── isolated │ │ │ ├── binary │ │ │ ├── model.json │ │ │ └── model.py │ │ │ └── multiclass │ │ │ ├── model.json │ │ │ └── model.py │ │ ├── data │ │ └── nietzsche.txt │ │ └── toksents.py └── lasagne │ └── spelling │ └── convnet │ ├── model.json │ └── model.py ├── notebooks ├── ConvnetAnalysis.ipynb ├── ConvnetAnalysisHumanJudgments.ipynb ├── ConvnetSensitivityAnalysis.ipynb ├── Spelling.ipynb └── notes.txt ├── requirements.txt ├── setup.py ├── tests ├── testdata.py ├── testdifference.py ├── testlasagne.py ├── testlayers.py └── testnonconvnet.py ├── train_chainer.py ├── train_keras.py ├── train_keras_simple.py └── train_lasagne.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # modeling 2 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp01-convparams-final.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | distance=1 6 | errors=3 7 | 8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 9 | experiment_dir=$model_dir/$experiment_name 10 | mkdir -p $experiment_dir 11 | 12 | for operation in delete 13 | do 14 | for n_embed_dims in 10 15 | do 16 | for n_filters in 3000 17 | do 18 | for filter_width in 6 19 | do 20 | for n_fully_connected in 1 21 | do 22 | for n_residual_blocks in 0 23 | do 24 | for n_hidden in 1000 25 | do 26 | model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden} 27 | if [ -d $model_dest ] 28 | then 29 | continue 30 | fi 31 | ./train_keras.py $model_dir \ 32 | 
$data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \ 33 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \ 34 | chars \ 35 | --target-name binary_target \ 36 | --model-dest $model_dest \ 37 | --n-embeddings 61 \ 38 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 batch_size=32 \ 39 | --shuffle \ 40 | --confusion-matrix \ 41 | --classification-report \ 42 | --class-weight-auto \ 43 | --class-weight-exponent 3 \ 44 | --early-stopping-metric f2 \ 45 | --verbose \ 46 | --log 47 | done 48 | done 49 | done 50 | done 51 | done 52 | done 53 | done 54 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp01-convparams.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/old/ 5 | distance=1 6 | errors=3 7 | 8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 9 | experiment_dir=$model_dir/$experiment_name 10 | mkdir -p $experiment_dir 11 | 12 | for operation in delete 13 | do 14 | for n_embed_dims in 10 30 100 15 | do 16 | for n_filters in 100 200 300 17 | do 18 | for filter_width in 2 4 6 8 19 | do 20 | for n_fully_connected in 1 21 | do 22 | for n_residual_blocks in 0 23 | do 24 | for n_hidden in 100 200 300 25 | do 26 | model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden} 27 | if [ -d $model_dest ] 28 | then 29 | continue 30 | fi 31 | echo ./train_keras.py $model_dir \ 32 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 33 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 34 | word \ 35 | --target-name target \ 36 | --model-dest $model_dest \ 37 | --n-embeddings 61 \ 38 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=5 \ 39 | --shuffle \ 40 | --confusion-matrix \ 41 | --classification-report \ 42 | --class-weight-auto \ 43 | --class-weight-exponent 3 \ 44 | --early-stopping-metric f2 \ 45 | --verbose \ 46 | --log 47 | done 48 | done 49 | done 50 | done 51 | done 52 | done 53 | done | parallel --gnu -j 2 54 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp02-embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | distance=1 6 | errors=3 7 | 8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 9 | experiment_dir=$model_dir/$experiment_name 10 | mkdir -p $experiment_dir 11 | 12 | operation=delete 13 | n_embed_dims=56 14 | n_filters=10 15 | filter_width=6 16 | n_fully_connected=0 17 | n_hidden=0 18 | 19 | for embedding_init in identity orthogonal uniform normal 20 | do 21 | for train_embeddings in false true 22 | do 23 | 
model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_hidden_${n_hidden}_embedding_init_${embedding_init}_train_embeddings_${train_embeddings} 24 | #--model-dest $model_dest \ 25 | echo $model_dest 26 | ./train_keras.py $model_dir \ 27 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \ 28 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \ 29 | chars \ 30 | --target-name binary_target \ 31 | --n-embeddings 56 \ 32 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_hidden=$n_hidden embedding_init=$embedding_init train_embeddings=$train_embeddings optimizer=SGD learning_rate=0.001 momentum=0.0 decay=0.0 \ 33 | --shuffle \ 34 | --confusion-matrix \ 35 | --classification-report \ 36 | --class-weight-auto \ 37 | --class-weight-exponent 3 \ 38 | --verbose \ 39 | --n-train 50000 \ 40 | --n-epochs 3 \ 41 | --no-save 42 | #--log \ 43 | done 44 | done 45 | #| parallel --gnu -j 2 46 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp03-inputs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | 6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 7 | experiment_dir=$model_dir/$experiment_name 8 | mkdir -p $experiment_dir 9 | 10 | n_embed_dims=10 11 | n_filters=3000 12 | filter_width=6 13 | n_fully_connected=1 14 | n_residual_blocks=0 15 | n_hidden=1000 16 | 17 | for operation in delete insert substitute transpose 18 | do 19 | for n_operations in 1 2 20 | do 21 | for n_errors_per_word in 3 10 22 | do 23 | model_dest=$experiment_dir/op_${operation}_n_ops_${n_operations}_n_errors_per_word_${n_errors_per_word} 24 | if [ -d $model_dest ] 25 | then 26 | continue 27 | fi 28 | echo ./train_keras.py $model_dir \ 29 | $data_dir/op-${operation}-distance-${n_operations}-errors-per-word-${n_errors_per_word}.h5 \ 30 | $data_dir/op-${operation}-distance-${n_operations}-errors-per-word-${n_errors_per_word}.h5 \ 31 | marked_chars \ 32 | --target-name binary_target \ 33 | --model-dest $model_dest \ 34 | --n-embeddings 61 \ 35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \ 36 | --shuffle \ 37 | --confusion-matrix \ 38 | --classification-report \ 39 | --class-weight-auto \ 40 | --class-weight-exponent 3 \ 41 | --early-stopping-metric val_f2 \ 42 | --checkpoint-metric val_f2 \ 43 | --verbose \ 44 | --log 45 | break 46 | done 47 | done 48 | done | parallel --gnu -j 2 49 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp04-real-errors.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | 6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 7 | experiment_dir=$model_dir/$experiment_name 8 | mkdir -p $experiment_dir 9 | 10 | n_embed_dims=10 11 | n_filters=3000 12 | filter_width=6 13 | n_fully_connected=1 14 | n_residual_blocks=0 15 | n_hidden=1000 16 | 17 | # Train two models, one with random 
artificial errors, one with artificial 18 | # errors learned from a corpus of real errors. 19 | 20 | corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5" 21 | 22 | for corpus in $corpora 23 | do 24 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,') 25 | if [ -d $model_dest ] 26 | then 27 | continue 28 | fi 29 | ./train_keras.py $model_dir \ 30 | $data_dir/$corpus \ 31 | $data_dir/$corpus \ 32 | marked_chars \ 33 | --target-name binary_target \ 34 | --model-dest $model_dest \ 35 | --n-embeddings 255 \ 36 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \ 37 | --shuffle \ 38 | --confusion-matrix \ 39 | --classification-report \ 40 | --class-weight-auto \ 41 | --class-weight-exponent 3 \ 42 | --early-stopping-metric val_f2 \ 43 | --checkpoint-metric val_f2 \ 44 | --save-all-checkpoints \ 45 | --verbose \ 46 | --log 47 | done 48 | #| parallel --gnu -j 2 49 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-exp05-multiclass.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | distance=1 6 | errors=3 7 | 8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 9 | experiment_dir=$model_dir/$experiment_name 10 | mkdir -p $experiment_dir 11 | 12 | for operation in delete 13 | do 14 | for n_embed_dims in 100 15 | do 16 | for n_filters in 300 17 | do 18 | for filter_width in 8 19 | do 20 | for n_fully_connected in 2 21 | do 22 | for n_residual_blocks in 1 23 | do 24 | for n_hidden in 300 25 | do 26 | ./train_keras.py $model_dir \ 27 | $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/000.h5 \ 28 | $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/000.h5 \ 29 | chars \ 30 | --target-name multiclass_target \ 31 | --model-dest $experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden}_n_hsm_classes_5000 \ 32 | --n-embeddings 61 \ 33 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=240 n_hsm_classes=5000 batch_size=8 \ 34 | --shuffle \ 35 | --class-weight-auto \ 36 | --class-weight-exponent 3 \ 37 | --early-stopping-metric f1 \ 38 | --verbose \ 39 | --target-data $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}.json \ 40 | --extra-train-file $(ls $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/* | egrep -v '000.h5') \ 41 | --n-classes 119773 \ 42 | --log 43 | done 44 | done 45 | done 46 | done 47 | done 48 | done 49 | done 50 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet-residual.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | distance=1 6 | errors=3 7 | nonce_interval= 8 | 9 | crossval_dir=$model_dir/crossval 10 | mkdir -p $crossval_dir 11 | 12 | 
#for operation in delete insert substitute transpose 13 | for operation in delete 14 | do 15 | for n_embed_dims in 100 16 | do 17 | for n_filters in 1000 18 | do 19 | for filter_width in 5 20 | do 21 | for n_fully_connected in 1 2 3 4 5 6 7 22 | do 23 | for n_residual_blocks in 0 24 | do 25 | for n_hidden in 100 26 | do 27 | echo ./train_keras.py $model_dir \ 28 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 29 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 30 | word \ 31 | --model-dest $crossval_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden} \ 32 | --target-name target \ 33 | --n-embeddings 61 \ 34 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \ 35 | --shuffle \ 36 | --confusion-matrix \ 37 | --classification-report \ 38 | --class-weight-auto \ 39 | --class-weight-exponent 3 \ 40 | --early-stopping-metric f2 \ 41 | --n-validation 100000 \ 42 | --log \ 43 | --verbose 44 | done 45 | done 46 | done 47 | done 48 | done 49 | done 50 | done | parallel --gnu -j 2 51 | -------------------------------------------------------------------------------- /bin/run-spelling-convnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | model_dir=models/keras/spelling/convnet 4 | data_dir=data/spelling/experimental/ 5 | distance=1 6 | errors=3 7 | nonce_interval=-nonce-interval-3 8 | 9 | mkdir -p $model_dir/crossval 10 | 11 | #for operation in delete insert substitute transpose 12 | #for nonce in "" "-nonce-interval-3" 13 | #do 14 | for operation in delete 15 | do 16 | for n_embed_dims in 100 17 | do 18 | for n_filters in 1000 19 | do 20 | for filter_width in 5 21 | do 22 | for n_hidden in 100 23 | do 24 | for n_fully_connected in 1 2 3 4 25 | do 26 | echo ./train_keras.py $model_dir \ 27 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 28 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \ 29 | word \ 30 | --model-dest $model_dir/crossval/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_hidden_${n_hidden} \ 31 | --target-name target \ 32 | --n-embeddings 61 \ 33 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_hidden=$n_hidden n_fully_connected=${n_fully_connected} patience=3 \ 34 | --shuffle \ 35 | --confusion-matrix \ 36 | --classification-report \ 37 | --class-weight-auto \ 38 | --class-weight-exponent 3 \ 39 | --early-stopping-metric f2 \ 40 | --n-validation 100000 \ 41 | --log 42 | done 43 | done 44 | done 45 | done 46 | done 47 | done | parallel --gnu -j 2 48 | -------------------------------------------------------------------------------- /bin/run-spelling-correction-isolated-binary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=models/keras/spelling/correction/isolated/binary/ 4 | data_dir=data/spelling/experimental/ 5 | 6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 7 | experiment_dir=$model_dir/$experiment_name 8 | mkdir -p $experiment_dir 9 | 10 | n_embed_dims=10 11 | n_filters=3000 12 | filter_width=6 13 | 
n_fully_connected=2 14 | n_residual_blocks=2 15 | n_hidden=1000 16 | 17 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5" 18 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5" 19 | corpora="non-word-error-detection-experiment-04-generated-negative-examples.h5" 20 | 21 | for corpus in $corpora 22 | do 23 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,') 24 | if [ -d $model_dest ] 25 | then 26 | continue 27 | fi 28 | ./train_keras_simple.py $model_dir \ 29 | $data_dir/$corpus \ 30 | $data_dir/$corpus \ 31 | non_word_marked_chars real_word_marked_chars \ 32 | --target-name binary_target \ 33 | --model-dest $model_dest \ 34 | --n-embeddings 255 \ 35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \ 36 | --class-weight-exponent 3 \ 37 | --verbose \ 38 | --no-save 39 | done 40 | #--log 41 | #| parallel --gnu -j 2 42 | -------------------------------------------------------------------------------- /bin/run-spelling-correction-isolated-multiclass.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | model_dir=models/keras/spelling/correction/isolated/multiclass/ 4 | data_dir=data/spelling/experimental/ 5 | 6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/') 7 | experiment_dir=$model_dir/$experiment_name 8 | mkdir -p $experiment_dir 9 | 10 | n_embed_dims=10 11 | n_filters=3000 12 | filter_width=6 13 | n_fully_connected=2 14 | n_residual_blocks=2 15 | n_hidden=1000 16 | 17 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5" 18 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5" 19 | corpora="non-word-error-detection-experiment-04-generated-negative-examples.h5" 20 | 21 | for corpus in $corpora 22 | do 23 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,') 24 | if [ -d $model_dest ] 25 | then 26 | continue 27 | fi 28 | ./train_keras_simple.py $model_dir \ 29 | $data_dir/$corpus \ 30 | $data_dir/$corpus \ 31 | non_word_marked_chars \ 32 | --target-name multiclass_correction_target \ 33 | --model-dest $model_dest \ 34 | --n-embeddings 255 \ 35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \ 36 | --class-weight-exponent 3 \ 37 | --verbose \ 38 | --n-epochs 3 \ 39 | --no-save 40 | done 41 | #--log 42 | #| parallel --gnu -j 2 43 | -------------------------------------------------------------------------------- /bin/run-spelling-lstm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | ./train_keras.py models/keras/spelling/lstm \ 4 | data/spelling/birbeck-train.h5 \ 5 | data/spelling/birbeck-valid.h5 \ 6 | word \ 7 | --target-name is_real_word \ 8 | --n-embeddings 56 \ 9 | --model-cfg n_units=20 n_embed_dims=25 patience=1000 train_embeddings=true embedding_init=uniform optimizer=Adam \ 10 | --shuffle \ 11 | --log \ 12 | --confusion-matrix \ 13 | --classification-report \ 14 | --class-weight-auto \ 15 | --class-weight-exponent 5 \ 16 | --n-epochs 350 17 | 
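All of the driver scripts above follow the same pattern: enumerate hyperparameter settings in nested shell loops, name the model output directory after the settings, skip any combination whose directory already exists, and pass the settings to train_keras.py as space-separated key=value pairs via --model-cfg (several scripts echo the commands and pipe them to GNU parallel -j 2 so two configurations train at once). The sketch below shows one plausible way such key=value overrides could be folded into a configuration dict; it is only an illustration under that assumption and is not taken from the repository's modeling/parser.py, whose contents are not shown here.

    # Hypothetical helper, not the repository's parser: coerce
    # "--model-cfg n_filters=3000 patience=10 optimizer=SGD"-style overrides.
    import json

    def parse_model_cfg(pairs, defaults=None):
        cfg = dict(defaults or {})
        for pair in pairs:
            key, _, raw = pair.partition('=')
            try:
                cfg[key] = json.loads(raw)   # numbers, true/false
            except ValueError:
                cfg[key] = raw               # plain strings such as "SGD"
        return cfg

    # The settings used in run-spelling-convnet-exp01-convparams-final.sh:
    cfg = parse_model_cfg(["n_embed_dims=10", "n_filters=3000", "filter_width=6",
                           "n_fully_connected=1", "n_residual_blocks=0",
                           "n_hidden=1000", "patience=10", "batch_size=32"])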
-------------------------------------------------------------------------------- /contrasting_cases.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | import numpy as np 6 | np.random.seed(1337) # for reproducibility 7 | 8 | import sys 9 | import argparse 10 | import h5py 11 | 12 | from keras.datasets import mnist 13 | from keras.models import Sequential 14 | from keras.layers.core import Dense, Dropout, Activation 15 | from keras.layers.normalization import BatchNormalization 16 | from keras.optimizers import SGD, Adadelta 17 | from keras.utils import np_utils 18 | 19 | from outliers import PMeansMultivariateNormal 20 | 21 | def create_dataset(n, train_size, valid_size): 22 | means = np.arange(100) 23 | cov = [range(1, 101)] * 100 24 | mvn = PMeansMultivariateNormal(means, cov, (n,)) 25 | X = mvn.generate() 26 | 27 | assert n % 2 == 0 28 | assert n > train_size + valid_size 29 | 30 | # Make the data different along one dimension. 31 | even = np.arange(0, n, step=2) 32 | X[even, 0] = np.random.uniform(-.25, 1.75, size=n/2) 33 | # Make each odd-numbered row the inverse of its previous row. 34 | X[even+1, 0] = np.random.uniform(-1.75, .25, size=n/2) 35 | 36 | X += np.random.uniform(0.01, size=X.shape) 37 | X = X.astype(np.float32) 38 | 39 | y = np.array([[0,1] * (n/2)]).reshape((n,1)) 40 | y = y.astype(np.int32) 41 | 42 | X_train = X[0:train_size, :] 43 | X_valid = X[train_size:train_size+valid_size, :] 44 | X_test = X[train_size+valid_size:, :] 45 | 46 | y_train = y[0:train_size] 47 | y_valid = y[train_size:train_size+valid_size] 48 | y_test = y[train_size+valid_size:] 49 | 50 | return X_train, X_valid, X_test, \ 51 | y_train, y_valid, y_test 52 | 53 | 54 | def build_model(n_inputs, n_hidden, n_classes): 55 | model = Sequential() 56 | model.add(Dense(n_inputs, n_hidden)) 57 | model.add(BatchNormalization((n_hidden,))) 58 | model.add(Activation('relu')) 59 | model.add(Dense(n_hidden, n_hidden)) 60 | model.add(BatchNormalization((n_hidden,))) 61 | model.add(Activation('relu')) 62 | model.add(Dense(n_hidden, n_hidden)) 63 | model.add(BatchNormalization((n_hidden,))) 64 | model.add(Activation('relu')) 65 | model.add(Dense(n_hidden, n_hidden)) 66 | model.add(BatchNormalization((n_hidden,))) 67 | model.add(Activation('relu')) 68 | model.add(Dense(n_hidden, n_classes)) 69 | model.add(Activation('softmax')) 70 | 71 | optimizer = Adadelta() 72 | model.compile(loss='categorical_crossentropy', optimizer=optimizer) 73 | 74 | return model 75 | 76 | def get_parser(): 77 | parser = argparse.ArgumentParser( 78 | description='train a model to demonstrate contrasting cases') 79 | parser.add_argument( 80 | '--shuffle', action='store_true', 81 | help='shuffle the training examples after each epoch (i.e. 
do not use contrasting cases)') 82 | parser.add_argument( 83 | '--n', type=int, default=10000, 84 | help='the size of the data set to create') 85 | parser.add_argument( 86 | '--train-size', type=int, default=7000, 87 | help='the number of examples from the data set to allocate to training') 88 | parser.add_argument( 89 | '--valid-size', type=int, default=1500, 90 | help='the number of examples from the data set to allocate to validation') 91 | parser.add_argument( 92 | '--batch-size', type=int, default=10, 93 | help='mini-batch size') 94 | parser.add_argument( 95 | '--n-epochs', type=int, default=20, 96 | help='number of epochs to train') 97 | parser.add_argument( 98 | '--verbose', action='store_true', 99 | help='print progress') 100 | 101 | return parser.parse_args() 102 | 103 | def main(args): 104 | x_train, x_valid, x_test, \ 105 | y_train, y_valid, y_test = create_dataset( 106 | args.n, args.train_size, args.valid_size) 107 | 108 | y_train = y_train.reshape((y_train.shape[0], 1)) 109 | y_valid = y_valid.reshape((y_valid.shape[0], 1)) 110 | y_test = y_test.reshape((y_test.shape[0], 1)) 111 | 112 | n_classes = len(np.unique(y_train)) 113 | 114 | ''' 115 | print('y_train', y_train.shape) 116 | print('y_valid', y_valid.shape) 117 | print('y_test', y_test.shape) 118 | print('n_classes', n_classes, np.unique(y_train)) 119 | ''' 120 | 121 | # convert class vectors to binary class matrices 122 | y_train = np_utils.to_categorical( 123 | y_train, n_classes).astype(np.int32) 124 | y_valid = np_utils.to_categorical( 125 | y_valid, n_classes).astype(np.int32) 126 | y_test = np_utils.to_categorical( 127 | y_test, n_classes).astype(np.int32) 128 | 129 | if args.shuffle: 130 | print('Training (shuffled)') 131 | # Leave odd-numbered rows where they are; shuffle only 132 | # even-numbered ones. This ensures that each minibatch has one 133 | # example from each class. 134 | perm = np.arange(x_train.shape[0]) 135 | evens = np.arange(0, x_train.shape[0], 2) 136 | perm[evens] = np.random.permutation(evens) 137 | else: 138 | print('Training (contrasting cases)') 139 | 140 | model = build_model(100, 20, n_classes) 141 | 142 | print('x_train', x_train.dtype) 143 | print('y_train', y_train.dtype) 144 | 145 | model.fit(x_train, y_train, 146 | batch_size=args.batch_size, 147 | shuffle=False, 148 | nb_epoch=args.n_epochs, 149 | show_accuracy=True, 150 | verbose=2 if args.verbose else 0, 151 | validation_data=(x_valid, y_valid)) 152 | 153 | score = model.evaluate(x_test, y_test, 154 | show_accuracy=True, 155 | verbose=1 if args.verbose else 0) 156 | 157 | if args.shuffle: 158 | print('Test accuracy (shuffled)', score[1]) 159 | else: 160 | print('Test accuracy (contrasting cases)', score[1]) 161 | 162 | if __name__ == '__main__': 163 | sys.exit(main(get_parser())) 164 | -------------------------------------------------------------------------------- /modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import (split_data, mask_zero_for_rnn, balance_datasets) 2 | -------------------------------------------------------------------------------- /modeling/autograd_examples.py: -------------------------------------------------------------------------------- 1 | import autograd.numpy as np 2 | from autograd import grad 3 | 4 | def sigmoid(x): 5 | return 0.5*(np.tanh(x) + 1) 6 | 7 | def logistic_predictions(weights, inputs): 8 | # Outputs probability of a label being true according to logistic model. 
9 | return sigmoid(np.dot(inputs, weights)) 10 | 11 | def training_loss(weights): 12 | # Training loss is the negative log-likelihood of the training labels. 13 | preds = logistic_predictions(weights, inputs) 14 | label_probabilities = preds * targets + (1 - preds) * (1 - targets) 15 | return -np.sum(np.log(label_probabilities)) 16 | 17 | # Build a toy dataset. 18 | inputs = np.array([[0.52, 1.12, 0.77], 19 | [0.88, -1.08, 0.15], 20 | [0.52, 0.06, -1.30], 21 | [0.74, -2.49, 1.39]]) 22 | targets = np.array([True, True, False, True]) 23 | 24 | # Define a function that returns gradients of training loss using autograd. 25 | training_gradient_fun = grad(training_loss) 26 | 27 | # Optimize weights using gradient descent. 28 | weights = np.array([0.0, 0.0, 0.0]) 29 | print "Initial loss:", training_loss(weights) 30 | for i in xrange(100): 31 | weights -= training_gradient_fun(weights) * 0.01 32 | print "Trained loss:", training_loss(weights) 33 | 34 | def taylor_sine(x): 35 | ans = currterm = x 36 | i = 0 37 | while np.abs(currterm) > 0.001: 38 | currterm = -currterm * x**2 / ((2 * i + 3) * (2 * i + 2)) 39 | ans = ans + currterm 40 | i += 1 41 | return ans 42 | 43 | grad_sine = grad(taylor_sine) 44 | print "Gradient of sin(pi) is", grad_sine(np.pi) 45 | -------------------------------------------------------------------------------- /modeling/builders.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.layers.core import Dense 4 | from keras.layers.embeddings import Embedding 5 | from keras.layers.convolutional import (Convolution1D, MaxPooling1D) 6 | from keras.optimizers import (SGD, Adam, Adadelta, Adagrad, RMSprop) 7 | from keras.constraints import maxnorm 8 | from keras.regularizers import l2 9 | 10 | from modeling.layers import ImmutableEmbedding, HierarchicalSoftmax 11 | 12 | def build_embedding_layer(config, input_width=None): 13 | try: 14 | n_embeddings = config.n_vocab 15 | except AttributeError: 16 | n_embeddings = config.n_embeddings 17 | 18 | try: 19 | input_width = config.input_width 20 | except AttributeError: 21 | input_width = input_width 22 | 23 | try: 24 | mask_zero = config.mask_zero 25 | except AttributeError: 26 | mask_zero = False 27 | 28 | if hasattr(config, 'embedding_weights') and config.embedding_weights is not None: 29 | W = np.load(config.embedding_weights) 30 | if config.train_embeddings is True or config.train_embeddings == 'true': 31 | return Embedding(n_embeddings, config.n_embed_dims, 32 | weights=[W], input_length=input_width, 33 | W_constraint=maxnorm(config.embedding_max_norm), 34 | mask_zero=mask_zero) 35 | else: 36 | return ImmutableEmbedding(n_embeddings, config.n_embed_dims, 37 | weights=[W], mask_zero=mask_zero, 38 | input_length=input_width) 39 | else: 40 | if config.train_embeddings is True: 41 | return Embedding(n_embeddings, config.n_embed_dims, 42 | init=config.embedding_init, 43 | W_constraint=maxnorm(config.embedding_max_norm), 44 | mask_zero=mask_zero, 45 | input_length=input_width) 46 | else: 47 | return ImmutableEmbedding(n_embeddings, config.n_embed_dims, 48 | init=config.embedding_init, 49 | mask_zero=mask_zero, 50 | input_length=input_width) 51 | 52 | def build_convolutional_layer(config): 53 | return Convolution1D(config.n_filters, config.filter_width, 54 | W_constraint=maxnorm(config.filter_max_norm), 55 | border_mode=config.border_mode, 56 | W_regularizer=l2(config.l2_penalty)) 57 | 58 | def build_pooling_layer(config, input_width=None, filter_width=None): 59 | 
try: 60 | input_width = config.input_width 61 | except AttributeError: 62 | assert input_width is not None 63 | 64 | try: 65 | filter_width = config.filter_width 66 | except AttributeError: 67 | assert filter_width is not None 68 | 69 | return MaxPooling1D( 70 | pool_length=input_width - filter_width + 1, 71 | stride=1) 72 | 73 | def build_dense_layer(config, n_hidden=None, activation='linear'): 74 | if n_hidden is None: 75 | n_hidden = config.n_hidden 76 | return Dense(n_hidden, 77 | W_regularizer=l2(config.l2_penalty), 78 | W_constraint=maxnorm(config.dense_max_norm), 79 | activation=activation) 80 | 81 | def build_hierarchical_softmax_layer(config): 82 | # This n_classes is different from the number of unique target values in 83 | # the training set. Hierarchical softmax assigns each word to a class 84 | # and decomposes the softmax into a prediction that's conditioned on 85 | # class membership. 86 | return HierarchicalSoftmax(config.n_classes, config.n_hsm_classes, 87 | batch_size=config.batch_size) 88 | 89 | def load_weights(config, model): 90 | if hasattr(config, 'model_weights') and config.model_weights is not None: 91 | print('Loading weights from %s' % config.model_weights) 92 | model.load_weights(config.model_weights) 93 | 94 | def build_optimizer(config): 95 | if config.optimizer == 'SGD': 96 | optimizer = SGD(lr=config.learning_rate, 97 | decay=config.decay, momentum=config.momentum, 98 | clipnorm=config.clipnorm) 99 | elif config.optimizer == 'Adam': 100 | optimizer = Adam(clipnorm=config.clipnorm) 101 | elif config.optimizer == 'RMSprop': 102 | optimizer = RMSprop(clipnorm=config.clipnorm) 103 | elif config.optimizer == 'Adadelta': 104 | optimizer = Adadelta(clipnorm=config.clipnorm) 105 | elif config.optimizer == 'Adagrad': 106 | optimizer = Adagrad(clipnorm=config.clipnorm) 107 | else: 108 | raise ValueError("don't know how to use optimizer {0}".format(config.optimizer)) 109 | 110 | return optimizer 111 | -------------------------------------------------------------------------------- /modeling/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import keras 4 | from keras.callbacks import Callback, EarlyStopping 5 | import keras.callbacks 6 | import numpy as np 7 | import six 8 | from sklearn.metrics import (classification_report, 9 | confusion_matrix, f1_score, fbeta_score) 10 | 11 | def predict(model, x, marshaller, batch_size=128): 12 | if isinstance(model, keras.models.Graph): 13 | if marshaller is None: 14 | raise ValueError("a marshaller is required with Graphs") 15 | x = marshaller.marshal(x) 16 | output = model.predict(x, batch_size=batch_size) 17 | y_hat = marshaller.unmarshal(output) 18 | y_hat = np.argmax(y_hat, axis=1) 19 | else: 20 | y_hat = model.predict_classes(x, verbose=0, batch_size=batch_size) 21 | return y_hat 22 | 23 | class PredictionCallback(Callback): 24 | def __init__(self, x, logger, marshaller=None, iteration_freq=10, batch_size=128): 25 | self.__dict__.update(locals()) 26 | self.callbacks = [] 27 | 28 | def add(self, callback): 29 | self.callbacks.append(callback) 30 | 31 | def _set_model(self, model): 32 | self.model = model 33 | for cb in self.callbacks: 34 | cb._set_model(model) 35 | 36 | def on_batch_begin(self, batch, logs={}): 37 | pass 38 | 39 | def on_batch_end(self, batch, logs={}): 40 | pass 41 | 42 | def on_epoch_begin(self, epoch, logs={}): 43 | pass 44 | 45 | def on_epoch_end(self, epoch, logs={}): 46 | if 'iteration' in logs.keys() and 
logs['iteration'] % self.iteration_freq != 0: 47 | # If we've broken a large training set into smaller chunks, we don't 48 | # need to run the classification report after every chunk. 49 | return 50 | 51 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size) 52 | logs['y_hat'] = y_hat 53 | for cb in self.callbacks: 54 | cb.on_epoch_end(epoch, logs) 55 | 56 | def on_train_begin(self, logs={}): 57 | pass 58 | 59 | def on_train_end(self, logs={}): 60 | pass 61 | 62 | class DelegatingMetricCallback(Callback): 63 | def __init__(self, x, y, logger, metric_name, delegate, marshaller=None, batch_size=128): 64 | self.__dict__.update(locals()) 65 | del self.self 66 | 67 | def _set_model(self, model): 68 | self.model = model 69 | self.delegate._set_model(model) 70 | 71 | def on_epoch_end(self, epoch, logs={}): 72 | try: 73 | y_hat = logs['y_hat'] 74 | except KeyError: 75 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size) 76 | metric = self.build_metric(logs) 77 | logs[self.metric_name] = metric(self.y, y_hat) 78 | self.logger('%s %.03f' % (self.metric_name, logs[self.metric_name])) 79 | self.delegate.on_epoch_end(epoch, logs) 80 | 81 | def build_metric(self, logs): 82 | return { 83 | 'val_loss': lambda y,y_hat: logs['val_loss'], 84 | 'val_acc': lambda y,y_hat: logs['val_acc'], 85 | 'val_f1': f1_score, 86 | 'val_f1': lambda y,y_hat: fbeta_score(y, y_hat, beta=0.5), 87 | 'val_f2': lambda y,y_hat: fbeta_score(y, y_hat, beta=2) 88 | }[self.metric_name] 89 | 90 | class ConfusionMatrix(Callback): 91 | def __init__(self, x, y, logger, marshaller=None, batch_size=128): 92 | self.__dict__.update(locals()) 93 | del self.self 94 | 95 | def on_epoch_end(self, epoch, logs={}): 96 | try: 97 | y_hat = logs['y_hat'] 98 | except KeyError: 99 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size) 100 | self.logger('\nConfusion matrix') 101 | self.logger(confusion_matrix(self.y, y_hat)) 102 | 103 | class ClassificationReport(Callback): 104 | def __init__(self, x, y, logger, target_names=None, marshaller=None, batch_size=128): 105 | self.__dict__.update(locals()) 106 | del self.self 107 | 108 | self.labels = np.arange(max(y)+1) 109 | 110 | if target_names is None: 111 | self.target_names = [str(t) for t in self.labels] 112 | else: 113 | self.target_names = [str(tn) for tn in target_names] 114 | 115 | def on_epoch_end(self, epoch, logs={}): 116 | try: 117 | y_hat = logs['y_hat'] 118 | except KeyError: 119 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size) 120 | 121 | self.logger('\nClassification report') 122 | self.logger(classification_report( 123 | self.y, y_hat, 124 | labels=self.labels, target_names=self.target_names)) 125 | 126 | class OptimizerMonitor(Callback): 127 | def __init__(self, logger): 128 | self.logger = logger 129 | 130 | def on_epoch_end(self, epoch, logs={}): 131 | if not hasattr(self.model.optimizer, 'lr'): 132 | return 133 | 134 | lr = self.model.optimizer.lr.get_value() 135 | optimizer_state = str({ 'lr': lr }) 136 | 137 | if 'iteration' in logs.keys(): 138 | self.logger("epoch {epoch} iteration {iteration} - optimizer state {optimizer_state}".format( 139 | epoch=epoch, iteration=logs['iteration'], optimizer_state=optimizer_state)) 140 | else: 141 | self.logger("epoch {epoch} - optimizer state {optimizer_state}".format( 142 | epoch=epoch, optimizer_state=optimizer_state)) 143 | 144 | class VersionedModelCheckpoint(Callback): 145 | def __init__(self, filepath, max_epochs=10000, 
**kwargs): 146 | kwargs['save_best_only'] = False 147 | self.delegate = keras.callbacks.ModelCheckpoint(filepath, **kwargs) 148 | self.filepath = filepath 149 | self.basepath, self.ext = os.path.splitext(filepath) 150 | self.epoch = 0 151 | width = int(np.log10(max_epochs)) + 1 152 | self.fmt_string = '{basepath}-{epoch:0' + str(width) + 'd}{ext}' 153 | 154 | def on_epoch_end(self, epoch, logs={}): 155 | logs['val_loss'] = -self.epoch 156 | self.delegate.on_epoch_end(epoch, logs) 157 | 158 | if os.path.exists(self.filepath): 159 | newpath = self.fmt_string.format( 160 | basepath=self.basepath, epoch=self.epoch, ext=self.ext) 161 | os.rename(self.filepath, newpath) 162 | self.epoch += 1 163 | 164 | def _set_model(self, model): 165 | self.model = model 166 | self.delegate._set_model(model) 167 | 168 | class SingleStepLearningRateSchedule(keras.callbacks.Callback): 169 | def __init__(self, patience=5, learning_rate_divisor=10.): 170 | self.patience = patience 171 | self.learning_rate_divisor = learning_rate_divisor 172 | self.best_loss = np.inf 173 | self.best_epoch = 0 174 | self.updated_lr = False 175 | 176 | def on_epoch_end(self, epoch, logs={}): 177 | if self.updated_lr: 178 | return 179 | 180 | if logs['val_loss'] < self.best_loss: 181 | self.best_loss = logs['val_loss'] 182 | self.best_epoch = epoch 183 | 184 | if epoch - self.best_epoch > self.patience: 185 | old_lr = self.model.optimizer.lr.get_value() 186 | new_lr = (old_lr / self.learning_rate_divisor).astype(np.float32) 187 | print('old_lr', old_lr, 'new_lr', new_lr) 188 | self.model.optimizer.lr.set_value(new_lr) 189 | self.learning_rate_divisor = 1. 190 | -------------------------------------------------------------------------------- /modeling/chainer_model.py: -------------------------------------------------------------------------------- 1 | import chainer 2 | import chainer.functions as F 3 | from chainer import optimizers 4 | 5 | class Model(object): 6 | def __init__(self, args): 7 | for k,v in vars(args).iteritems(): 8 | self.__dict__[k] = v 9 | self.init_params() 10 | self.init_optimizer() 11 | self.optimizer.setup(self.params) 12 | 13 | def init_optimizer(self): 14 | if self.optimizer == 'SGD': 15 | self.optimizer = optimizers.MomentumSGD( 16 | lr=self.learning_rate, momentum=self.momentum) 17 | elif self.optimizer == 'AdaDelta': 18 | self.optimizer = optimizers.AdaDelta() 19 | elif self.optimizer == 'AdaGrad': 20 | self.optimizer = optimizers.AdaGrad() 21 | elif self.optimizer == 'Adam': 22 | self.optimizer = optimizers.Adam() 23 | elif self.optimizer == 'RMSprop': 24 | self.optimizer = optimizers.RMSprop() 25 | 26 | def update(self): 27 | if hasattr(self, 'weight_decay'): 28 | if self.weight_decay > 0: 29 | self.optimizer.weight_decay(self.weight_decay) 30 | self.optimizer.update() 31 | 32 | def iteration(self, data, target, train=False): 33 | if train: 34 | self.optimizer.zero_grads() 35 | pred = self.forward(data) 36 | loss, metric = self.loss(pred, target) 37 | if train: 38 | loss.backward() 39 | self.update() 40 | return pred, loss, metric 41 | 42 | def fit(self, data, target): 43 | pred, loss, metric = self.iteration(data, target, train=True) 44 | return pred, loss, metric 45 | 46 | def evaluate(self, data, target): 47 | pred, loss, metric = self.iteration(data, target) 48 | return pred, loss, metric 49 | 50 | def init_params(self): 51 | raise NotImplementedError() 52 | 53 | def forward(self): 54 | raise NotImplementedError() 55 | 56 | def loss(self, pred, target): 57 | raise NotImplementedError() 58 | 59 | def 
predict(self, data, target=None): 60 | raise NotImplementedError() 61 | 62 | def predict_proba(self, data): 63 | raise NotImplementedError() 64 | 65 | def to_gpu(self): 66 | self.params.to_gpu() 67 | 68 | def to_cpu(self): 69 | self.params.to_cpu() 70 | 71 | class Classifier(Model): 72 | def loss(self, pred, target): 73 | target = chainer.Variable(target) 74 | loss = F.softmax_cross_entropy(pred, target) 75 | metric = F.accuracy(pred, target) 76 | return loss, metric 77 | 78 | def predict(self, data, target=None): 79 | pred = self.forward(data, train=False) 80 | if target is None: 81 | return np.argmax(F.softmax(pred).data, axis=1) 82 | else: 83 | loss, metric = self.loss(pred, target) 84 | return pred, loss, metric 85 | 86 | def predict_proba(self, data): 87 | pred = self.forward(data, train=False) 88 | return F.softmax(pred).data 89 | 90 | class Regressor(Model): 91 | def loss(self, pred, target): 92 | target = chainer.Variable(target) 93 | loss = F.mean_squared_error(pred, target) 94 | return loss, loss 95 | 96 | def predict(self, data, target=None): 97 | pred = self.forward(data, train=False) 98 | if target is None: 99 | return pred 100 | else: 101 | loss, metric = self.loss(pred, target) 102 | return pred, loss, metric 103 | -------------------------------------------------------------------------------- /modeling/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | 5 | class GraphMarshaller(object): 6 | """ 7 | Interface for classes that handle preparing inputs and unpacking 8 | outputs of Keras Graph models. 9 | """ 10 | def marshal(self, data, target=None): 11 | raise NotImplementedError() 12 | 13 | def unmsrhal(self, output): 14 | raise NotImplementedError() 15 | 16 | def split_data(hdf5_path, split_size, output_dir=None): 17 | """ 18 | Split the datasets in an HDF5 file into smaller sets and save them 19 | to new files. By default the files are put into a subdirectory of 20 | the directory containing `hdf5_path`. The subdirectory is created 21 | if it does not exist; the name of the directory is `hdf5_path` with 22 | the file suffix removed. To write to a different directory, provide 23 | the path to the existing directory in `output_dir`. 24 | 25 | Parameters 26 | ------- 27 | hdf5_path : str 28 | The path to the HDF5 file. 29 | split_size : int 30 | The size of the 31 | """ 32 | f = h5py.File(hdf5_path) 33 | n = 0 34 | # Find the largest n. 35 | for k,v in f.iteritems(): 36 | n = max(n, v.value.shape[0]) 37 | 38 | if output_dir is None: 39 | output_dir = os.path.splitext(hdf5_path)[0] 40 | os.mkdir(output_dir) 41 | 42 | # Copy subsequences of the data to smaller files. 43 | width = int(np.ceil(np.log10(n / split_size))) 44 | for i,j in enumerate(range(0, n, split_size)): 45 | outfile = '{dir}/{num:{fill}{width}}.h5'.format( 46 | dir=output_dir, num=i, fill='0', width=width) 47 | print(outfile) 48 | fout = h5py.File(outfile, 'w') 49 | for k,v in f.iteritems(): 50 | subset = v[j:j+split_size] 51 | fout.create_dataset(k, data=subset, dtype=v.dtype) 52 | fout.close() 53 | 54 | def balance_classes(target): 55 | """ 56 | Get a subset of the indices in the target variable of an imbalanced dataset 57 | such that each class has the same number of occurrences. This is to be used 58 | in conjunction with `balance_datasets` to create a balanced dataset. 59 | 60 | Parameters 61 | --------- 62 | target : array-like of int 63 | The target variable from which to sample. 
64 | """ 65 | n = min(np.bincount(target)) 66 | n_even = n/2 67 | indices = [] 68 | 69 | for code in np.arange(max(target)+1): 70 | mask = target == code 71 | idx = np.sort(np.where(mask)[0]) 72 | # Only sample from the even indices so the downsampled dataset 73 | # still consists of pairs of positive and negative examples. 74 | even_idx = idx[idx % 2 == 0] 75 | sampled_even_idx = np.sort(np.random.choice(even_idx, size=n_even, replace=False)) 76 | # Add the odd-numbered examples of errors. 77 | sampled_idx = np.concatenate([sampled_even_idx, sampled_even_idx+1]) 78 | sampled_idx = np.sort(sampled_idx) 79 | indices.extend(sampled_idx) 80 | 81 | return np.sort(indices) 82 | 83 | def balance_datasets(hdf5_file, key='original_word_code'): 84 | """ 85 | Balance the datasets in an HDF5 file. A balanced sample of 86 | the dataset denoted by `key` is taken. The corresponding 87 | examples from all other datasets are sampled, too. 88 | 89 | Parameters 90 | ----------- 91 | hdf5_file : h5py.File 92 | An open HDF5 file. 93 | key : str 94 | The key of the target variable in `hdf5_file` to balance. 95 | """ 96 | idx = balance_classes(hdf5_file[key].value) 97 | for key in hdf5_file.keys(): 98 | value = hdf5_file[key].value 99 | del hdf5_file[key] 100 | hdf5_file.create_dataset(key, data=value[idx], dtype=value.dtype) 101 | 102 | def mask_zero_for_rnn(hdf5_fh, n_vocab): 103 | """ 104 | Given an HDF5 data set with inputs `X` (the entire sentence), 105 | `Xwindow` (the window of words around e.g. a preposition), and 106 | `XwindowNULL` (the window of words as in `Xwindow` with the center 107 | word replaced by a nonce), transform the inputs as follows: 108 | 109 | a) Change 0 in every position before the end of the sentence to 110 | vocab_size + 1. 111 | b) Change 0 in every position after the beginning of the sentence 112 | to vocab_size + 1. 113 | 114 | Unmodified, the inputs `X`, etc., use 0 to indicate both that the 115 | word is unknown and that the sentence has ended (i.e. for padding 116 | a variable-length input like a sentence to fill all of the columns 117 | of a matrix). The reasons to change this is that (1) some models, 118 | like recurrent neural networks, pay attention to every detail of 119 | their input and (2) some frameworks, like Keras, allow you do mask 120 | out 0's, so the model gets less confused. 121 | 122 | The `len` key has the offset at which the sentence ends in `X`. 123 | 124 | The `window_position` key in the data set has the offset at which 125 | the preposition occurs in `X`. 126 | 127 | Parameters 128 | ------------ 129 | hdf5_fh : 130 | A open, writable HDF5 file. 131 | n_vocab : int 132 | The number of words in the model's vocabulary. 
133 | """ 134 | XRNN = renumber_unknowns_in_sentence( 135 | hdf5_fh['X'].value, 136 | hdf5_fh['len'].value, 137 | n_vocab) 138 | hdf5_fh.create_dataset('XRNN', data=XRNN, dtype=XRNN.dtype) 139 | 140 | XwindowRNN = renumber_unknowns_in_window( 141 | hdf5_fh['Xwindow'].value, 142 | hdf5_fh['window_position'].value, 143 | n_vocab) 144 | hdf5_fh.create_dataset('XwindowRNN', data=XwindowRNN, dtype=XwindowRNN.dtype) 145 | 146 | XwindowNULLRNN = renumber_unknowns_in_window( 147 | hdf5_fh['XwindowNULL'].value, 148 | hdf5_fh['window_position'].value, 149 | n_vocab) 150 | hdf5_fh.create_dataset('XwindowNULLRNN', data=XwindowNULLRNN, dtype=XwindowNULLRNN.dtype) 151 | 152 | return hdf5_fh 153 | 154 | def renumber_unknowns_in_sentence(X, lengths, n_vocab): 155 | """ 156 | So, to transform `X` as described in item (a) above, 157 | 158 | * Find every occurrence of a 0 before the end of a sentence, 159 | using `len` to determine where the sentence ends. 160 | * Replace those occurences with `n_vocab`. 161 | """ 162 | 163 | X = X.copy() 164 | for i,length in enumerate(lengths): 165 | sent = X[i] 166 | zeros_in_sent = [False] * X.shape[1] 167 | # Add 2 for leading '' and trailing ''. 168 | zeros_in_sent[:length+2] = sent[:length+2] == 0 169 | if np.any(zeros_in_sent): 170 | X[i, zeros_in_sent] = n_vocab 171 | return X 172 | 173 | def renumber_unknowns_in_window(Xwindow, window_positions, n_vocab): 174 | """ 175 | And to transform `Xwindow` and `XwindowNULL` for item (b), 176 | 177 | * Find every occurrence of a 0 after the beginning of a sentence 178 | using `window_position` to determine where in the window the 179 | sentence begins. If `window_position` is 0, the first two 180 | positions in the window will be 0, because the preposition in 181 | that case is the first word in the sentence and it appears at 182 | the center of the window (index 2, with windows of length 5). 183 | Those first two words must remain 0, as they indicate the 184 | absence of words. If `window_position` is 1, only the first 185 | word must remain 0; the word in the second position of the 186 | window could be 0 because it is out of vocabulary. And if 187 | `window_position` is 2, then the first two words, if 0, are 188 | 0 because they're out of vocabulary. Thus, the indices in the 189 | window that should be checked for the "zero because out of 190 | vocabulary" case start at max(0, 2-`window_position`). (NB: 191 | I didn't find any occurrences of `window_position` > `len`, 192 | just some occurrences of `window_position` == `len` - 2, 193 | which with sentence-terminating punctuation and the 194 | padding character at the end of each sentence just means 195 | that there are several sentences that end with a preposition. 196 | So we only need to deal with the beginning of the window.) 197 | * Replace those occurrences with `n_vocab`. 198 | """ 199 | Xwindow = Xwindow.copy() 200 | for i,window_position in enumerate(window_positions): 201 | window = Xwindow[i] 202 | start = max(0, 2 - window_position) 203 | zeros_in_window = window == 0 204 | zeros_in_window[0:start] = False 205 | if np.any(zeros_in_window): 206 | Xwindow[i, zeros_in_window] = n_vocab 207 | return Xwindow 208 | 209 | def create_window(sentence, position, size=7, nonce=None): 210 | """ 211 | Create a fixed-width window onto a sentence centered at some position. 212 | The sentence is assumed not to contain sentence-initial and -terminating 213 | markup (i.e. no '' element immediately before the start of the 214 | sentence and no '' immediately after its end). 
(If they were included 215 | in `sentence`, we would exclude them for backward compatibility with other 216 | preprocesing code.) It is also assumed not to be padded with trailing zeros. 217 | 218 | Parameters 219 | --------- 220 | sentence : np.ndarray 221 | An array of integers that represents a sentence. The integers 222 | are indices in a model's vocabulary. 223 | position : int 224 | The 0-based index of the word in the sentence on which the window 225 | should be centered. 226 | size : int 227 | The size of the window. Must be odd. 228 | nonce : int or None 229 | The index in the vocabulary of the nonce word to put at the 230 | center of the window, replacing the index of the existing word. 231 | When None, this does not occur. 232 | """ 233 | if position < 0 or position >= len(sentence): 234 | raise ValueError("`position` (%d) must lie within sentence (len=%d)" % 235 | (position, len(sentence))) 236 | 237 | # Get exactly the positions in `sentence` to copy to `window`. 238 | window_start = position - size/2 239 | window_end = position + size/2 240 | sent_range = np.arange(window_start, window_end+1) 241 | sent_mask = (sent_range >= 0) & (sent_range < len(sentence)) 242 | sent_indices = sent_range[sent_mask] 243 | 244 | window_range = np.arange(0, size) 245 | window_indices = window_range[sent_mask] 246 | 247 | #print('window_start', window_start, 'window_end', window_end, 'sent_range', sent_range, 'sent_mask', sent_mask, 'sent_indices', sent_indices, 'window_range', window_range, 'window_indices', window_indices, 'sentence', sentence, 'position', position) 248 | 249 | window = np.zeros(size) 250 | window[window_indices] = sentence[sent_indices] 251 | 252 | if nonce is not None: 253 | window[size/2] = nonce 254 | 255 | return window 256 | 257 | def create_windows(sentences, lengths, positions, size, nonce=None): 258 | windows = np.zeros((len(sentences), size)) 259 | for i, sentence in enumerate(sentences): 260 | length = lengths[i] 261 | position = positions[i] 262 | sentence_without_zero_padding = sentence[0:length+2] 263 | sentence_without_markup = sentence_without_zero_padding[1:-1] 264 | windows[i] = create_window( 265 | sentence_without_markup, 266 | position=position, 267 | size=size, 268 | nonce=nonce) 269 | return windows 270 | 271 | def add_window_dataset(hdf5_file, name, size, nonce=None, sentences_name='X'): 272 | sentences = hdf5_file[sentences_name].value 273 | lengths = hdf5_file['len'].value 274 | positions = hdf5_file['window_position'].value 275 | 276 | windows = create_windows(sentences, lengths, positions, size, nonce) 277 | hdf5_file.create_dataset(name, data=windows, dtype=np.int32) 278 | 279 | def create_contrasting_cases(X, seed=17, values=[7,8,10,12,13,17,18,19,27]): 280 | center_idx = int(X.shape[1]/2) 281 | rng = np.random.RandomState(seed) 282 | Xcc = np.zeros((X.shape[0]*2, X.shape[1]), dtype=X.dtype) 283 | 284 | for i in np.arange(len(X)): 285 | 286 | # Original example 287 | j = i * 2 288 | Xcc[j, :] = X[i, :] 289 | 290 | # Contrasting case 291 | cc = X[i, :].copy() 292 | 293 | while True: 294 | replacement_value = rng.choice(values) 295 | if replacement_value != cc[center_idx]: 296 | break 297 | 298 | cc[center_idx] = replacement_value 299 | Xcc[j+1, :] = cc 300 | 301 | return Xcc 302 | 303 | def duplicate_values(values): 304 | new_values = np.zeros(len(values)*2) 305 | for i,value in enumerate(values): 306 | j = i * 2 307 | new_values[j] = value 308 | new_values[j+1] = value 309 | return new_values 310 | 
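The windowing helpers at the end of modeling/data.py are what turn a full sentence of vocabulary indices into the fixed-width window inputs (Xwindow and XwindowNULL) described in the mask_zero_for_rnn docstring above. A minimal usage sketch follows; it assumes the package is importable as modeling.data and, like the rest of the repository, a Python 2 interpreter (create_window relies on integer division in size/2), and the word indices are invented for illustration.

    import numpy as np
    from modeling.data import create_window

    # A five-word "sentence" of vocabulary indices (values are made up).
    sentence = np.array([4, 9, 17, 23, 5])

    # A five-word window centred on the word at position 1; slots that fall
    # outside the sentence stay 0, the padding value used throughout the repo.
    create_window(sentence, position=1, size=5)
    # -> array([  0.,   4.,   9.,  17.,  23.])

    # The same window with the centre word replaced by a nonce index, as is
    # done to build the XwindowNULL inputs.
    create_window(sentence, position=1, size=5, nonce=99)
    # -> array([  0.,   4.,  99.,  17.,  23.])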
-------------------------------------------------------------------------------- /modeling/dataset.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from sklearn.utils import check_random_state 3 | import numpy as np 4 | from modeling.utils import balanced_class_weights 5 | from keras.utils import np_utils 6 | 7 | class HDF5FileDataset(object): 8 | def __init__(self, file_path, data_name, target_name, batch_size, one_hot=True, random_state=17): 9 | assert isinstance(data_name, (list,tuple)) 10 | assert isinstance(target_name, (list,tuple)) 11 | 12 | random_state = check_random_state(random_state) 13 | 14 | self.__dict__.update(locals()) 15 | del self.self 16 | 17 | self._load_data() 18 | self._check_data() 19 | 20 | def _load_data(self): 21 | self.hdf5_file = h5py.File(self.file_path) 22 | self.n_classes = {} 23 | for target_name in self.target_name: 24 | self.n_classes[target_name] = np.max(self.hdf5_file[target_name])+1 25 | 26 | def _check_data(self): 27 | self.n = None 28 | for data_name in self.data_name: 29 | if self.n is None: 30 | self.n = len(self.hdf5_file[data_name]) 31 | else: 32 | assert len(self.hdf5_file[data_name]) == self.n 33 | for target_name in self.target_name: 34 | assert len(self.hdf5_file[target_name]) == self.n 35 | 36 | def __getitem__(self, name): 37 | return self.hdf5_file[name].value 38 | 39 | def class_weights(self, class_weight_exponent, target): 40 | return balanced_class_weights( 41 | self.hdf5_file[target], 42 | 2, 43 | class_weight_exponent) 44 | 45 | def generator(self, one_hot=None, batch_size=None): 46 | if one_hot is None: one_hot = self.one_hot 47 | if batch_size is None: batch_size = self.batch_size 48 | 49 | while 1: 50 | idx = self.random_state.choice(self.n, size=batch_size, replace=False) 51 | batch = {} 52 | for data_name in self.data_name: 53 | batch[data_name] = self.hdf5_file[data_name].value[idx] 54 | for target_name in self.target_name: 55 | target = self.hdf5_file[target_name].value[idx] 56 | if one_hot: 57 | batch[target_name] = np_utils.to_categorical(target, 58 | self.n_classes[target_name]) 59 | else: 60 | batch[target_name] = target 61 | 62 | yield batch 63 | -------------------------------------------------------------------------------- /modeling/difference.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import unittest 5 | import numpy as np 6 | from theano import function 7 | import theano.tensor as T 8 | 9 | from keras.layers.core import Layer 10 | 11 | class TemporalDifference(Layer): 12 | """ 13 | Given a 3-tensor with shape (nb_samples, maxlen, output_dim), outputs 14 | the difference X[ 15 | """ 16 | def _get_output(self, X): 17 | return X[:, 1:, :] - X[:, 0:X.shape[1]-1, :] 18 | 19 | def get_output(self, train): 20 | return self._get_output(self.get_input(train)) 21 | 22 | def get_config(self): 23 | return {"name": self.__class__.__name__} 24 | 25 | class TestTemporalDifference(unittest.TestCase): 26 | def testForward(self): 27 | nb_examples = 2 28 | maxlen = 7 29 | output_dim = nb_word_dim = 5 30 | x = np.random.normal(size=(nb_examples, maxlen, output_dim)).astype(np.float32) 31 | expected = x[:, 1:, :] - x[:, 0:x.shape[1]-1, :] 32 | X = T.tensor3('X') 33 | retval = TemporalDifference()._get_output(X) 34 | f = function([X], retval) 35 | actual = f(x) 36 | self.assertTrue(np.allclose(actual, expected)) 37 | 
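The TemporalDifference layer's docstring in modeling/difference.py breaks off mid-sentence, but its implementation and the testForward unit test above pin down what it computes: for an input X of shape (nb_samples, maxlen, output_dim) it returns the first-order differences between consecutive timesteps, X[:, 1:, :] - X[:, :-1, :], so the output has maxlen - 1 timesteps. A small numpy-only sketch of the same computation:

    import numpy as np

    # First-order differences between consecutive timesteps of a
    # (samples, timesteps, dims) tensor, mirroring TemporalDifference.
    X = np.arange(2 * 4 * 3, dtype=np.float32).reshape(2, 4, 3)
    diff = X[:, 1:, :] - X[:, :-1, :]

    assert diff.shape == (2, 3, 3)   # one fewer timestep than the input
    assert np.allclose(diff, 3.0)    # consecutive timesteps here differ by 3 everywhere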
-------------------------------------------------------------------------------- /modeling/fbeta.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano.tensor as tt 3 | from theano import function 4 | 5 | eps = 1e-20 6 | 7 | def support(y): 8 | return y.sum(axis=0) 9 | 10 | def true_positive(y, y_hat): 11 | return (tt.eq(y_hat, y) & tt.eq(y, 1)).sum(axis=0) 12 | 13 | def make_y_diff(y, y_hat): 14 | return y_hat - y 15 | 16 | def false_positive(y_diff): 17 | return tt.eq(y_diff, 1).sum(axis=0) 18 | 19 | def true_negative(y_diff): 20 | return tt.eq(y_diff, 0).sum(axis=0) 21 | 22 | def false_negative(y_diff): 23 | return tt.eq(y_diff, -1).sum(axis=0) 24 | 25 | def precision(y, y_hat, eps=1e-9, y_diff=None): 26 | tp = true_positive(y, y_hat) 27 | if y_diff is None: 28 | y_diff = make_y_diff(y, y_hat) 29 | fp = false_positive(y_diff) 30 | return tp/(tp+fp+eps) 31 | 32 | def recall(y, y_hat, eps=1e-9, y_diff=None): 33 | tp = true_positive(y, y_hat) 34 | if y_diff is None: 35 | y_diff = make_y_diff(y, y_hat) 36 | fn = false_negative(y_diff) 37 | return tp/(tp+fn+eps) 38 | 39 | def fbeta_loss(y, y_hat, beta=0.5, eps=1e-9, average=None): 40 | """ 41 | Returns the negative of the F_beta measure, because the 42 | optimizer is trying to minimize the objective. 43 | """ 44 | y_diff = make_y_diff(y, y_hat) 45 | pr = precision(y, y_hat, eps=eps, y_diff=y_diff) 46 | rc = recall(y, y_hat, eps=eps, y_diff=y_diff) 47 | 48 | f_per_class = ( (1 + beta**2) * (pr * rc) ) / (beta**2 * pr + rc + eps) 49 | 50 | if average is None: 51 | f = f_per_class 52 | elif average == 'macro': 53 | f = f_per_class.mean() 54 | elif average == 'weighted': 55 | s = support(y) 56 | f = ((f_per_class * s) / s.sum()).sum() 57 | 58 | return -f 59 | 60 | 61 | y = tt.matrix('y', dtype='int64') 62 | y_hat = tt.matrix('y', dtype='int64') 63 | 64 | floss = fbeta_loss(y, y_hat, average='weighted') 65 | f = function([y, y_hat], floss) 66 | 67 | loss = f(np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0]]), 68 | np.array([[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]])) 69 | 70 | print("loss", loss) 71 | print("grad", tt.grad(loss, floss)) 72 | 73 | import numpy 74 | import theano 75 | import theano.tensor as T 76 | rng = numpy.random 77 | 78 | N = 400 79 | feats = 784 80 | D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2)) 81 | training_steps = 10000 82 | 83 | ########################################################################### 84 | # Declare Theano symbolic variables 85 | ########################################################################### 86 | 87 | x = T.matrix("x") 88 | y = T.vector("y") 89 | w = theano.shared(rng.randn(feats), name="w") 90 | b = theano.shared(0., name="b") 91 | 92 | print("Initial model:") 93 | print(w.get_value()) 94 | print(b.get_value()) 95 | 96 | ########################################################################### 97 | # Construct Theano expression graph 98 | ########################################################################### 99 | 100 | # Probability that target = 1 101 | p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) 102 | 103 | # The prediction thresholded 104 | prediction = p_1 > 0.5 105 | 106 | # Cross-entropy loss function 107 | xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) 108 | 109 | # The cost to minimize 110 | cost = xent.mean() + 0.01 * (w ** 2).sum() 111 | 112 | # Compute the gradient of the cost (we shall return to this in a following 113 | # section of this tutorial). 
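# (Annotation) T.grad returns the symbolic gradients of the scalar cost with
# respect to w and b; they drive the SGD update rules compiled into `train`
# below.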
114 | gw, gb = T.grad(cost, [w, b]) 115 | 116 | # Compile 117 | train = theano.function( 118 | inputs=[x,y], 119 | outputs=[prediction, xent], 120 | updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb))) 121 | predict = theano.function(inputs=[x], outputs=prediction) 122 | 123 | # Train 124 | for i in range(training_steps): 125 | pred, err = train(D[0], D[1]) 126 | 127 | print("Final model:") 128 | print(w.get_value()) 129 | print(b.get_value()) 130 | print("target values for D:") 131 | print(D[1]) 132 | print("prediction on D:") 133 | print(predict(D[0])) 134 | -------------------------------------------------------------------------------- /modeling/fbeta_predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import fbeta_score 3 | 4 | def make_default_targets(y, target_names): 5 | default_targets = [] 6 | for target in target_names[y]: 7 | # at-on => on-on 8 | # from-about => about-about 9 | s,t = target.split('-') 10 | default = '-'.join([t, t]) 11 | default_targets.append( 12 | np.where(target_names == default)[0][0]) 13 | return default_targets 14 | 15 | def predict_for_fbeta(y_hat_proba, default_targets, threshold=0.5, threshold_type='margin'): 16 | n = y_hat_proba.shape[0] 17 | y_hat_for_fbeta = np.zeros(n, dtype=np.int) 18 | 19 | if threshold_type not in ['margin', 'value']: 20 | raise ValueError('threshold_type must be either "margin" or "value"') 21 | 22 | for i in np.arange(n): 23 | most, next_most = np.argsort(y_hat_proba[i, :])[[-2,-1]] 24 | if threshold_type == 'margin': 25 | if y_hat_proba[i, most] - y_hat_proba[i, next_most] > threshold: 26 | y_hat_for_fbeta[i] = most 27 | else: 28 | y_hat_for_fbeta[i] = default_targets[most] 29 | elif threshold_type == 'value': 30 | if y_hat_proba[i, most] > threshold: 31 | y_hat_for_fbeta[i] = most 32 | else: 33 | y_hat_for_fbeta[i] = default_targets[most] 34 | 35 | return y_hat_for_fbeta 36 | -------------------------------------------------------------------------------- /modeling/lasagne_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lasagne 3 | import theano.tensor as T 4 | import theano 5 | 6 | class Model(object): 7 | def __init__(self, config): 8 | self.config = config 9 | 10 | self.input_var = self.build_input_var() 11 | self.target_var = self.build_target_var() 12 | 13 | self.model = self.build_model() 14 | 15 | self.train_output = lasagne.layers.get_output(self.model) 16 | self.train_loss = self.build_loss(self.train_output) 17 | self.params = lasagne.layers.get_all_params(self.model, trainable=True) 18 | self.updates = self.build_updates() 19 | 20 | self.test_output = lasagne.layers.get_output(self.model, 21 | deterministic=True) 22 | self.test_loss = self.build_loss(self.test_output) 23 | self.test_accuracy = T.eq( 24 | T.argmax(self.test_output, axis=1), self.target_var) 25 | self.test_accuracy = T.mean( 26 | self.test_accuracy, dtype=theano.config.floatX) 27 | 28 | self.train_fn = theano.function( 29 | [self.input_var, self.target_var], 30 | self.train_loss, 31 | updates=self.updates) 32 | 33 | self.val_fn = theano.function( 34 | [self.input_var, self.target_var], 35 | [self.test_loss, self.test_accuracy]) 36 | 37 | self.pred_fun = theano.function([self.input_var], self.test_output) 38 | 39 | def build_input_var(self): 40 | raise NotImplementedError() 41 | 42 | def build_target_var(self): 43 | raise NotImplementedError() 44 | 45 | def build_updates(self): 46 | raise 
NotImplementedError() 47 | 48 | def build_model(self): 49 | raise NotImplementedError() 50 | 51 | def fit(self, data, target): 52 | return self.train_fn(data, target) 53 | 54 | def evaluate(self, data, target): 55 | output = self.val_fn(data, target) 56 | return output[0], output[1] 57 | 58 | def predict(self, data): 59 | pred = self.pred_fun(data) 60 | return pred 61 | 62 | def save_weights(self, path): 63 | np.savez(path, *lasagne.layers.get_all_param_values(self.model)) 64 | 65 | def load_weights(self, path): 66 | with np.load(path) as f: 67 | params = [f['arr_%d' % i] for i in range(len(f.files))] 68 | lasagne.layers.set_all_param_values(self.model, params) 69 | 70 | class Classifier(Model): 71 | def build_loss(self, output): 72 | loss = lasagne.objectives.categorical_crossentropy( 73 | output, self.target_var) 74 | return loss.mean() 75 | 76 | class Regressor(Model): 77 | def build_loss(self, output): 78 | loss = lasagne.objectives.squared_error( 79 | output, self.target_var) 80 | return loss.mean() 81 | -------------------------------------------------------------------------------- /modeling/layers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import logging 3 | import numpy as np 4 | import theano.tensor as T 5 | import theano.tensor.nnet 6 | 7 | from keras.layers.embeddings import Embedding 8 | from keras.layers.convolutional import Convolution1D 9 | from keras.layers.core import Layer 10 | from keras import activations, initializations, regularizers, constraints 11 | 12 | from keras import backend as K 13 | 14 | logger = logging.getLogger() 15 | 16 | class ImmutableEmbedding(Embedding): 17 | ''' 18 | Same as Embedding except the weights are not parameters of the 19 | network. This can be useful when the layer is initialized with 20 | pre-trained embeddings, such as Word2Vec. 21 | 22 | @input_dim: size of vocabulary (highest input integer + 1) 23 | @output_dim: size of dense representation 24 | ''' 25 | def __init__(self, input_dim, output_dim, **kwargs): 26 | super(ImmutableEmbedding, self).__init__( 27 | input_dim, output_dim, **kwargs) 28 | self.params = [] 29 | 30 | def build(self): 31 | super(ImmutableEmbedding, self).build() 32 | self.params = [] 33 | 34 | class ImmutableConvolution1D(Convolution1D): 35 | ''' 36 | Same as Convolution1D except the convolutional filters are not 37 | parameters of the network. This can be useful when the layer 38 | is initialized with pre-trained convolutional filters. 
39 | 40 | @nb_filters: the number of convolutional filters 41 | @filter_width: the width of each filter 42 | ''' 43 | def __init__(self, nb_filters, filter_width, **kwargs): 44 | super(ImmutableConvolution1D, self).__init__( 45 | nb_filters, filter_width, **kwargs) 46 | self.params = [] 47 | 48 | def build(self): 49 | super(ImmutableConvolution1D, self).build() 50 | self.params = [] 51 | 52 | class Transpose(Layer): 53 | def __init__(self): 54 | super(Transpose, self).__init__() 55 | self.input = T.matrix() 56 | 57 | def _get_output(self, X): 58 | return X.T 59 | 60 | def get_output(self, train): 61 | return self._get_output(self.get_input(train)) 62 | 63 | def get_config(self): 64 | return {"name": self.__class__.__name__} 65 | 66 | class HierarchicalSoftmax(Layer): 67 | def __init__(self, output_dim, nb_hsm_classes, batch_size, 68 | init='glorot_uniform', 69 | W1_weights=None, W1_regularizer=None, W1_constraint=None, 70 | W2_weights=None, W2_regularizer=None, W2_constraint=None, 71 | b1_regularizer=None, b1_constraint=None, 72 | b2_regularizer=None, b2_constraint=None, 73 | input_dim=None, **kwargs): 74 | 75 | self.__dict__.update(locals()) 76 | del self.self 77 | 78 | self.init = initializations.get(init) 79 | #self.output_dim = nb_classes * nb_outputs_per_class 80 | self.nb_outputs_per_class = int(np.ceil(output_dim / float(nb_hsm_classes))) 81 | 82 | self.W1_regularizer = regularizers.get(W1_regularizer) 83 | self.b1_regularizer = regularizers.get(b1_regularizer) 84 | self.W2_regularizer = regularizers.get(W2_regularizer) 85 | self.b2_regularizer = regularizers.get(b2_regularizer) 86 | 87 | self.W1_constraint = constraints.get(W1_constraint) 88 | self.b1_constraint = constraints.get(b1_constraint) 89 | self.W2_constraint = constraints.get(W2_constraint) 90 | self.b2_constraint = constraints.get(b2_constraint) 91 | 92 | self.constraints = [self.W1_constraint, self.b1_constraint, 93 | self.W2_constraint, self.b2_constraint] 94 | 95 | #self.initial_weights = weights 96 | self.input_dim = input_dim 97 | if self.input_dim: 98 | kwargs['input_shape'] = (self.input_dim,) 99 | self.input = T.matrix() 100 | super(HierarchicalSoftmax, self).__init__(**kwargs) 101 | 102 | def build(self): 103 | #print('self.input_shape', self.input_shape) 104 | n_features = self.input_shape[1] 105 | 106 | self.W1 = self.init((n_features, self.nb_hsm_classes)) 107 | self.b1 = K.zeros((self.nb_hsm_classes,)) 108 | 109 | self.W2 = self.init((self.nb_hsm_classes, n_features, self.nb_outputs_per_class)) 110 | self.b2 = K.zeros((self.nb_hsm_classes, self.nb_outputs_per_class)) 111 | 112 | self.trainable_weights = [self.W1, self.b1, 113 | self.W2, self.b2] 114 | 115 | self.regularizers = [] 116 | if self.W1_regularizer: 117 | self.W1_regularizer.set_param(self.W1) 118 | self.regularizers.append(self.W1_regularizer) 119 | 120 | if self.b1_regularizer: 121 | self.b1_regularizer.set_param(self.b1) 122 | self.regularizers.append(self.b1_regularizer) 123 | 124 | if self.W2_regularizer: 125 | self.W2_regularizer.set_param(self.W2) 126 | self.regularizers.append(self.W2_regularizer) 127 | 128 | if self.b2_regularizer: 129 | self.b2_regularizer.set_param(self.b2) 130 | self.regularizers.append(self.b2_regularizer) 131 | 132 | @property 133 | def output_shape(self): 134 | print('HierarchicalSoftmax.output_shape', self.input_shape[0], self.output_dim) 135 | return (self.input_shape[0], self.output_dim) 136 | 137 | def _get_output(self, X): 138 | output = theano.tensor.nnet.h_softmax(X, 139 | #self.input_shape[1], 
self.output_dim, 140 | self.batch_size, self.output_dim, 141 | self.nb_hsm_classes, self.nb_outputs_per_class, 142 | self.W1, self.b1, 143 | self.W2, self.b2) 144 | return output 145 | 146 | def get_output(self, train=False): 147 | return self._get_output(self.get_input(train)) 148 | 149 | -------------------------------------------------------------------------------- /modeling/nonconvnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | import unittest 6 | import logging 7 | 8 | logger = logging.getLogger() 9 | 10 | from keras.layers.core import Layer 11 | from keras.utils.theano_utils import sharedX 12 | 13 | class SplitOutputByFilter(Layer): 14 | """ 15 | input: (batch_size, max_seq_len, n_filters * filter_width) 16 | output: (batch_size, n_filters, max_seq_len, filter_width) 17 | """ 18 | def __init__(self, n_filters, filter_width): 19 | super(SplitOutputByFilter, self).__init__() 20 | self.n_filters = n_filters 21 | self.filter_width = filter_width 22 | self.input = T.tensor3() 23 | 24 | def slice(self, i, X): 25 | start = i * self.filter_width 26 | end = (i+1) * self.filter_width 27 | return X[:, :, start:end] 28 | 29 | def _get_output(self, X): 30 | outputs, updates = theano.scan( 31 | fn=self.slice, 32 | outputs_info=None, 33 | sequences=[T.arange(self.n_filters)], 34 | non_sequences=X) 35 | return outputs.dimshuffle(1, 0, 2, 3) 36 | 37 | def get_output(self, train): 38 | return self._get_output(self.get_input(train)) 39 | 40 | def get_config(self): 41 | return {"name": self.__class__.__name__} 42 | 43 | class SlidingWindowL2MaxPooling(Layer): 44 | ''' 45 | input: (batch_size, n_filters, max_seq_len, filter_width) 46 | output: (batch_size, n_filters, filter_width, filter_width) 47 | ''' 48 | def __init__(self, batch_size, n_filters, filter_width, max_seq_len): 49 | super(SlidingWindowL2MaxPooling, self).__init__() 50 | self.batch_size = batch_size 51 | self.n_filters = n_filters 52 | self.filter_width = filter_width 53 | self.max_seq_len = max_seq_len 54 | 55 | def get_output(self, train): 56 | return self._get_output(self.get_input(train)) 57 | 58 | def _get_output(self, X): 59 | outputs, updates = theano.scan( 60 | fn=self.sample_dimension, 61 | sequences=[T.arange(self.batch_size)], 62 | non_sequences=X) 63 | return outputs 64 | 65 | def sample_dimension(self, i, X): 66 | ''' 67 | Takes a 4-tensor of shape `(batch_size, n_filters, max_seq_len, 68 | filter_width)` and an index into its first dimension. Returns the 69 | `(batch_size, n_filters, filter_width, filter_width)` subtensor 70 | with the greatest L2 norm along the third dimension. 71 | 72 | Parameters 73 | ---------- 74 | X : a 4-tensor 75 | An `(batch_size, n_filters, max_seq_len, filter_width)` tensor. 76 | i : int 77 | An index into the first dimension of `X`. 78 | 79 | Returns 80 | ---------- 81 | A 3-tensor of shape `(n_filters, filter_width, filter_width)` 82 | consisting of the subtensor of `X` with the greatest L2 norm along 83 | `X`'s third dimension (where `max_seq_len` lies). 84 | ''' 85 | outputs, updates = theano.scan( 86 | fn=self.filter_dimension, 87 | sequences=[T.arange(self.n_filters)], 88 | non_sequences=X[i, :, :, :]) 89 | 90 | return outputs 91 | 92 | def filter_dimension(self, i, X): 93 | ''' 94 | Takes a 3-tensor of shape `(n_filters, max_seq_len, filter_width)` 95 | and an index into its first dimension. 
Returns the 96 | `(filter_width, filter_width)` subtensor of `X` with the greatest 97 | L2 norm along the second dimension. 98 | 99 | Parameters 100 | ---------- 101 | X : a 3-tensor 102 | An `(batch_size, n_filters, max_seq_len, filter_width)` tensor. 103 | i : int 104 | An index into the first dimension of `X`. 105 | 106 | Returns 107 | ---------- 108 | A 2-tensor of shape `(filter_width, filter_width)` consisting 109 | of the subtensor of the i-th element along the first dimension 110 | of `X` with the greatest L2 norm along `X`'s second dimension 111 | (where `max_seq_len` lies). 112 | ''' 113 | norms, updates = theano.scan( 114 | fn=self.norm, 115 | sequences=[T.arange(self.max_seq_len)], 116 | non_sequences=X[i, :, :]) 117 | start_window = T.argmax(norms) 118 | end_window = start_window + self.filter_width 119 | return X[i, start_window:end_window, :] 120 | 121 | def norm(self, i, X): 122 | return (X[i:i+self.filter_width, :] ** 2).sum() 123 | 124 | class ZeroFillDiagonals(Layer): 125 | ''' 126 | input: (batch_size, n_filters, filter_width, filter_width) 127 | output: (batch_size, n_filters, filter_width, filter_width) with the 128 | diagonal of the last two `(filter_width, filter_width)` dimensions 129 | zeroed out. 130 | ''' 131 | def __init__(self, batch_size, n_filters, filter_width): 132 | super(ZeroFillDiagonals, self).__init__() 133 | self.batch_size = batch_size 134 | self.n_filters = n_filters 135 | self.filter_width = filter_width 136 | 137 | # Construct a shared boolean matrix by which to multiply the input 138 | # element-wise. It should be 0 everywhere except on the diagonals 139 | # of the last two dimensions. 140 | input_shape = (batch_size, n_filters, filter_width, filter_width) 141 | mask = np.ones(input_shape) 142 | diag_indices = np.arange(filter_width) 143 | for i in np.arange(batch_size): 144 | for j in np.arange(n_filters): 145 | mask[i, j, diag_indices, diag_indices] = 0 146 | self.mask = sharedX(mask, dtype='int32') 147 | 148 | def get_output(self, train): 149 | return self._get_output(self.get_input(train)) 150 | 151 | def _get_output(self, X): 152 | return X * self.mask 153 | -------------------------------------------------------------------------------- /modeling/outliers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | import os.path 6 | import cPickle 7 | from itertools import product 8 | 9 | import theano 10 | import pylearn2 11 | from pylearn2.config import yaml_parse 12 | 13 | import numpy as np 14 | from numpy.random import multivariate_normal as mvnormal 15 | from numpy.random import uniform 16 | from scipy.spatial.distance import pdist, squareform 17 | from scipy.stats import pearsonr 18 | 19 | import matplotlib.pyplot as plt 20 | from mpl_toolkits.mplot3d import axes3d 21 | 22 | from sklearn.covariance import MinCovDet, EmpiricalCovariance 23 | from sklearn.decomposition import PCA 24 | 25 | ########################################################################### 26 | # This class was useful for simulating data sets while developing 27 | # this script. 
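# (Annotation) The commented example below predates the final constructor
# signature: as written it would need to be
#     PMeansMultivariateNormal(means, cov, n)
# since __init__ takes (means, cov, size), and the save call would be
#     np.savetxt('simulated.csv', X)
# because np.savetxt expects the file name first.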
28 | # means = [[0, 0]] 29 | # cov = [[2, 1], [1, 2]] 30 | # n = 5000 31 | # mvn = PMeansMultivariateNormal(n, means, cov) 32 | # X = mvn.generate() 33 | # X.shape 34 | # np.savetxt(X, file='simulated.csv') 35 | ########################################################################### 36 | class PMeansMultivariateNormal(object): 37 | def __init__(self, means, cov, size): 38 | self.__dict__.update(locals()) 39 | del self.self 40 | #self.n = n 41 | #self.means = means 42 | #self.cov = cov 43 | 44 | def generate(self): 45 | return mvnormal(self.means, self.cov, self.size) 46 | ''' 47 | X = np.empty(shape=(self.n*len(self.means), 2)) 48 | for i, mean in enumerate(self.means): 49 | idx = range(i*self.n, i*self.n+self.n) 50 | x, y = mvnormal(mean, self.cov, self.n).T 51 | X[idx, 0] = x 52 | X[idx, 1] = y 53 | return X 54 | ''' 55 | 56 | def reconstruction_error(a, b): 57 | return ((a - b)**2).sum(axis=1) 58 | 59 | def train_autoencoder(dataset_path, nvis=2, nhid=2, act_enc=None, act_dec=None): 60 | yaml = open('outliers.yaml', 'r').read() 61 | if act_enc is None: 62 | act_enc = 'null' 63 | else: 64 | act_enc = "'" + act_enc + "'" 65 | 66 | if act_dec is None: 67 | act_dec = 'null' 68 | else: 69 | act_dec = "'" + act_dec + "'" 70 | 71 | params = { 72 | 'dataset_path': dataset_path, 73 | 'nvis': nvis, 74 | 'nhid': nhid, 75 | 'act_enc': act_enc, 76 | 'act_dec': act_dec, 77 | 'learning_rate': 0.05, 78 | 'save_path': 'outliers.pkl' 79 | } 80 | 81 | yaml = yaml % (params) 82 | 83 | train = yaml_parse.load(yaml) 84 | train.main_loop() 85 | 86 | pkl = open('outliers.pkl') 87 | return cPickle.load(pkl) 88 | 89 | class NullTransformer(object): 90 | def fit(self, X): 91 | pass 92 | 93 | def fit_transform(self, X): 94 | return X 95 | 96 | def transform(self, X): 97 | return X 98 | 99 | def main(): 100 | parser = argparse.ArgumentParser( 101 | description='Plot outlier-like distances for a 2-dimensional dataset') 102 | parser.add_argument( 103 | 'dataset', type=argparse.FileType('r'), 104 | help='a CSV file containing the dataset') 105 | parser.add_argument( 106 | '--plot', type=str, choices=['train', 'grid'], default='grid', 107 | help='plot the dataset or a grid evenly distributed over its span') 108 | parser.add_argument( 109 | '--plotdims', type=int, choices=[2, 3], default=2, 110 | help='the number of dimensions to plot') 111 | 112 | args = parser.parse_args() 113 | 114 | X = np.loadtxt(args.dataset, delimiter=',') 115 | fig = plt.figure() 116 | 117 | xformer = NullTransformer() 118 | 119 | if X.shape[1] > 2: 120 | xformer = PCA(n_components=2) 121 | X = xformer.fit_transform(X) 122 | 123 | if args.plotdims == 2: 124 | plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0') 125 | else: 126 | plt.scatter(X[:, 0], X[:, 1]) 127 | plt.show(block=False) 128 | 129 | path_to_script = os.path.realpath(__file__) 130 | dir_of_script = os.path.dirname(path_to_script) 131 | dataset_path = dir_of_script + '/outliers.npy' 132 | np.save(dataset_path, X) 133 | 134 | ########################################################################### 135 | # Train autoencoder with the n samples until convergence. Run 136 | # evenly distributed samples through the autoencoder and compute 137 | # their reconstruction error. 
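# The reconstruction errors computed here are compared below against
# Mahalanobis distances under both a robust (MinCovDet) and an empirical
# covariance estimate, and the script reports their Pearson correlations.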
138 | ########################################################################### 139 | 140 | maxseq_orig = np.max(X) 141 | minseq_orig = np.min(X) 142 | seqrange = np.abs(maxseq_orig - minseq_orig) 143 | maxseq = maxseq_orig + 0.5 * seqrange 144 | minseq = minseq_orig - 0.5 * seqrange 145 | print("minseq", minseq, "maxseq", maxseq) 146 | if args.plot == 'grid': 147 | seq = np.linspace(minseq, maxseq, num=50, endpoint=True) 148 | Xplot = np.array([_ for _ in product(seq, seq)]) 149 | else: 150 | Xplot = X 151 | 152 | robust_cov = MinCovDet().fit(X) 153 | robust_md = robust_cov.mahalanobis(Xplot) 154 | 155 | empirical_cov = EmpiricalCovariance().fit(X) 156 | empirical_md = empirical_cov.mahalanobis(Xplot) 157 | 158 | # Assume Xplot is at least 2-dimensional. 159 | if Xplot.shape[1] > 2: 160 | Xplot2d = bh_sne(Xplot) 161 | else: 162 | Xplot2d = Xplot 163 | 164 | robust_md01 = robust_md - np.nanmin(robust_md) 165 | robust_md01 = robust_md01 / np.nanmax(robust_md01) 166 | 167 | empirical_md01 = empirical_md - np.nanmin(empirical_md) 168 | empirical_md01 = empirical_md01 / np.nanmax(empirical_md01) 169 | 170 | fig = plt.figure() 171 | if args.plotdims == 2: 172 | ax = fig.add_subplot(1, 1, 1) 173 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 174 | cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0') 175 | else: 176 | ax = fig.add_subplot(1, 1, 1, projection='3d') 177 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01, 178 | cmap=plt.cm.jet, color=robust_md01) 179 | ax.set_zlabel('Mahalanobis distance') 180 | ax.set_xlabel('x') 181 | ax.set_ylabel('y') 182 | ax.set_title('Mahalanobis distance (robust covariance)') 183 | 184 | fig = plt.figure() 185 | if args.plotdims == 2: 186 | ax = fig.add_subplot(1, 1, 1) 187 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 188 | cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0') 189 | else: 190 | ax = fig.add_subplot(1, 1, 1, projection='3d') 191 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01, 192 | cmap=plt.cm.jet, color=empirical_md01) 193 | ax.set_zlabel('Mahalanobis distance') 194 | 195 | ax.set_xlabel('x') 196 | ax.set_ylabel('y') 197 | ax.set_title('Mahalanobis distance (empirical covariance)') 198 | 199 | enc_dec = [ 200 | # tanh encoder, linear decoder 201 | ['tanh', 'linear'], 202 | # sigmoid encoder, linear decoder 203 | ['sigmoid', 'linear'], 204 | ####################################################################### 205 | # The reconstruction error of the autoencoders trained with the 206 | # remaining commented-out pairs don't seem to match Mahalanobis 207 | # distance very well. Feel free to uncomment them to see for 208 | # yourself. 
209 | # linear encoder, linear decoder 210 | # ['linear', 'linear'], 211 | # tanh encoder, tanh decoder 212 | # ['tanh', 'tanh'], 213 | # tanh encoder, sigmoid decoder 214 | # ['tanh', 'sigmoid'], 215 | # sigmoid encoder, tanh decoder 216 | # ['sigmoid', 'tanh'], 217 | # sigmoid encoder, sigmoid decoder 218 | # ['sigmoid', 'sigmoid'] 219 | ####################################################################### 220 | ] 221 | 222 | for i, act in enumerate(enc_dec): 223 | enc, dec = act 224 | if dec == 'linear': 225 | dec = None 226 | model = train_autoencoder(dataset_path, 227 | act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16) 228 | 229 | Xshared = theano.shared( 230 | np.asarray(Xplot, dtype=theano.config.floatX), borrow=True) 231 | f = theano.function([], outputs=model.reconstruct(Xshared)) 232 | fit = f() 233 | error = reconstruction_error(Xplot, fit) 234 | 235 | error01 = error - np.nanmin(error) 236 | error01 = error01 / np.nanmax(error01) 237 | 238 | fig = plt.figure() 239 | if args.plotdims == 2: 240 | ax = fig.add_subplot(1, 1, 1) 241 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1], 242 | cmap=plt.cm.jet, c=error, s=60, linewidth='0') 243 | else: 244 | ax = fig.add_subplot(1, 1, 1, projection='3d') 245 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error, 246 | cmap=plt.cm.jet, color=error01) 247 | ax.set_zlabel('Reconstruction error') 248 | 249 | ax.set_xlabel('x') 250 | ax.set_ylabel('y') 251 | encdec_type = ', '.join(act) 252 | ax.set_title('Reconstruction error (' + encdec_type + ')') 253 | 254 | print("Correlation of robust MD and reconstruction error (" + 255 | str(encdec_type) + ") " + str(pearsonr(robust_md, error))) 256 | print("Correlation of empirical MD and reconstruction error (" + 257 | str(encdec_type) + ") " + str(pearsonr(empirical_md, error))) 258 | 259 | print("Correlation of robust MD and empirical MD " + 260 | str(pearsonr(robust_md, empirical_md))) 261 | 262 | os.remove(dataset_path) 263 | os.remove('outliers.pkl') 264 | 265 | plt.show(block=True) 266 | 267 | if __name__ == '__main__': 268 | sys.exit(main()) 269 | -------------------------------------------------------------------------------- /modeling/parser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import numpy 4 | 5 | def kvpair(s): 6 | try: 7 | k,v = s.split('=') 8 | if '.' in v: 9 | try: 10 | v = float(v) 11 | except ValueError: 12 | pass 13 | else: 14 | try: 15 | v = int(v) 16 | except ValueError: 17 | pass 18 | return k,v 19 | except: 20 | raise argparse.ArgumentTypeError( 21 | '--model-cfg arguments must be KEY=VALUE pairs') 22 | 23 | def build_chainer(): 24 | parser = build() 25 | parser.add_argument('--gpu', '-g', default=-1, type=int, 26 | help='GPU ID (negative value indicates CPU)') 27 | return parser 28 | 29 | def build_keras(): 30 | parser = build() 31 | return parser 32 | 33 | def build_lasagne(): 34 | parser = build() 35 | parser.add_argument('--progress', action='store_true', 36 | help='Whether to display a progress for training and validation') 37 | return parser 38 | 39 | def build(): 40 | parser = argparse.ArgumentParser( 41 | description='Train a model.') 42 | parser.add_argument('model_dir', metavar="MODEL_DIR", type=str, 43 | help='The base directory of this model. Must contain a model.py (model code) and a model.json (hyperparameters). 
Model configuration and weights are saved to model_dir/UUID.') 44 | parser.add_argument('--model-cfg', type=kvpair, nargs='+', default=[], 45 | help='Model hyper-parameters as KEY=VALUE pairs; overrides parameters in MODEL_DIR/model.json') 46 | parser.add_argument('--model-dest', type=str, default='', 47 | help='Directory to which to copy model.py and model.json. This overrides copying to model_dir/UUID.') 48 | parser.add_argument( 49 | '--mode', type=str, 50 | choices=['transient', 'persistent', 'persistent-background'], 51 | default='persistent', 52 | help='How to run the model; in "transient" mode, output goes to the console and the model is not saved; in "persistent" mode, output goes to the console and the model is saved; in "persistent-background" mode, output goes to the model.log file and the model is saved. The default is "persistent"') 53 | 54 | return parser 55 | -------------------------------------------------------------------------------- /modeling/preprocess.py: -------------------------------------------------------------------------------- 1 | class NullPreprocessor(object): 2 | def __init__(self): 3 | pass 4 | 5 | def fit(self, X, y=None): 6 | pass 7 | 8 | def transform(self, X, y=None): 9 | if y is None: 10 | return X 11 | else: 12 | return X, y 13 | 14 | def fit_transform(self, X, y=None): 15 | return self.transform(X, y) 16 | -------------------------------------------------------------------------------- /modeling/residual.py: -------------------------------------------------------------------------------- 1 | from keras.models import Sequential, Graph 2 | from keras.layers.core import Dense, Activation, Layer, Dropout 3 | from keras.activations import relu 4 | 5 | class Identity(Layer): 6 | def get_output(self, train): 7 | return self.get_input(train) 8 | 9 | def build_residual_block(name, input_shape, n_hidden, n_skip=2): 10 | """ 11 | Rough sketch of building blocks of layers for residual learning. 12 | See http://arxiv.org/abs/1512.03385 for motivation. 13 | """ 14 | block = Graph() 15 | input_name = 'x' 16 | block.add_input(input_name, input_shape=input_shape) 17 | 18 | # The current keras graph implementation doesn't allow you to connect 19 | # an input node to an output node. Use Identity to work around that. 
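# (Annotation) Identity simply forwards its input; it gives the block's input
# a named graph node that can later be merged with the transformed path via
# the summed output added at the end of this function.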
20 | block.add_node(Identity(), name=name+'identity', input=input_name) 21 | 22 | prev_output = input_name 23 | for i in range(n_skip): 24 | layer_name = 'h' + str(i) 25 | l = Dense(n_hidden, activation='relu') 26 | block.add_node(l, name=layer_name, input=prev_output) 27 | prev_output = layer_name 28 | if i < n_skip: 29 | block.add_node(Dropout(0.5), name=layer_name+'do', input=layer_name) 30 | prev_output = layer_name+'do' 31 | 32 | block.add_output(name=name+'output', inputs=[name+'identity', prev_output], merge_mode='sum') 33 | 34 | return block 35 | -------------------------------------------------------------------------------- /models/keras/attention/model.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "train_embeddings": true, 4 | "regularization_layer": "", 5 | "dropout_p": 0.5, 6 | "dropout_p_conv": 0.0, 7 | "n_embed_dims": 25, 8 | "loss": "categorical_crossentropy", 9 | "patience": 20, 10 | "batch_size": 128, 11 | "decay": 0.0, 12 | "embedding_max_norm": 1000, 13 | "filter_max_norm": 1000, 14 | "dense_max_norm": 1000, 15 | "l2_penalty": 0.0, 16 | "clipnorm": 0, 17 | "truncate_gradient": -1 18 | } 19 | -------------------------------------------------------------------------------- /models/keras/attention/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.setrecursionlimit(5000) 3 | import json 4 | import h5py 5 | 6 | import numpy as np 7 | 8 | from keras.models import Sequential, Graph 9 | from keras.layers.core import (Layer, Dense, Activation, Dropout, 10 | TimeDistributedDense, TimeDistributedMerge, 11 | Flatten, Reshape) 12 | from keras.layers.normalization import BatchNormalization 13 | from keras.layers.recurrent import LSTM, GRU 14 | from keras.layers.embeddings import Embedding 15 | from keras.constraints import maxnorm 16 | from keras.regularizers import l2 17 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop 18 | 19 | from modeling.layers import ImmutableEmbedding 20 | from modeling.difference import TemporalDifference 21 | from modeling.builders import (build_embedding_layer, 22 | build_convolutional_layer, build_pooling_layer, 23 | build_dense_layer, build_optimizer, load_weights) 24 | 25 | def error_free_examples(path): 26 | f = h5py.File(path) 27 | # Target_code is 0 when the preposition in the example is the original 28 | # preposition in the corpus and 1 when the preposition has been randomly 29 | # replaced with another one in the confusion set. 30 | idx = f['target_code'].value == 0 31 | f.close() 32 | return idx 33 | 34 | class Identity(Layer): 35 | def get_output(self, train): 36 | return self.get_input(train) 37 | 38 | class Transpose(Layer): 39 | def get_output(self, train): 40 | return self.get_input(train).T 41 | 42 | def build_model(args): 43 | np.random.seed(args.seed) 44 | 45 | graph = Graph() 46 | 47 | graph.add_input('input', input_shape=(args.input_width,), dtype='int') 48 | 49 | graph.add_node(build_embedding_layer(args), 50 | input='input', name='embedding') 51 | 52 | graph.add_node(LSTM(args.n_units, 53 | truncate_gradient=args.truncate_gradient, 54 | return_sequences=True), 55 | input='embedding', name='lstm0') 56 | 57 | graph.add_node(LSTM(args.n_units, 58 | truncate_gradient=args.truncate_gradient, 59 | return_sequences=True), 60 | input='lstm0', name='lstm1') 61 | 62 | # Attention module. 
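# (Annotation) The three TimeDistributedDense layers produce, for every
# timestep, a softmax-normalized gate over the LSTM units.  The gates are
# multiplied elementwise into the lstm1 outputs (merge_mode='mul') and the
# gated sequence is summed over time by TimeDistributedMerge(mode='sum'),
# yielding a fixed-length vector for the final softmax classifier.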
63 | graph.add_node(TimeDistributedDense(args.n_units, activation='relu'), 64 | input='lstm1', name='attention0') 65 | graph.add_node(TimeDistributedDense(args.n_units, activation='relu'), 66 | input='attention0', name='attention1') 67 | graph.add_node(TimeDistributedDense(args.n_units, activation='softmax'), 68 | input='attention1', name='attention2') 69 | 70 | # Apply mask from output of attention module to LSTM output. 71 | graph.add_node(TimeDistributedMerge(mode='sum'), 72 | inputs=['lstm1', 'attention2'], 73 | name='applyattn', 74 | merge_mode='mul') 75 | 76 | graph.add_node(Dense(args.n_classes, activation='softmax'), 77 | input='applyattn', name='softmax') 78 | 79 | graph.add_output(input='softmax', name='output') 80 | 81 | load_weights(args, graph) 82 | 83 | optimizer = build_optimizer(args) 84 | 85 | graph.compile(loss={'output': args.loss}, optimizer=optimizer) 86 | 87 | return graph 88 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/4e0ae5dc683611e5950afcaa149e39ea/model.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import numpy as np 4 | 5 | from keras.models import Sequential 6 | from keras.layers.core import Dense, Dropout, Activation, Flatten 7 | from keras.layers.normalization import BatchNormalization 8 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 9 | from keras.layers.embeddings import Embedding 10 | from keras.constraints import maxnorm 11 | from keras.regularizers import l2 12 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop 13 | 14 | from modeling.layers import ImmutableEmbedding 15 | from modeling.difference import TemporalDifference 16 | 17 | def build_model(args): 18 | print("args", vars(args)) 19 | 20 | np.random.seed(args.seed) 21 | 22 | model = Sequential() 23 | 24 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None: 25 | W = np.load(args.embedding_weights) 26 | if args.train_embeddings is True or args.train_embeddings == 'true': 27 | model.add(Embedding(args.n_vocab, args.n_word_dims, 28 | weights=[W], input_length=args.input_width, 29 | W_constraint=maxnorm(args.embedding_max_norm))) 30 | else: 31 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims, 32 | weights=[W], input_length=args.input_width)) 33 | else: 34 | model.add(Embedding(args.n_vocab, args.n_word_dims, 35 | W_constraint=maxnorm(args.embedding_max_norm), 36 | input_length=args.input_width)) 37 | 38 | if args.use_difference: 39 | model.add(TemporalDifference()) 40 | 41 | model.add(Convolution1D(args.n_filters, args.filter_width, 42 | W_constraint=maxnorm(args.filter_max_norm), 43 | border_mode=args.border_mode, 44 | W_regularizer=l2(args.l2_penalty), 45 | activation='relu')) 46 | #if 'normalization' in args.regularization_layer: 47 | # model.add(BatchNormalization( 48 | # (args.input_width-args.filter_width+1, args.n_filters))) 49 | #model.add(Activation('relu')) 50 | 51 | model.add(MaxPooling1D( 52 | pool_length=args.input_width - args.filter_width + 1, 53 | stride=1, ignore_border=False)) 54 | model.add(Flatten()) 55 | 56 | if 'dropout' in args.regularization_layer: 57 | model.add(Dropout(args.dropout_p_conv)) 58 | if 'normalization' in args.regularization_layer: 59 | model.add(BatchNormalization()) 60 | 61 | model.add(Dense(2*args.n_filters, 62 | W_regularizer=l2(args.l2_penalty), 63 | activation='relu')) 64 | if 'dropout' in args.regularization_layer: 65 | 
model.add(Dropout(args.dropout_p)) 66 | if 'normalization' in args.regularization_layer: 67 | model.add(BatchNormalization()) 68 | 69 | model.add(Dense(2*args.n_filters, 70 | W_regularizer=l2(args.l2_penalty), 71 | activation='relu')) 72 | if 'dropout' in args.regularization_layer: 73 | model.add(Dropout(args.dropout_p)) 74 | if 'normalization' in args.regularization_layer: 75 | model.add(BatchNormalization()) 76 | 77 | model.add(Dense(2*args.n_filters, 78 | W_regularizer=l2(args.l2_penalty), 79 | activation='relu')) 80 | if 'dropout' in args.regularization_layer: 81 | model.add(Dropout(args.dropout_p)) 82 | if 'normalization' in args.regularization_layer: 83 | model.add(BatchNormalization()) 84 | 85 | model.add(Dense(args.n_classes, 86 | W_regularizer=l2(args.l2_penalty), 87 | activation='softmax')) 88 | #if 'normalization' in args.regularization_layer: 89 | # model.add(BatchNormalization((args.n_classes,))) 90 | 91 | if args.optimizer == 'SGD': 92 | optimizer = SGD(lr=args.learning_rate, 93 | decay=args.decay, momentum=args.momentum, 94 | clipnorm=args.clipnorm) 95 | elif args.optimizer == 'Adam': 96 | optimizer = Adam(clipnorm=args.clipnorm) 97 | elif args.optimizer == 'RMSprop': 98 | optimizer = RMSprop(clipnorm=args.clipnorm) 99 | elif args.optimizer == 'Adadelta': 100 | optimizer = Adadelta(clipnorm=args.clipnorm) 101 | elif args.optimizer == 'Adagrad': 102 | optimizer = Adagrad(clipnorm=args.clipnorm) 103 | else: 104 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer)) 105 | 106 | if hasattr(args, 'model_weights'): 107 | print('Checking for weights file ' + str(args.model_weights)) 108 | if os.path.exists(args.model_weights): 109 | print('Loading weights') 110 | model.load_weights(args.model_weights) 111 | 112 | print('Compiling') 113 | model.compile(loss=args.loss, optimizer=optimizer) 114 | 115 | return model 116 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/4e0ae5dc683611e5950afcaa149e39ea/model_old_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout, Activation, Flatten 5 | from keras.layers.normalization import BatchNormalization 6 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 7 | from keras.layers.embeddings import Embedding 8 | from keras.constraints import maxnorm 9 | from keras.regularizers import l2 10 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop 11 | 12 | from modeling.layers import ImmutableEmbedding 13 | from modeling.difference import TemporalDifference 14 | 15 | def build_model(args): 16 | print("args", vars(args)) 17 | 18 | np.random.seed(args.seed) 19 | 20 | model = Sequential() 21 | 22 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None: 23 | W = np.load(args.embedding_weights) 24 | if args.train_embeddings: 25 | model.add(Embedding(args.n_vocab, args.n_word_dims, 26 | weights=[W], 27 | W_constraint=maxnorm(args.embedding_max_norm))) 28 | else: 29 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims, 30 | weights=[W])) 31 | else: 32 | model.add(Embedding(args.n_vocab, args.n_word_dims, 33 | W_constraint=maxnorm(args.embedding_max_norm))) 34 | 35 | if args.use_difference: 36 | model.add(TemporalDifference()) 37 | 38 | model.add(Convolution1D(args.n_word_dims, args.n_filters, args.filter_width, 39 | 
W_constraint=maxnorm(args.filter_max_norm), 40 | border_mode=args.border_mode, 41 | W_regularizer=l2(args.l2_penalty))) 42 | #if 'normalization' in args.regularization_layer: 43 | # model.add(BatchNormalization( 44 | # (args.input_width-args.filter_width+1, args.n_filters))) 45 | model.add(Activation('relu')) 46 | 47 | model.add(MaxPooling1D( 48 | pool_length=args.input_width - args.filter_width + 1, 49 | stride=None, ignore_border=False)) 50 | model.add(Flatten()) 51 | if 'dropout' in args.regularization_layer: 52 | model.add(Dropout(args.dropout_p_conv)) 53 | if 'normalization' in args.regularization_layer: 54 | model.add(BatchNormalization((args.n_filters,))) 55 | 56 | model.add(Dense(args.n_filters, 2*args.n_filters, 57 | W_regularizer=l2(args.l2_penalty))) 58 | model.add(Activation('relu')) 59 | if 'dropout' in args.regularization_layer: 60 | model.add(Dropout(args.dropout_p)) 61 | if 'normalization' in args.regularization_layer: 62 | model.add(BatchNormalization((2*args.n_filters,))) 63 | 64 | model.add(Dense(2*args.n_filters, 2*args.n_filters)) 65 | model.add(Activation('relu')) 66 | if 'dropout' in args.regularization_layer: 67 | model.add(Dropout(args.dropout_p)) 68 | if 'normalization' in args.regularization_layer: 69 | model.add(BatchNormalization((2*args.n_filters,))) 70 | 71 | model.add(Dense(2*args.n_filters, 2*args.n_filters, 72 | W_regularizer=l2(args.l2_penalty))) 73 | model.add(Activation('relu')) 74 | if 'dropout' in args.regularization_layer: 75 | model.add(Dropout(args.dropout_p)) 76 | if 'normalization' in args.regularization_layer: 77 | model.add(BatchNormalization((2*args.n_filters,))) 78 | 79 | model.add(Dense(2*args.n_filters, args.n_classes, 80 | W_regularizer=l2(args.l2_penalty))) 81 | #if 'normalization' in args.regularization_layer: 82 | # model.add(BatchNormalization((args.n_classes,))) 83 | model.add(Activation('softmax')) 84 | 85 | if args.optimizer == 'SGD': 86 | optimizer = SGD(lr=args.learning_rate, 87 | decay=args.decay, momentum=args.momentum, 88 | clipnorm=args.clipnorm) 89 | elif args.optimizer == 'Adam': 90 | optimizer = Adam(clipnorm=args.clipnorm) 91 | elif args.optimizer == 'RMSprop': 92 | optimizer = RMSprop(clipnorm=args.clipnorm) 93 | elif args.optimizer == 'Adadelta': 94 | optimizer = Adadelta(clipnorm=args.clipnorm) 95 | elif args.optimizer == 'Adagrad': 96 | optimizer = Adagrad(clipnorm=args.clipnorm) 97 | else: 98 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer)) 99 | 100 | if hasattr(args, 'model_weights') and args.model_weights is not None: 101 | model.load_weights(args.model_weights) 102 | 103 | model.compile(loss=args.loss, optimizer=optimizer) 104 | 105 | return model 106 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/model-word2vec.json: -------------------------------------------------------------------------------- 1 | { 2 | "embedding_weights": "data/prepositions-weights.npy", 3 | "train_embeddings": false, 4 | "regularization_layer": "normalization", 5 | "n_word_dims": 300, 6 | "border_mode": "valid", 7 | "use_difference": true, 8 | "n_filters": 1000, 9 | "filter_width": 3, 10 | "loss": "categorical_crossentropy", 11 | "patience": 30, 12 | "batch_size": 128, 13 | "optimizer": "SGD", 14 | "learning_rate": 0.001, 15 | "momentum": 0.9, 16 | "decay": 0.0, 17 | "embedding_max_norm": 1000, 18 | "filter_max_norm": 1000, 19 | "dense_max_norm": 1000, 20 | "l2_penalty": 0.0, 21 | "clipnorm": 0 22 | } 23 | 
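(Annotation) The JSON files in this directory hold default hyperparameters; per modeling/parser.py, any entry can be overridden on the command line with --model-cfg KEY=VALUE pairs. train_keras.py is not reproduced in this listing, so the following is only a minimal sketch of that merge; the helper name load_model_config is chosen here for illustration.

    import json

    def load_model_config(model_dir, overrides):
        # `overrides` is a list of (key, value) pairs, e.g. as produced by
        # modeling.parser.kvpair.
        with open('%s/model.json' % model_dir) as f:
            cfg = json.load(f)
        cfg.update(dict(overrides))  # --model-cfg values take precedence
        return cfg

    cfg = load_model_config('models/keras/preposition/convnet',
                            [('n_filters', 1000), ('train_embeddings', False)])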
-------------------------------------------------------------------------------- /models/keras/preposition/convnet/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_embeddings": true, 3 | "regularization_layer": "", 4 | "dropout_p": 0.5, 5 | "dropout_p_conv": 0.0, 6 | "n_word_dims": 50, 7 | "border_mode": "valid", 8 | "use_difference": false, 9 | "n_filters": 500, 10 | "n_hidden": 500, 11 | "filter_width": 4, 12 | "loss": "categorical_crossentropy", 13 | "patience": 20, 14 | "batch_size": 128, 15 | "optimizer": "SGD", 16 | "learning_rate": 0.001, 17 | "momentum": 0.9, 18 | "decay": 0.0, 19 | "embedding_max_norm": 1000, 20 | "filter_max_norm": 1000, 21 | "dense_max_norm": 1000, 22 | "l2_penalty": 0.0, 23 | "clipnorm": 0 24 | } 25 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.setrecursionlimit(5000) 3 | import json 4 | import h5py 5 | 6 | import numpy as np 7 | 8 | from keras.models import Sequential, Graph 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 12 | from keras.layers.embeddings import Embedding 13 | from keras.constraints import maxnorm 14 | from keras.regularizers import l2 15 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop 16 | 17 | from modeling.layers import ImmutableEmbedding 18 | from modeling.difference import TemporalDifference 19 | from modeling.builders import (build_embedding_layer, 20 | build_convolutional_layer, build_pooling_layer, 21 | build_dense_layer, build_optimizer, load_weights) 22 | 23 | class EncartaExamplesWithOKWindows(): 24 | def __init__(self, seed=17): 25 | self.random_state = np.random.RandomState(seed=seed) 26 | self.prepositions = set([7, 8, 10, 12, 13, 17, 18, 19, 27]) 27 | 28 | def fit_transform(self, X, y=None): 29 | return self.transform(X, y) 30 | 31 | def transform(self, X, y=None): 32 | # Select the examples where the middle column is in our 33 | # preposition set. 
34 | middle_column = X[:, X.shape[1]/2] 35 | ok = np.array([True] * len(X)) 36 | for i,val in enumerate(middle_column): 37 | if val not in self.prepositions: 38 | ok[i] = False 39 | print('in %d out %d' % (len(X), len(X[ok]))) 40 | if y is not None: 41 | return X[ok], y[ok] 42 | else: 43 | return X[ok] 44 | 45 | class TrainingSetRealExamples(): 46 | def __init__(self, seed=17): 47 | self.random_state = np.random.RandomState(seed=seed) 48 | 49 | def fit_transform(self, X, y=None): 50 | evens = [i*2 for i in np.arange(X.shape[0]/2)] 51 | if y is not None: 52 | return X[evens], y[evens] 53 | else: 54 | return X[evens] 55 | 56 | def transform(self, X, y=None): 57 | if y is None: 58 | return X 59 | else: 60 | return X, y 61 | 62 | class RandomPermuter(object): 63 | def __init__(self, seed=17): 64 | self.random_state = np.random.RandomState(seed=seed) 65 | 66 | def fit(self, X, y=None): 67 | pass 68 | 69 | def _transform(self, X, y=None): 70 | X = X.copy() 71 | middle_column_idx = np.int(X.shape[1]/2) 72 | middle_column_values = X[:, middle_column_idx] 73 | random_values = self.random_state.permutation(middle_column_values) 74 | X[:, middle_column_idx] = random_values 75 | if y is None: 76 | return X 77 | else: 78 | return X, y 79 | 80 | class ValidationSetRealExamples(RandomPermuter): 81 | def __init__(self, seed=17): 82 | self.random_state = np.random.RandomState(seed=seed) 83 | 84 | def fit_transform(self, X, y=None): 85 | if y is None: 86 | return X 87 | else: 88 | return X, y 89 | 90 | def transform(self, X, y=None): 91 | evens = [i*2 for i in np.arange(X.shape[0]/2)] 92 | if y is not None: 93 | return X[evens], y[evens] 94 | else: 95 | return X[evens] 96 | 97 | class TrainingSetPrepositionRandomPermuter(RandomPermuter): 98 | def fit_transform(self, X, y=None): 99 | return self._transform(X, y) 100 | 101 | def transform(self, X, y=None): 102 | if y is None: 103 | return X 104 | else: 105 | return X, y 106 | 107 | class ValidationSetPrepositionRandomPermuter(RandomPermuter): 108 | def fit_transform(self, X, y=None): 109 | if y is None: 110 | return X 111 | else: 112 | return X, y 113 | 114 | def transform(self, X, y=None): 115 | return self._transform(X, y) 116 | 117 | class RandomRegularizer(object): 118 | def __init__(self, seed=17): 119 | self.random_state = np.random.RandomState(seed=seed) 120 | 121 | def fit(self, X, y=None): 122 | pass 123 | 124 | def _transform(self, X, y=None): 125 | X = X.copy() 126 | middle_column_idx = np.int(X.shape[1]/2) 127 | middle_column_values = X[:, middle_column_idx] 128 | value_set = list(set(middle_column_values.tolist())) 129 | random_values = [] 130 | for i in np.arange(len(X)): 131 | current_value = middle_column_values[i] 132 | while True: 133 | random_value = self.random_state.choice(value_set) 134 | if random_value != current_value: 135 | random_values.append(random_value) 136 | break 137 | X[:, middle_column_idx] = random_values 138 | if y is None: 139 | return X 140 | else: 141 | return X, y 142 | 143 | class TrainingSetPrepositionRandomRegularizer(RandomRegularizer): 144 | """ 145 | Takes examples in the form of a vector of indices. Replaces each 146 | middle value in each vector with a value from some other example. 
147 | """ 148 | def fit_transform(self, X, y=None): 149 | return self._transform(X, y) 150 | 151 | def transform(self, X, y=None): 152 | if y is None: 153 | return X 154 | else: 155 | return X, y 156 | 157 | class ValidationSetPrepositionRandomRegularizer(RandomRegularizer): 158 | def fit_transform(self, X, y=None): 159 | if y is None: 160 | return X 161 | else: 162 | return X, y 163 | 164 | def transform(self, X, y=None): 165 | return self._transform(X, y) 166 | 167 | class UnconstrainedTrainingSetPrepositionPermuter(object): 168 | def __init__(self, seed=17): 169 | self.random_state = np.random.RandomState(seed=seed) 170 | 171 | def fit(self, X, y=None): 172 | pass 173 | 174 | def fit_transform(self, X, y=None): 175 | X = X.copy() 176 | middle_column_idx = np.int(X.shape[1]/2) 177 | middle_column_values = X[:, middle_column_idx] 178 | random_values = self.random_state.permutation(middle_column_values) 179 | X[:, middle_column_idx] = random_values 180 | if y is None: 181 | return X 182 | else: 183 | return X, y 184 | 185 | def transform(self, X, y=None): 186 | if y is None: 187 | return X 188 | else: 189 | return X, y 190 | 191 | 192 | def real_examples(path): 193 | f = h5py.File(path) 194 | # Target_code is 0 when the preposition in the example is the original 195 | # preposition in the corpus and 1 when the preposition has been randomly 196 | # replaced with another one in the confusion set. 197 | idx = f['target_code'].value == 0 198 | f.close() 199 | return idx 200 | 201 | def random_regularization_examples(path): 202 | f = h5py.File(path) 203 | idx = f['target_code'].value == 1 204 | f.close() 205 | return idx 206 | 207 | class Identity(Layer): 208 | def get_output(self, train): 209 | return self.get_input(train) 210 | 211 | def build_residual_model(args): 212 | graph = Graph() 213 | 214 | graph.add_input('input', input_shape=(args.input_width,), dtype='int') 215 | 216 | graph.add_node(build_embedding_layer(args), name='embedding', input='input') 217 | 218 | graph.add_node(build_convolutional_layer(args), name='conv', input='embedding') 219 | prev_layer = 'conv' 220 | if 'normalization' in args.regularization_layer: 221 | graph.add_node(BatchNormalization(), name='conv_bn', input=prev_layer) 222 | prev_layer = 'conv_bn' 223 | graph.add_node(Activation('relu'), name='conv_relu', input=prev_layer) 224 | 225 | graph.add_node(build_pooling_layer(args), name='pool', input='conv_relu') 226 | 227 | graph.add_node(Flatten(), name='flatten', input='pool') 228 | prev_layer = 'flatten' 229 | 230 | # Add two dense layers. 231 | for i in range(2): 232 | layer_name = 'dense%02d' %i 233 | l = build_dense_layer(args, n_hidden=args.n_filters) 234 | graph.add_node(l, name=layer_name, input=prev_layer) 235 | prev_layer = layer_name 236 | if 'normalization' in args.regularization_layer: 237 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 238 | prev_layer = layer_name+'bn' 239 | if 'dropout' in args.regularization_layer: 240 | graph.add_node(Dropout(args.dropout_p), name=layer_name+'do', input=prev_layer) 241 | prev_layer = layer_name+'do' 242 | 243 | # Add sequence of residual blocks. 244 | for i in range(args.n_residual_blocks): 245 | # Add a fixed number of layers per residual block. 
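# (Annotation) Each block saves its input behind an Identity node, applies
# n_layers_per_residual_block dense layers (with optional batch normalization,
# relu and dropout), and then adds the saved input back in via merge_mode='sum'
# before a final relu -- the identity-shortcut pattern of He et al. (2015),
# http://arxiv.org/abs/1512.03385.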
246 | block_name = '%02d' % i 247 | 248 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer) 249 | prev_layer = block_input_layer = block_name+'input' 250 | 251 | for layer_num in range(args.n_layers_per_residual_block): 252 | layer_name = 'h%s%02d' % (block_name, layer_num) 253 | 254 | l = build_dense_layer(args, n_hidden=args.n_filters) 255 | graph.add_node(l, name=layer_name, input=prev_layer) 256 | prev_layer = layer_name 257 | 258 | if 'normalization' in args.regularization_layer: 259 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 260 | prev_layer = layer_name+'bn' 261 | 262 | if i < args.n_layers_per_residual_block: 263 | a = Activation('relu') 264 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer) 265 | prev_layer = layer_name+'relu' 266 | if 'dropout' in args.regularization_layer: 267 | graph.add_node(Dropout(args.dropout_p), name=layer_name+'do', input=prev_layer) 268 | prev_layer = layer_name+'do' 269 | 270 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum') 271 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output') 272 | prev_layer = block_input_layer = block_name+'relu' 273 | 274 | graph.add_node(build_dense_layer(args, args.n_classes, 275 | activation='softmax'), name='softmax', input=prev_layer) 276 | 277 | graph.add_output(name='output', input='softmax') 278 | 279 | load_weights(args, graph) 280 | 281 | optimizer = build_optimizer(args) 282 | 283 | graph.compile(loss={'output': args.loss}, optimizer=optimizer) 284 | 285 | return graph 286 | 287 | 288 | def build_ordinary_model(args): 289 | model = Sequential() 290 | model.add(build_embedding_layer(args)) 291 | if args.dropout_embedding_p > 0.: 292 | model.add(Dropout(args.dropout_embedding_p)) 293 | model.add(build_convolutional_layer(args)) 294 | if 'normalization' in args.regularization_layer: 295 | model.add(BatchNormalization()) 296 | model.add(Activation('relu')) 297 | if args.dropout_conv_p > 0.: 298 | model.add(Dropout(args.dropout_conv_p)) 299 | 300 | model.add(build_pooling_layer(args)) 301 | model.add(Flatten()) 302 | 303 | for i in range(args.n_fully_connected): 304 | model.add(build_dense_layer(args)) 305 | if 'normalization' in args.regularization_layer: 306 | model.add(BatchNormalization()) 307 | model.add(Activation('relu')) 308 | if 'dropout' in args.regularization_layer: 309 | model.add(Dropout(args.dropout_p)) 310 | 311 | model.add(build_dense_layer(args, args.n_classes, 312 | activation='softmax')) 313 | 314 | load_weights(args, model) 315 | 316 | optimizer = build_optimizer(args) 317 | 318 | model.compile(loss=args.loss, optimizer=optimizer) 319 | 320 | for k,v in json.loads(model.to_json()).items(): 321 | print(k) 322 | if k == 'layers': 323 | for l in v: 324 | print(' => %s' % l['name']) 325 | 326 | return model 327 | 328 | def build_model(args): 329 | np.random.seed(args.seed) 330 | 331 | if isinstance(args.n_residual_blocks, int): 332 | return build_residual_model(args) 333 | else: 334 | return build_ordinary_model(args) 335 | 336 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/run-medium.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | N=10000000 4 | 5 | #--extra-train-file $(ls data/preposition/prepositions-all-new-train-$N/* | grep -v 00.h5) \ 6 | 7 | 
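# Pre-trained word2vec embedding weights; the --model-cfg settings below point
# at them (embedding_weights=...) and keep them frozen (train_embeddings=false).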
embedding_weights=data/preposition/prepositions-all-new-weights.npy 8 | 9 | ./train_keras.py \ 10 | models/preposition/convnet \ 11 | data/preposition/prepositions-all-new-train-$N.h5 \ 12 | data/preposition/prepositions-all-new-validate.h5 \ 13 | XwindowNULL \ 14 | --target-name original_word_code \ 15 | --target-data data/preposition/prepositions-all-new-target-data.json \ 16 | --description "comparing inputs with convnets - input = XwindowNULL, target = original_word_code, contrasting, $N training examples, Adagrad, n_filters=500 , n_hidden=1000, n_word_dims=300 (pre-trained, frozen), 3 hidden layers, shuffled data" \ 17 | --n-vocab 83064 \ 18 | --model-cfg optimizer=Adagrad regularization_layer="" patience=10 n_filters=500 n_hidden=1000 n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false \ 19 | --n-validation 20000 \ 20 | --classification-report \ 21 | --shuffle \ 22 | --n-epochs 10 \ 23 | --log 24 | 25 | ./train_keras.py \ 26 | models/preposition/convnet \ 27 | data/preposition/prepositions-all-new-train-$N.h5 \ 28 | data/preposition/prepositions-all-new-validate.h5 \ 29 | XwindowNULL X \ 30 | --target-name original_word_code \ 31 | --target-data data/preposition/prepositions-all-new-target-data.json \ 32 | --description "comparing inputs with convnets - input = XwindowNULL X, target = original_word_code, contrasting, $N training examples, Adagrad, n_filters=500 , n_hidden=1000, n_word_dims=300 (pre-trained, frozen), 3 hidden layers, shuffled data" \ 33 | --n-vocab 83064 \ 34 | --model-cfg optimizer=Adagrad regularization_layer="" patience=10 n_filters=500 n_hidden=1000 n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false \ 35 | --n-validation 20000 \ 36 | --classification-report \ 37 | --shuffle \ 38 | --n-epochs 10 \ 39 | --log 40 | 41 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/run-small.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -xe 2 | 3 | N=1000000 4 | 5 | embedding_weights=data/preposition/prepositions-all-new-weights.npy 6 | 7 | function train() { 8 | n_filters=$1 9 | shift 10 | filter_width=$1 11 | shift 12 | features=$@ 13 | 14 | features_name=$(echo $features | sed 's, ,-,g') 15 | dest=$features_name-$n_filters-$filter_width 16 | 17 | ./train_keras.py \ 18 | models/keras/preposition/convnet \ 19 | data/preposition/prepositions-all-new-train-$N-balanced.h5 \ 20 | data/preposition/prepositions-all-new-validate-balanced.h5 \ 21 | $features \ 22 | --model-dest models/keras/preposition/convnet/small/feature-evaluation/$dest \ 23 | --target-name original_word_code \ 24 | --target-data data/preposition/prepositions-all-new-target-data.json \ 25 | --description "comparing inputs with convnets - input = $features, target = original_word_code, contrasting, $N training examples, Adagrad, patience=5, n_filters=$n_filters, filter_width=$filter_width, n_word_dims=300 (pre-trained, frozen), 1 hidden layer, shuffled data" \ 26 | --n-vocab 83064 \ 27 | --model-cfg optimizer=Adagrad regularization_layer="dropout" n_filters=$n_filters n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false filter_width=$filter_width patience=5 \ 28 | --n-validation 20000 \ 29 | --n-epochs 10 \ 30 | --shuffle \ 31 | --log 32 | } 33 | 34 | function xval5() { 35 | features=$@ 36 | for filter_width in 2 3 5 37 | do 38 | for n_filters in 100 39 | do 40 | train $n_filters $filter_width $features 41 | done 42 | done 43 | 
} 44 | 45 | function xval7() { 46 | features=$@ 47 | for filter_width in 2 3 5 7 48 | do 49 | for n_filters in 100 50 | do 51 | train $n_filters $filter_width $features 52 | done 53 | done 54 | } 55 | 56 | function xval9() { 57 | features=$@ 58 | for filter_width in 2 3 5 7 9 59 | do 60 | for n_filters in 100 61 | do 62 | train $n_filters $filter_width $features 63 | done 64 | done 65 | } 66 | 67 | xval5 Xwindow 68 | xval7 Xwindow7 69 | xval9 Xwindow9 70 | 71 | xval5 XwindowNULL X 72 | xval7 Xwindow7NULL X 73 | xval9 Xwindow9NULL X 74 | 75 | xval9 X 76 | 77 | xval5 XwindowNULL 78 | xval7 Xwindow7NULL 79 | xval9 Xwindow9NULL 80 | 81 | xval5 Xwindow X 82 | xval7 Xwindow7 X 83 | xval9 Xwindow9 X 84 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/small/find-best-filter-size/find-best.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get a unique list of the feature names. 4 | for input in $(for file in */model.log; do echo $(dirname $file) | sed 's,-[0-9]$,,'; done | sort | uniq) 5 | do 6 | # For each feature, find the one filter width that yielded the lowest 7 | # validation loss. 8 | for file in ${input}*/model.log 9 | do 10 | echo $input $(dirname $file) $(grep val_acc $file | cat -n | sort -n -r -k17 | tail -1) 11 | done | sort -n -r -k17 | tail -1 12 | done | sort -n -r -k 17 13 | -------------------------------------------------------------------------------- /models/keras/preposition/convnet/small/find-best-filter-size/find-best.txt: -------------------------------------------------------------------------------- 1 | X-100 X-100-5 10 11-25 18:13 root INFO 170s - loss: 1.2008 - acc: 0.6007 - val_loss: 1.2047 - val_acc: 0.5964 2 | 3 | XwindowNULL-100 XwindowNULL-100-3 10 11-25 19:56 root INFO 160s - loss: 1.1836 - acc: 0.6147 - val_loss: 1.2035 - val_acc: 0.6055 4 | XwindowNULL-X-100 XwindowNULL-X-100-5 10 11-25 12:33 root INFO 177s - loss: 1.0559 - acc: 0.6551 - val_loss: 1.0390 - val_acc: 0.6549 5 | Xwindow-X-100 Xwindow-X-100-5 10 11-26 00:28 root INFO 178s - loss: 1.0339 - acc: 0.6663 - val_loss: 1.0215 - val_acc: 0.6684 6 | Xwindow-100 Xwindow-100-3 10 11-25 08:02 root INFO 161s - loss: 0.9886 - acc: 0.6886 - val_loss: 1.0048 - val_acc: 0.6834 7 | 8 | Xwindow7NULL-100 Xwindow7NULL-100-5 10 11-25 21:07 root INFO 135s - loss: 1.1051 - acc: 0.6404 - val_loss: 1.1552 - val_acc: 0.6223 9 | Xwindow7NULL-X-100 Xwindow7NULL-X-100-5 10 11-25 13:54 root INFO 180s - loss: 1.0307 - acc: 0.6636 - val_loss: 1.0158 - val_acc: 0.6630 10 | Xwindow7-X-100 Xwindow7-X-100-7 10 11-26 02:20 root INFO 189s - loss: 0.9867 - acc: 0.6817 - val_loss: 0.9934 - val_acc: 0.6765 11 | Xwindow7-100 Xwindow7-100-5 10 11-25 09:12 root INFO 135s - loss: 0.9192 - acc: 0.7094 - val_loss: 0.9673 - val_acc: 0.6980 12 | 13 | Xwindow9NULL-100 Xwindow9NULL-100-5 10 11-25 22:30 root INFO 204s - loss: 1.0893 - acc: 0.6448 - val_loss: 1.1373 - val_acc: 0.6236 14 | Xwindow9NULL-X-100 Xwindow9NULL-X-100-9 10 11-25 16:56 root INFO 211s - loss: 0.9888 - acc: 0.6783 - val_loss: 1.0049 - val_acc: 0.6674 15 | Xwindow9-X-100 Xwindow9-X-100-9 10 11-26 04:50 root INFO 211s - loss: 0.9618 - acc: 0.6897 - val_loss: 0.9908 - val_acc: 0.6795 16 | Xwindow9-100 Xwindow9-100-7 10 11-25 10:58 root INFO 135s - loss: 0.8795 - acc: 0.7216 - val_loss: 0.9521 - val_acc: 0.7008 17 | -------------------------------------------------------------------------------- /models/keras/preposition/lstm/model.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "regularization_layer": null, 3 | "n_word_dims": 50, 4 | "n_units": 100, 5 | "loss": "categorical_crossentropy", 6 | "patience": 10, 7 | "batch_size": 128, 8 | "optimizer": "SGD", 9 | "learning_rate": 0.001, 10 | "momentum": 0.9, 11 | "decay": 0.0, 12 | "embedding_max_norm": 1000, 13 | "truncate_gradient": -1, 14 | "clipnorm": 0, 15 | "mask_zero": false, 16 | "l2_penalty": 0.0 17 | } 18 | -------------------------------------------------------------------------------- /models/keras/preposition/lstm/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout, Activation, Flatten 5 | from keras.layers.recurrent import LSTM, GRU 6 | from keras.layers.embeddings import Embedding 7 | from keras.constraints import maxnorm 8 | from keras.regularizers import l2 9 | from keras.optimizers import SGD, Adam, RMSprop, Adadelta, Adagrad 10 | 11 | from modeling.layers import ImmutableEmbedding 12 | 13 | def build_model(args): 14 | print("args", vars(args)) 15 | 16 | model = Sequential() 17 | 18 | np.random.seed(args.seed) 19 | 20 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None: 21 | W = np.load(args.embedding_weights) 22 | if args.train_embeddings: 23 | model.add(Embedding(args.n_vocab, args.n_word_dims, 24 | weights=[W], 25 | W_constraint=maxnorm(args.embedding_max_norm))) 26 | else: 27 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims, 28 | weights=[W])) 29 | else: 30 | model.add(Embedding(args.n_vocab, args.n_word_dims, 31 | mask_zero=args.mask_zero, 32 | W_constraint=maxnorm(args.embedding_max_norm))) 33 | 34 | model.add(LSTM(args.n_word_dims, args.n_units, 35 | truncate_gradient=args.truncate_gradient, 36 | return_sequences=True)) 37 | if args.regularization_layer == 'dropout': 38 | model.add(Dropout(0.2)) 39 | #elif args.regularization_layer == 'normalization': 40 | # model.add(BatchNormalization((args.n_filters,))) 41 | 42 | model.add(LSTM(args.n_units, args.n_units, 43 | truncate_gradient=args.truncate_gradient, 44 | return_sequences=True)) 45 | if args.regularization_layer == 'dropout': 46 | model.add(Dropout(0.2)) 47 | #elif args.regularization_layer == 'normalization': 48 | # model.add(BatchNormalization((args.n_filters,))) 49 | 50 | ''' 51 | model.add(LSTM(args.n_units, args.n_units, 52 | truncate_gradient=args.truncate_gradient, 53 | return_sequences=True)) 54 | if args.regularization_layer == 'dropout': 55 | model.add(Dropout(0.2)) 56 | #elif args.regularization_layer == 'normalization': 57 | # model.add(BatchNormalization((args.n_filters,))) 58 | ''' 59 | 60 | model.add(LSTM(args.n_units, args.n_units, 61 | truncate_gradient=args.truncate_gradient, 62 | return_sequences=False)) 63 | if args.regularization_layer == 'dropout': 64 | model.add(Dropout(0.2)) 65 | #elif args.regularization_layer == 'normalization': 66 | # model.add(BatchNormalization((args.n_filters,))) 67 | 68 | model.add(Dense(args.n_units, args.n_classes, 69 | W_regularizer=l2(args.l2_penalty))) 70 | model.add(Activation('softmax')) 71 | 72 | if args.optimizer == 'SGD': 73 | optimizer = SGD(lr=args.learning_rate, 74 | decay=args.decay, momentum=args.momentum, 75 | clipnorm=args.clipnorm) 76 | elif args.optimizer == 'Adam': 77 | optimizer = Adam(clipnorm=args.clipnorm) 78 | elif args.optimizer == 'RMSprop': 79 | optimizer = 
RMSprop(clipnorm=args.clipnorm) 80 | elif args.optimizer == 'Adadelta': 81 | optimizer = Adadelta(clipnorm=args.clipnorm) 82 | elif args.optimizer == 'Adagrad': 83 | optimizer = Adagrad(clipnorm=args.clipnorm) 84 | else: 85 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer)) 86 | 87 | model.compile(loss=args.loss, optimizer=optimizer) 88 | 89 | return model 90 | -------------------------------------------------------------------------------- /models/keras/spelling/convnet/exp03-inputs/op_transpose_n_ops_1_n_errors_per_word_3/analysis.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import modeling.utils 6 | import spelling.baseline 7 | 8 | def mark(words): 9 | return ['^'+w+'$' for w in words] 10 | 11 | def build_index(): 12 | train_hdf5_file = 'data/spelling/experimental/op-transpose-distance-1-errors-per-word-3.h5' 13 | train_h5 = h5py.File(train_hdf5_file) 14 | 15 | train_csv_file = 'data/spelling/experimental/op-transpose-distance-1-errors-per-word-3.csv' 16 | train_df = pd.read_csv(train_csv_file, sep='\t', encoding='utf8') 17 | words = train_df.real_word.tolist() 18 | marked_words = mark(words) 19 | 20 | X_train = train_h5['marked_chars'].value 21 | index_size = np.max(X_train) 22 | i = 0 23 | index = {} 24 | 25 | while len(index) < index_size: 26 | marked_word = marked_words[i] 27 | row = X_train[i] 28 | 29 | for j,idx in enumerate(row): 30 | if idx == 0: 31 | break 32 | index[marked_word[j]] = idx 33 | 34 | i += 1 35 | 36 | return index 37 | 38 | index = build_index() 39 | 40 | model_dir = 'models/keras/spelling/convnet/exp03-inputs/op_transpose_n_ops_1_n_errors_per_word_3' 41 | 42 | df = pd.read_csv('../spelling/data/aspell-dict.csv.gz', sep='\t', encoding='utf8') 43 | words = df.word.tolist() 44 | vocab = set(words) 45 | 46 | lm = spelling.baseline.CharacterLanguageModel('witten-bell', order=3) 47 | lm.fit(words) 48 | 49 | model, model_cfg = modeling.utils.load_model(model_dir, model_weights=True) 50 | 51 | bins = np.arange(0, 1, .1) 52 | outputs = {} 53 | histograms = {} 54 | 55 | for order in range(1, 4): 56 | print('order %d' % order) 57 | generated = [] 58 | # Generate 500k words, controlling for length and excluding those 59 | # that are already in the vocabulary. Only keep the first 100k 60 | # of those that satisfy our requirements. 
61 | for g in lm.generate(order, 500000): 62 | if len(g) < 5 or len(g) > 10: 63 | continue 64 | if g in vocab: 65 | continue 66 | generated.append(g) 67 | if len(generated) == 100000: 68 | break 69 | 70 | marked = mark(generated) 71 | X = np.zeros((len(marked), input_width)) 72 | for i,word in enumerate(marked): 73 | for j,chr in enumerate(word): 74 | X[i,j] = index[chr] 75 | 76 | output = zip(generated, model.predict(X)[:, 1]) 77 | outputs[order] = output 78 | histograms[order] = np.histogram([o[1] for o in output], bins=bins) 79 | -------------------------------------------------------------------------------- /models/keras/spelling/convnet/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_word_dims": 0, 3 | "n_filters": 0, 4 | "filter_width": 0, 5 | "n_fully_connected": 0, 6 | "n_residual_blocks": 0, 7 | 8 | "train_embeddings": true, 9 | "embedding_init": "uniform", 10 | "batch_normalization": true, 11 | 12 | "optimizer": "Adam", 13 | "loss": "categorical_crossentropy", 14 | "l2_penalty": 0.0, 15 | 16 | "dropout_embedding_p": 0.0, 17 | "dropout_conv_p": 0.0, 18 | "dropout_fc_p": 0.0, 19 | 20 | "patience": 1, 21 | "batch_size": 128, 22 | 23 | "embedding_max_norm": 1000, 24 | "filter_max_norm": 1000, 25 | "dense_max_norm": 1000, 26 | "clipnorm": 0, 27 | "border_mode": "valid" 28 | } 29 | -------------------------------------------------------------------------------- /models/keras/spelling/convnet/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.setrecursionlimit(5000) 3 | import json 4 | import h5py 5 | 6 | import numpy as np 7 | 8 | from keras.models import Sequential, Graph 9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 12 | from keras.layers.embeddings import Embedding 13 | from keras.constraints import maxnorm 14 | from keras.regularizers import l2 15 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop 16 | 17 | from modeling.layers import ImmutableEmbedding 18 | from modeling.difference import TemporalDifference 19 | import modeling.data 20 | from modeling.builders import (build_embedding_layer, 21 | build_convolutional_layer, build_pooling_layer, 22 | build_dense_layer, build_optimizer, load_weights) 23 | 24 | class GraphMarshaller(modeling.data.GraphMarshaller): 25 | def marshal(self, data, target=None): 26 | return { 27 | 'input': data, 28 | 'output': target 29 | } 30 | 31 | def unmarshal(self, output): 32 | return output['output'] 33 | 34 | class Identity(Layer): 35 | def get_output(self, train): 36 | return self.get_input(train) 37 | 38 | def build_residual_model(args): 39 | graph = Graph() 40 | 41 | graph.add_input('input', input_shape=(args.input_width,), dtype='int') 42 | 43 | graph.add_node(build_embedding_layer(args), name='embedding', input='input') 44 | 45 | graph.add_node(build_convolutional_layer(args), name='conv', input='embedding') 46 | prev_layer = 'conv' 47 | if args.batch_normalization: 48 | graph.add_node(BatchNormalization(), name='conv_bn', input=prev_layer) 49 | prev_layer = 'conv_bn' 50 | graph.add_node(Activation('relu'), name='conv_relu', input=prev_layer) 51 | 52 | graph.add_node(build_pooling_layer(args), name='pool', input='conv_relu') 53 | 54 | graph.add_node(Flatten(), name='flatten', input='pool') 55 | prev_layer = 'flatten' 56 | 57 | # Add some number 
of fully-connected layers without skip connections. 58 | for i in range(args.n_fully_connected): 59 | layer_name = 'dense%02d' %i 60 | l = build_dense_layer(args, n_hidden=args.n_hidden) 61 | graph.add_node(l, name=layer_name, input=prev_layer) 62 | prev_layer = layer_name 63 | if args.batch_normalization: 64 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 65 | prev_layer = layer_name+'bn' 66 | if args.dropout_fc_p > 0.: 67 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 68 | prev_layer = layer_name+'do' 69 | 70 | # Add sequence of residual blocks. 71 | for i in range(args.n_residual_blocks): 72 | # Add a fixed number of layers per residual block. 73 | block_name = '%02d' % i 74 | 75 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer) 76 | prev_layer = block_input_layer = block_name+'input' 77 | 78 | try: 79 | n_layers_per_residual_block = args.n_layers_per_residual_block 80 | except AttributeError: 81 | n_layers_per_residual_block = 2 82 | 83 | for layer_num in range(n_layers_per_residual_block): 84 | layer_name = 'h%s%02d' % (block_name, layer_num) 85 | 86 | l = build_dense_layer(args, n_hidden=args.n_hidden) 87 | graph.add_node(l, name=layer_name, input=prev_layer) 88 | prev_layer = layer_name 89 | 90 | if args.batch_normalization: 91 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 92 | prev_layer = layer_name+'bn' 93 | 94 | if i < n_layers_per_residual_block: 95 | a = Activation('relu') 96 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer) 97 | prev_layer = layer_name+'relu' 98 | if args.dropout_fc_p > 0.: 99 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 100 | prev_layer = layer_name+'do' 101 | 102 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum') 103 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output') 104 | prev_layer = block_input_layer = block_name+'relu' 105 | 106 | graph.add_node(build_dense_layer(args, args.n_classes, 107 | activation='softmax'), name='softmax', input=prev_layer) 108 | 109 | graph.add_output(name='output', input='softmax') 110 | 111 | load_weights(args, graph) 112 | 113 | optimizer = build_optimizer(args) 114 | 115 | graph.compile(loss={'output': args.loss}, optimizer=optimizer) 116 | 117 | return graph 118 | 119 | def build_ordinary_model(args): 120 | model = Sequential() 121 | model.add(build_embedding_layer(args)) 122 | if args.dropout_embedding_p > 0.: 123 | model.add(Dropout(args.dropout_embedding_p)) 124 | model.add(build_convolutional_layer(args)) 125 | if args.batch_normalization: 126 | model.add(BatchNormalization()) 127 | model.add(Activation('relu')) 128 | if args.dropout_conv_p > 0.: 129 | model.add(Dropout(args.dropout_conv_p)) 130 | 131 | model.add(build_pooling_layer(args)) 132 | model.add(Flatten()) 133 | 134 | for i in range(args.n_fully_connected): 135 | model.add(build_dense_layer(args)) 136 | if args.batch_normalization: 137 | model.add(BatchNormalization()) 138 | model.add(Activation('relu')) 139 | if args.dropout_fc_p > 0.: 140 | model.add(Dropout(args.dropout_fc_p)) 141 | 142 | model.add(build_dense_layer(args, args.n_classes, 143 | activation='softmax')) 144 | 145 | load_weights(args, model) 146 | 147 | optimizer = build_optimizer(args) 148 | 149 | model.compile(loss=args.loss, optimizer=optimizer) 150 | 151 | if args.verbose: 152 | for k,v in 
json.loads(model.to_json()).items(): 153 | if k == 'layers': 154 | for l in v: 155 | print(' => %s' % l['name']) 156 | 157 | return model 158 | 159 | def build_model(args): 160 | np.random.seed(args.seed) 161 | 162 | if args.n_residual_blocks > 0: 163 | return build_residual_model(args) 164 | else: 165 | return build_ordinary_model(args) 166 | 167 | -------------------------------------------------------------------------------- /models/keras/spelling/correction/isolated/binary/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_word_dims": 0, 3 | "n_filters": 0, 4 | "filter_width": 0, 5 | "n_fully_connected": 0, 6 | "n_residual_blocks": 0, 7 | 8 | "train_embeddings": true, 9 | "embedding_init": "uniform", 10 | "batch_normalization": true, 11 | 12 | "optimizer": "Adam", 13 | "loss": "categorical_crossentropy", 14 | "l2_penalty": 0.0, 15 | 16 | "dropout_embedding_p": 0.0, 17 | "dropout_conv_p": 0.0, 18 | "dropout_fc_p": 0.0, 19 | 20 | "patience": 1, 21 | "batch_size": 128, 22 | 23 | "embedding_max_norm": 1000, 24 | "filter_max_norm": 1000, 25 | "dense_max_norm": 1000, 26 | "clipnorm": 0, 27 | "border_mode": "valid" 28 | } 29 | -------------------------------------------------------------------------------- /models/keras/spelling/correction/isolated/binary/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import h5py 3 | sys.setrecursionlimit(5000) 4 | import json 5 | import h5py 6 | 7 | from sklearn.utils import check_random_state 8 | 9 | import numpy as np 10 | 11 | from keras.models import Sequential, Graph 12 | from keras.utils import np_utils 13 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer 14 | from keras.layers.normalization import BatchNormalization 15 | 16 | import modeling.data 17 | from modeling.builders import (build_embedding_layer, 18 | build_convolutional_layer, build_pooling_layer, 19 | build_dense_layer, build_optimizer, load_weights, 20 | build_hierarchical_softmax_layer) 21 | from modeling.utils import balanced_class_weights 22 | 23 | class SingleFileDataset(object): 24 | def __init__(self, file_path, data_name, target_name, batch_size, random_state=17): 25 | assert isinstance(data_name, (list,tuple)) 26 | assert isinstance(target_name, (list,tuple)) 27 | 28 | random_state = check_random_state(random_state) 29 | 30 | self.__dict__.update(locals()) 31 | del self.self 32 | 33 | self.load_data() 34 | 35 | def load_data(self): 36 | self.data = {} 37 | self.target = {} 38 | self.target_one_hot = {} 39 | 40 | f = h5py.File(self.file_path) 41 | self.n = None 42 | 43 | for data_name in self.data_name: 44 | self.data[data_name] = f[data_name].value 45 | if self.n is None: 46 | self.n = len(self.data[data_name]) 47 | else: 48 | assert len(self.data[data_name]) == self.n 49 | for target_name in self.target_name: 50 | target = f[target_name].value 51 | assert len(target) == self.n 52 | 53 | self.target[target_name] = target 54 | n_classes = np.max(target) + 1 55 | self.target_one_hot[target_name] = np_utils.to_categorical(target, n_classes) 56 | f.close() 57 | 58 | def get_dict(self, one_hot=True): 59 | d = {} 60 | for data_name in self.data_name: 61 | d[data_name] = self.data[data_name] 62 | for target_name in self.target_name: 63 | if one_hot: 64 | d[target_name] = self.target_one_hot[target_name] 65 | else: 66 | d[target_name] = self.target[target_name] 67 | return d 68 | 69 | def class_weights(self, class_weight_exponent): 70 | return 
balanced_class_weights( 71 | self.target['binary_target'], 72 | 2, 73 | class_weight_exponent) 74 | 75 | def generate(self): 76 | while 1: 77 | idx = self.random_state.choice(self.n, size=self.batch_size, replace=False) 78 | batch = {} 79 | for data_name in self.data_name: 80 | batch[data_name] = self.data[data_name][idx] 81 | for target_name in self.target_name: 82 | batch[target_name] = self.target_one_hot[target_name][idx] 83 | yield batch 84 | 85 | class Identity(Layer): 86 | def get_output(self, train): 87 | return self.get_input(train) 88 | 89 | def add_bn_relu(graph, args, prev_layer): 90 | bn_name = prev_layer + '_bn' 91 | relu_name = prev_layer + '_relu' 92 | if args.batch_normalization: 93 | graph.add_node(BatchNormalization(), name=bn_name, input=prev_layer) 94 | prev_layer = bn_name 95 | graph.add_node(Activation('relu'), name=relu_name, input=prev_layer) 96 | return relu_name 97 | 98 | def build_model(args, train_data, validation_data): 99 | np.random.seed(args.seed) 100 | 101 | graph = Graph() 102 | 103 | non_word_input = 'non_word_marked_chars' 104 | real_word_input = 'real_word_marked_chars' 105 | 106 | non_word_input_width = train_data.data[non_word_input].shape[1] 107 | real_word_input_width = train_data.data[real_word_input].shape[1] 108 | 109 | print('non_word_input_width', non_word_input_width) 110 | print('real_word_input_width', real_word_input_width) 111 | 112 | graph.add_input(non_word_input, input_shape=(non_word_input_width,), dtype='int') 113 | graph.add_node(build_embedding_layer(args, input_width=non_word_input_width), 114 | name='non_word_embedding', input=non_word_input) 115 | graph.add_node(build_convolutional_layer(args), name='non_word_conv', input='non_word_embedding') 116 | non_word_prev_layer = add_bn_relu(graph, args, 'non_word_conv') 117 | graph.add_node(build_pooling_layer(args, input_width=non_word_input_width), 118 | name='non_word_pool', input=non_word_prev_layer) 119 | graph.add_node(Flatten(), name='non_word_flatten', input='non_word_pool') 120 | 121 | graph.add_input(real_word_input, input_shape=(real_word_input_width,), dtype='int') 122 | graph.add_node(build_embedding_layer(args, input_width=real_word_input_width), 123 | name='real_word_embedding', input=real_word_input) 124 | graph.add_node(build_convolutional_layer(args), name='real_word_conv', input='real_word_embedding') 125 | real_word_prev_layer = add_bn_relu(graph, args, 'real_word_conv') 126 | graph.add_node(build_pooling_layer(args, input_width=real_word_input_width), 127 | name='real_word_pool', input=real_word_prev_layer) 128 | graph.add_node(Flatten(), name='real_word_flatten', input='real_word_pool') 129 | 130 | # Add some number of fully-connected layers without skip connections. 131 | prev_layer = 'join_non_and_real' 132 | for i in range(args.n_fully_connected): 133 | layer_name = 'dense%02d' %i 134 | l = build_dense_layer(args, n_hidden=args.n_hidden) 135 | if i == 0: 136 | graph.add_node(l, name=layer_name, 137 | inputs=['non_word_flatten', 'real_word_flatten']) 138 | else: 139 | graph.add_node(l, name=layer_name, input=prev_layer) 140 | prev_layer = layer_name 141 | if args.batch_normalization: 142 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 143 | prev_layer = layer_name+'bn' 144 | if args.dropout_fc_p > 0.: 145 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 146 | prev_layer = layer_name+'do' 147 | 148 | # Add sequence of residual blocks. 
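# The residual stack below is optional: args.n_residual_blocks controls how
# many blocks are added, and the number of dense layers inside each block is
# taken from args.n_layers_per_residual_block, falling back to 2 when that
# attribute is not set (see the try/except below).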
149 | for i in range(args.n_residual_blocks): 150 | # Add a fixed number of layers per residual block. 151 | block_name = '%02d' % i 152 | 153 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer) 154 | prev_layer = block_input_layer = block_name+'input' 155 | 156 | try: 157 | n_layers_per_residual_block = args.n_layers_per_residual_block 158 | except AttributeError: 159 | n_layers_per_residual_block = 2 160 | 161 | for layer_num in range(n_layers_per_residual_block): 162 | layer_name = 'h%s%02d' % (block_name, layer_num) 163 | 164 | l = build_dense_layer(args, n_hidden=args.n_hidden) 165 | graph.add_node(l, name=layer_name, input=prev_layer) 166 | prev_layer = layer_name 167 | 168 | if args.batch_normalization: 169 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 170 | prev_layer = layer_name+'bn' 171 | 172 | if i < n_layers_per_residual_block: 173 | a = Activation('relu') 174 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer) 175 | prev_layer = layer_name+'relu' 176 | if args.dropout_fc_p > 0.: 177 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 178 | prev_layer = layer_name+'do' 179 | 180 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum') 181 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output') 182 | prev_layer = block_input_layer = block_name+'relu' 183 | 184 | #if hasattr(args, 'n_hsm_classes'): 185 | # graph.add_node(build_hierarchical_softmax_layer(args), 186 | # name='softmax', input=prev_layer) 187 | #else: 188 | 189 | graph.add_node(build_dense_layer(args, 2, 190 | activation='softmax'), name='softmax', input=prev_layer) 191 | 192 | graph.add_output(name='binary_target', input='softmax') 193 | 194 | load_weights(args, graph) 195 | 196 | optimizer = build_optimizer(args) 197 | 198 | graph.compile(loss={'binary_target': args.loss}, optimizer=optimizer) 199 | 200 | return graph 201 | 202 | def load_train(args, model_cfg): 203 | return SingleFileDataset( 204 | args.train_path, 205 | args.data_name, [args.target_name], 206 | model_cfg.batch_size, args.seed) 207 | 208 | def load_validation(args, model_cfg): 209 | return SingleFileDataset( 210 | args.validation_path, 211 | args.data_name, [args.target_name], 212 | model_cfg.batch_size, args.seed) 213 | 214 | def fit_model(graph, train_data, validation_data, args, callbacks=[]): 215 | graph.fit_generator(train_data.generate(), 216 | samples_per_epoch=int(train_data.n/100), 217 | nb_epoch=args.n_epochs, 218 | validation_data=validation_data.get_dict(), 219 | callbacks=callbacks, 220 | class_weight=train_data.class_weights(args.class_weight_exponent)) 221 | 222 | #fit_generator(generator, samples_per_epoch, nb_epoch, verbose=1, callbacks=[], validation_data=None, 223 | # nb_val_samples=None, class_weight={}, nb_worker=1, nb_val_worker=None) 224 | 225 | -------------------------------------------------------------------------------- /models/keras/spelling/correction/isolated/multiclass/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_word_dims": 0, 3 | "n_filters": 0, 4 | "filter_width": 0, 5 | "n_fully_connected": 0, 6 | "n_residual_blocks": 0, 7 | 8 | "train_embeddings": true, 9 | "embedding_init": "uniform", 10 | "batch_normalization": true, 11 | 12 | "optimizer": "Adam", 13 | "loss": "categorical_crossentropy", 14 | "l2_penalty": 0.0, 15 | 16 | "dropout_embedding_p": 0.0, 17 
| "dropout_conv_p": 0.0, 18 | "dropout_fc_p": 0.0, 19 | 20 | "n_classes": 119774, 21 | "patience": 1, 22 | "batch_size": 128, 23 | 24 | "embedding_max_norm": 1000, 25 | "filter_max_norm": 1000, 26 | "dense_max_norm": 1000, 27 | "clipnorm": 0, 28 | "border_mode": "valid" 29 | } 30 | -------------------------------------------------------------------------------- /models/keras/spelling/correction/isolated/multiclass/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import h5py 3 | sys.setrecursionlimit(5000) 4 | import json 5 | import h5py 6 | 7 | from sklearn.utils import check_random_state 8 | 9 | import numpy as np 10 | 11 | from keras.models import Sequential, Graph 12 | from keras.utils import np_utils 13 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer 14 | from keras.layers.normalization import BatchNormalization 15 | 16 | import modeling.data 17 | from modeling.builders import (build_embedding_layer, 18 | build_convolutional_layer, build_pooling_layer, 19 | build_dense_layer, build_optimizer, load_weights, 20 | build_hierarchical_softmax_layer) 21 | from modeling.utils import balanced_class_weights 22 | 23 | class HDF5FileDataset(object): 24 | def __init__(self, file_path, data_name, target_name, batch_size, one_hot=True, random_state=17): 25 | assert isinstance(data_name, (list,tuple)) 26 | assert isinstance(target_name, (list,tuple)) 27 | 28 | random_state = check_random_state(random_state) 29 | 30 | self.__dict__.update(locals()) 31 | del self.self 32 | 33 | self._load_data() 34 | self._check_data() 35 | 36 | def _load_data(self): 37 | self.hdf5_file = h5py.File(self.file_path) 38 | self.n_classes = {} 39 | for target_name in self.target_name: 40 | self.n_classes[target_name] = np.max(self.hdf5_file[target_name])+1 41 | 42 | def _check_data(self): 43 | self.n = None 44 | for data_name in self.data_name: 45 | if self.n is None: 46 | self.n = len(self.hdf5_file[data_name]) 47 | else: 48 | assert len(self.hdf5_file[data_name]) == self.n 49 | for target_name in self.target_name: 50 | assert len(self.hdf5_file[target_name]) == self.n 51 | 52 | def __getitem__(self, name): 53 | return self.hdf5_file[name].value 54 | 55 | def class_weights(self, class_weight_exponent, target='multiclass_correction_target'): 56 | return balanced_class_weights( 57 | self.hdf5_file[target], 58 | 2, 59 | class_weight_exponent) 60 | 61 | def generator(self, one_hot=None, batch_size=None): 62 | if one_hot is None: one_hot = self.one_hot 63 | if batch_size is None: batch_size = self.batch_size 64 | 65 | while 1: 66 | idx = self.random_state.choice(self.n, size=batch_size, replace=False) 67 | batch = {} 68 | for data_name in self.data_name: 69 | batch[data_name] = self.hdf5_file[data_name].value[idx] 70 | for target_name in self.target_name: 71 | target = self.hdf5_file[target_name].value[idx] 72 | if one_hot: 73 | batch[target_name] = np_utils.to_categorical(target, 74 | self.n_classes[target_name]) 75 | else: 76 | batch[target_name] = target 77 | 78 | yield batch 79 | 80 | class Identity(Layer): 81 | def get_output(self, train): 82 | return self.get_input(train) 83 | 84 | def add_bn_relu(graph, args, prev_layer): 85 | bn_name = prev_layer + '_bn' 86 | relu_name = prev_layer + '_relu' 87 | if args.batch_normalization: 88 | graph.add_node(BatchNormalization(), name=bn_name, input=prev_layer) 89 | prev_layer = bn_name 90 | graph.add_node(Activation('relu'), name=relu_name, input=prev_layer) 91 | return relu_name 92 | 93 | def 
build_model(args, train_data): 94 | np.random.seed(args.seed) 95 | 96 | graph = Graph() 97 | 98 | non_word_input = 'non_word_marked_chars' 99 | non_word_input_width = train_data[non_word_input].shape[1] 100 | 101 | graph.add_input(non_word_input, input_shape=(non_word_input_width,), dtype='int') 102 | graph.add_node(build_embedding_layer(args, input_width=non_word_input_width), 103 | name='non_word_embedding', input=non_word_input) 104 | graph.add_node(build_convolutional_layer(args), name='non_word_conv', input='non_word_embedding') 105 | non_word_prev_layer = add_bn_relu(graph, args, 'non_word_conv') 106 | graph.add_node(build_pooling_layer(args, input_width=non_word_input_width), 107 | name='non_word_pool', input=non_word_prev_layer) 108 | graph.add_node(Flatten(), name='non_word_flatten', input='non_word_pool') 109 | 110 | # Add some number of fully-connected layers without skip connections. 111 | prev_layer = 'non_word_flatten' 112 | for i in range(args.n_fully_connected): 113 | layer_name = 'dense%02d' %i 114 | l = build_dense_layer(args, n_hidden=args.n_hidden) 115 | graph.add_node(l, name=layer_name, input=prev_layer) 116 | prev_layer = layer_name 117 | if args.batch_normalization: 118 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 119 | prev_layer = layer_name+'bn' 120 | if args.dropout_fc_p > 0.: 121 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 122 | prev_layer = layer_name+'do' 123 | 124 | # Add sequence of residual blocks. 125 | for i in range(args.n_residual_blocks): 126 | # Add a fixed number of layers per residual block. 127 | block_name = '%02d' % i 128 | 129 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer) 130 | prev_layer = block_input_layer = block_name+'input' 131 | 132 | try: 133 | n_layers_per_residual_block = args.n_layers_per_residual_block 134 | except AttributeError: 135 | n_layers_per_residual_block = 2 136 | 137 | for layer_num in range(n_layers_per_residual_block): 138 | layer_name = 'h%s%02d' % (block_name, layer_num) 139 | 140 | l = build_dense_layer(args, n_hidden=args.n_hidden) 141 | graph.add_node(l, name=layer_name, input=prev_layer) 142 | prev_layer = layer_name 143 | 144 | if args.batch_normalization: 145 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer) 146 | prev_layer = layer_name+'bn' 147 | 148 | if i < n_layers_per_residual_block: 149 | a = Activation('relu') 150 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer) 151 | prev_layer = layer_name+'relu' 152 | if args.dropout_fc_p > 0.: 153 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer) 154 | prev_layer = layer_name+'do' 155 | 156 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum') 157 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output') 158 | prev_layer = block_input_layer = block_name+'relu' 159 | 160 | n_classes = np.max(train_data['multiclass_correction_target']) + 1 161 | if hasattr(args, 'n_hsm_classes'): 162 | graph.add_node(build_hierarchical_softmax_layer(args), 163 | name='softmax', input=prev_layer) 164 | else: 165 | graph.add_node(build_dense_layer(args, n_classes, 166 | activation='softmax'), name='softmax', input=prev_layer) 167 | 168 | graph.add_output(name='multiclass_correction_target', input='softmax') 169 | 170 | load_weights(args, graph) 171 | 172 | optimizer = build_optimizer(args) 173 | 174 | 
graph.compile(loss={'multiclass_correction_target': args.loss}, optimizer=optimizer) 175 | 176 | return graph 177 | 178 | def fit(config, callbacks=[]): 179 | train_data = HDF5FileDataset( 180 | config.train_path, 181 | config.data_name, 182 | [config.target_name], 183 | config.batch_size, 184 | config.seed) 185 | 186 | validation_data = HDF5FileDataset( 187 | config.validation_path, 188 | config.data_name, 189 | [config.target_name], 190 | config.batch_size, 191 | config.seed) 192 | 193 | graph = build_model(config, train_data) 194 | 195 | graph.fit_generator(train_data.generator(), 196 | samples_per_epoch=int(train_data.n/100), 197 | nb_epoch=config.n_epochs, 198 | validation_data=validation_data.generator(), 199 | nb_val_samples=10000, 200 | callbacks=callbacks, 201 | class_weight=train_data.class_weights(config.class_weight_exponent)) 202 | -------------------------------------------------------------------------------- /models/keras/spelling/toksents.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | from data import data 6 | import marshal 7 | 8 | sent_file = sys.argv[1] 9 | d = data.load_data(sent_file) 10 | token_seq = data.tokenize(d) 11 | marshal_file = os.path.splitext(sent_file)[0] + '.marshal' 12 | marshal.dump(token_seq, open(marshal_file, 'w')) 13 | print('DONE ' + sent_file) 14 | -------------------------------------------------------------------------------- /models/lasagne/spelling/convnet/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_word_dims": 50, 3 | "use_difference": false, 4 | "n_filters": 1000, 5 | "filter_width": 4, 6 | "loss": "categorical_crossentropy", 7 | "patience": 400, 8 | "batch_size": 128, 9 | "optimizer": "Adagrad", 10 | "learning_rate": 0.1, 11 | "momentum": 0.9, 12 | "decay": 0.0, 13 | "embedding_max_norm": 1000, 14 | "filter_max_norm": 1000, 15 | "dense_max_norm": 1000, 16 | "l2_penalty": 0.0, 17 | "clipnorm": 0, 18 | "regularization_layer": "dropout", 19 | "dropout_p_conv": 0.1, 20 | "dropout_p": 0.5 21 | } 22 | -------------------------------------------------------------------------------- /models/lasagne/spelling/convnet/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import theano 9 | import theano.tensor as T 10 | 11 | import modeling.lasagne_model 12 | import lasagne 13 | 14 | class Model(modeling.lasagne_model.Classifier): 15 | def build_input_var(self): 16 | return T.imatrix('inputs') 17 | 18 | def build_target_var(self): 19 | return T.ivector('targets') 20 | 21 | def build_updates(self): 22 | return lasagne.updates.nesterov_momentum( 23 | self.train_loss, self.params, 24 | learning_rate=0.01, momentum=0.9) 25 | 26 | def build_model(self): 27 | # Input layer 28 | input_shape = (self.config.batch_size, self.config.input_width) 29 | print('input_shape', input_shape) 30 | model = lasagne.layers.InputLayer(shape=input_shape, 31 | input_var=self.input_var) 32 | 33 | # Embedding layer 34 | model = lasagne.layers.EmbeddingLayer(model, 35 | self.config.n_vocab, self.config.n_word_dims) 36 | 37 | # Convolutional layer 38 | model = lasagne.layers.Conv1DLayer(model, 39 | num_filters=self.config.n_filters, 40 | filter_size=self.config.filter_width, 41 | nonlinearity=lasagne.nonlinearities.rectify, 42 | W=lasagne.init.GlorotUniform()) 43 | 
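# The max-pooling layer that follows uses a window derived from input_width
# and filter_width; the intent appears to be to pool over (nearly) the whole
# convolution output so that each filter contributes a single feature to the
# dense layers.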
44 | print('pool_size', self.config.input_width-self.config.filter_width-1) 45 | 46 | # Max-pooling layer 47 | model = lasagne.layers.MaxPool1DLayer(model, 48 | pool_size=self.config.input_width-self.config.filter_width-1) 49 | 50 | # Flatten layer 51 | #model = lasagne.layers.FlattenLayer(model) 52 | 53 | # Fully-connected layer 54 | model = lasagne.layers.DenseLayer( 55 | lasagne.layers.dropout(model, p=.0), 56 | num_units=self.config.n_filters*2, 57 | nonlinearity=lasagne.nonlinearities.rectify) 58 | 59 | # Output layer 60 | model = lasagne.layers.DenseLayer( 61 | lasagne.layers.dropout(model, p=.5), 62 | num_units=self.config.n_classes, 63 | nonlinearity=lasagne.nonlinearities.softmax) 64 | 65 | return model 66 | -------------------------------------------------------------------------------- /notebooks/ConvnetSensitivityAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "np.set_printoptions(precision=3)\n", 13 | "np.set_printoptions(suppress=True)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import itertools\n", 25 | "\n", 26 | "def powerset(iterable):\n", 27 | " s = list(iterable)\n", 28 | " return itertools.chain.from_iterable(\n", 29 | " itertools.combinations(s, r) for r in range(len(s)+1))" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import json\n", 41 | "import pandas as pd\n", 42 | "from sklearn.metrics import precision_recall_fscore_support\n", 43 | "import modeling.utils" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# For scikit learn metrics.\n", 55 | "precision_recall_average = 'macro'" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# Best so far, but imbalanced.\n", 67 | "model_dir = 'models/keras/preposition/convnet/20a7a6b088ee11e5b2b374d435ed6f3a/'\n", 68 | "\n", 69 | "# Balanced.\n", 70 | "# model_dir = 'models/keras/preposition/convnet/balanced/'\n", 71 | "\n", 72 | "# Load the test set for evaluation.\n", 73 | "data_file = 'data/preposition/prepositions-all-new-test.h5'" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Loading weights (build_model)\n", 88 | "Loading weights\n" 89 | ] 90 | }, 91 | { 92 | "name": "stderr", 93 | "output_type": "stream", 94 | "text": [ 95 | "Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "model, model_cfg = modeling.utils.load_model(model_dir, load_weights=True)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 8, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "[(999552, 5), (999552, 52)]\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | 
"model_data = modeling.utils.load_all_model_data(data_file, model_cfg)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# Load target data or metadata (e.g. mapping between numeric target variable and preposition).\n", 131 | "target_data_file = 'data/preposition/prepositions-all-new-target-data.json'\n", 132 | "target_data = json.load(open(target_data_file))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "def compute_n_unknown_words():\n", 144 | " n_unknown_words = np.zeros_like(model_data.len)\n", 145 | " for i in np.arange(0, len(model_data.len)):\n", 146 | " n_unknown_words[i] = len(np.where(model_data.data[i, 0:model_data.len[i]] == 0)[0])\n", 147 | " return n_unknown_words" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Sensitivity analysis of effect of position of unknown words in window around preposition\n", 155 | "=======\n", 156 | "1. Take all examples in which the window around the preposition contains no unknown words.\n", 157 | "2. For each set in the powerset of positions in the window (excluding the center, where the preposition occurs):\n", 158 | " 1. Set the words in that position to be unknown (i.e. assign 0 to that position) for all examples.\n", 159 | " 2. Run the examples through the model.\n", 160 | "3. Evaluate the model's performance." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "def sensitivity_analysis(n=50000):\n", 172 | " n_unknown_words = compute_n_unknown_words()\n", 173 | "\n", 174 | " print('# of examples ' + str(len(model_data.data)))\n", 175 | " print('# of examples with no unknown words ' + str((n_unknown_words==0).sum()))\n", 176 | " \n", 177 | " error_detection_targets = np.ones_like(model_data.current_word_code)\n", 178 | " evens = np.arange(0, len(model_data.target), 2)\n", 179 | " error_detection_targets[evens] = 0\n", 180 | "\n", 181 | " no_unknown_words_data = model_data.data[n_unknown_words == 0]\n", 182 | " no_unknown_words_correction_targets = model_data.target[n_unknown_words == 0]\n", 183 | " no_unknown_words_detection_targets = error_detection_targets[n_unknown_words == 0]\n", 184 | "\n", 185 | " window_size = 5\n", 186 | " center = 2\n", 187 | "\n", 188 | " assert len(np.where(model_data.data[:, center] == 0)[0]) == 0\n", 189 | "\n", 190 | " indices_in_window = [center-2, center-1, center+1, center+2]\n", 191 | "\n", 192 | " masks = [mask for mask in powerset(indices_in_window)]\n", 193 | "\n", 194 | " correction_results = {}\n", 195 | " \n", 196 | " results_df = None\n", 197 | "\n", 198 | " for mask in masks:\n", 199 | " data = no_unknown_words_data.copy()[0:n]\n", 200 | " mask = np.array(mask, dtype=int)\n", 201 | "\n", 202 | " data[:, mask] = 0\n", 203 | "\n", 204 | " for i in np.arange(len(data)):\n", 205 | " data[i, mask + model_data.position[i] + 3] = 0\n", 206 | "\n", 207 | " no_unknown_words_correction_preds = model.predict_classes(data, verbose=0)\n", 208 | "\n", 209 | " unknowns_str = ['_'] * (len(indices_in_window) + 1)\n", 210 | " for x in mask:\n", 211 | " unknowns_str[x] = \"?\"\n", 212 | " unknowns_str[center] = \"P\"\n", 213 | "\n", 214 | " # Error correction\n", 215 | " p, 
r, f, _ = precision_recall_fscore_support(\n", 216 | " no_unknown_words_correction_targets[0:n],\n", 217 | " no_unknown_words_correction_preds,\n", 218 | " average=precision_recall_average)\n", 219 | " \n", 220 | " row = pd.DataFrame({\n", 221 | " \"pos-2\": [unknowns_str[0]],\n", 222 | " \"pos-1\": [unknowns_str[1]],\n", 223 | " \"pos-0\": [unknowns_str[2]],\n", 224 | " \"pos+1\": [unknowns_str[3]],\n", 225 | " \"pos+2\": [unknowns_str[4]],\n", 226 | " \"precision\": [p],\n", 227 | " \"recall\": [r],\n", 228 | " \"f1\": [f],\n", 229 | " \"n\": [n]\n", 230 | " })\n", 231 | " if results_df is None:\n", 232 | " results_df = row\n", 233 | " else:\n", 234 | " results_df = pd.concat([results_df, row])\n", 235 | "\n", 236 | " results_df = results_df[[\"pos-2\", \"pos-1\", \"pos-0\", \"pos+1\", \"pos+2\", \"precision\", \"recall\", \"f1\", \"n\"]]\n", 237 | " print(results_df.to_latex(index=False, float_format=lambda f: '%.02f' % f))\n", 238 | " \n", 239 | "sensitivity_analysis(n=10000)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 2", 255 | "language": "python", 256 | "name": "python2" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 2 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython2", 268 | "version": "2.7.10" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 0 273 | } 274 | -------------------------------------------------------------------------------- /notebooks/notes.txt: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------- 2 | Given our current architecture, how helpful is increasing the training set size? 
3 | --------------------------------------------------------------------------- 4 | 5 | Embedding size: 50 6 | Number of convolutional filters: 500 7 | Filter width: 4 8 | Max filter norm: 1 (might be too small, considering filter width) 9 | Hidden fully-connected layers: 3 10 | Fully-connected layer sizes: 1000, 1000, 500 11 | Learning rate: 0.03 12 | Momentum: 0.9 13 | Weight decay: 0 14 | 15 | Train on 1m, validate on 200k: 16 | 17 | acc: 0.1336 - val_acc: 0.2124 - val_f1: 0.12 18 | acc: 0.2628 - val_acc: 0.3049 - val_f1: 0.21 19 | acc: 0.3093 - val_acc: 0.3476 - val_f1: 0.26 20 | acc: 0.3571 - val_acc: 0.3972 - val_f1: 0.33 21 | acc: 0.3913 - val_acc: 0.4225 - val_f1: 0.36 22 | acc: 0.4091 - val_acc: 0.4312 - val_f1: 0.38 23 | 24 | Train on 2m, validate on 200k: 25 | 26 | acc: 0.1909 - val_acc: 0.3048 - val_f1: 0.22 27 | acc: 0.3421 - val_acc: 0.4070 - val_f1: 0.34 28 | acc: 0.4028 - val_acc: 0.4362 - val_f1: 0.38 29 | acc: 0.4208 - val_acc: 0.4464 - val_f1: 0.40 30 | acc: 0.4293 - val_acc: 0.4522 - val_f1: 0.40 31 | 32 | Train on 4m, validate on 200k: 33 | 34 | acc: 0.2616 - val_acc: 0.4063 - val_f1: 0.25 35 | acc: 0.4095 - val_acc: 0.4456 - val_f1: 0.33 36 | acc: 0.4315 - val_acc: 0.4573 - val_f1: 0.35 37 | acc: 0.4398 - val_acc: 0.4630 - val_f1: 0.37 38 | acc: 0.4448 - val_acc: 0.4658 - val_f1: 0.38 39 | acc: 0.4484 - val_acc: 0.4673 - val_f1: 0.39 40 | acc: 0.4509 - val_acc: 0.4693 - val_f1: 0.39 41 | 42 | Loosen max norm constraint on word embeddings to 2, use class weights 43 | to help model perform better on less frequent classes: 44 | 45 | Embedding size: 50 46 | Number of convolutional filters: 500 47 | Filter width: 4 48 | Hidden fully-connected layers: 49 | Number: 3 50 | Fully-connected layer sizes: 1000, 1000, 500 51 | Learning rate: 0.1 52 | Momentum: 0.9 53 | Decay: 0.000000001 54 | 55 | Train on 4m, validate on 200k: 56 | acc: 0.3606 - val_acc: 0.4497 - val_f1: 0.40 57 | acc: 0.4343 - val_acc: 0.4619 - val_f1: 0.42 58 | acc: 0.4366 - val_acc: 0.4585 - val_f1: 0.42 59 | acc: 0.4362 - val_acc: 0.4624 - val_f1: 0.42 60 | acc: 0.4335 - val_acc: 0.4553 - val_f1: 0.41 61 | acc: 0.4316 - val_acc: 0.4541 - val_f1: 0.41 62 | 63 | --------------------------------------------------------------------------- 64 | What happens when we use an LSTM network instead of a temporal 65 | convolutional network? 
66 | --------------------------------------------------------------------------- 67 | 68 | Embedding size: 50 69 | Number of LSTM layers: 3 70 | Number of units in LSTM layers: 64 71 | Dropout after each LSTM layer: 0.2 72 | Learning rate: 0.1 73 | Momentum: 0.9 74 | Decay: 0 75 | Gradient truncation: -1 (classical BPTT) 76 | Norm clipping threshold: 0 (no clipping) 77 | 78 | Train on 4m, validate on 200k: 79 | acc: 0.2791 - val_acc: 0.3589 - val_f1: 0.29 80 | acc: 0.4383 - val_acc: 0.5068 - val_f1: 0.47 81 | acc: 0.4863 - val_acc: 0.5203 - val_f1: 0.49 82 | acc: 0.5251 - val_acc: 0.5438 - val_f1: 0.51 83 | acc: 0.5461 - val_acc: 0.5613 - val_f1: 0.53 84 | acc: 0.5614 - val_acc: 0.5697 - val_f1: 0.55 85 | acc: 0.5736 - val_acc: 0.5744 - val_f1: 0.56 86 | acc: 0.5825 - val_acc: 0.5800 - val_f1: 0.56 87 | 88 | New configuration (with clipping) 89 | 90 | Embedding size: 50 91 | Number of LSTM layers: 3 92 | Number of units in LSTM layers: 64 93 | Dropout after each LSTM layer: 0.2 94 | Learning rate: 0.1 95 | Momentum: 0.9 96 | Decay: 0 97 | Gradient truncation: -1 (classical BPTT) 98 | Norm clipping threshold: 5 (no clipping) 99 | 100 | Train on 8m, validate on 200k (here one epoch is 100k examples; I had 101 | to split up the 8m examples into separate files, and each gets its 102 | own epoch): 103 | 104 | acc: 0.4823 - val_acc: 0.491425 105 | 106 | New configuration (with clipping) 107 | 108 | Embedding size: 50 109 | Number of LSTM layers: 4 110 | Number of units in LSTM layers: 500 111 | Dropout after each LSTM layer: 0.2 112 | Learning rate: 0.1 113 | Momentum: 0.9 114 | Decay: 0 115 | Gradient truncation: -1 (classical BPTT) 116 | Norm clipping threshold: 5 (no clipping) 117 | 118 | Train on 16m, validate on 200k: 119 | 120 | --------------------------------------------------------------------------- 121 | After creating a balanced data set (which excluded 'about', because it 122 | is so less frequent than the other prepositions), I may have discovered 123 | that training a model with intra-minibatch contrasting cases -- that is, 124 | with every sentence from the corpus being accompanied by an example that 125 | is the same sentence with an error introduced -- is essential to being 126 | able to train this model. 127 | --------------------------------------------------------------------------- 128 | 129 | --------------------------------------------------------------------------- 130 | Multi-task learning; change train.py to allow multiple --target 131 | arguments ... or change the architecture so the targets used are 132 | determined by model.json and model.py? 
133 | --------------------------------------------------------------------------- 134 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | keras=0.3.1 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='Modeling package', packages=['modeling']) 4 | -------------------------------------------------------------------------------- /tests/testdata.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | #import six 5 | import sys 6 | import os 7 | import numpy as np 8 | 9 | import unittest 10 | import modeling.data 11 | 12 | class TestData(unittest.TestCase): 13 | def test_create_window_position_at_beginning(self): 14 | sentence = np.arange(1, 12) 15 | position = 0 16 | expected_window = [0, 0, 0, 1, 2, 3, 4] 17 | window = modeling.data.create_window(sentence, position, 18 | size=7) 19 | 20 | self.assertEqual(7, len(window)) 21 | self.assertTrue(np.all(window == expected_window)) 22 | 23 | def test_create_window_position_at_end_nonce(self): 24 | sentence = np.arange(1, 12) 25 | position = len(sentence) - 1 26 | nonce = 99 27 | expected_window = [8, 9, 10, nonce, 0, 0, 0] 28 | window = modeling.data.create_window(sentence, position, 29 | size=7, nonce=nonce) 30 | 31 | self.assertEqual(7, len(window)) 32 | self.assertTrue(np.all(window == expected_window)) 33 | 34 | def test_create_window_position_before_sentence(self): 35 | sentence = np.arange(1, 12) 36 | position = -1 37 | self.assertRaises( 38 | ValueError, 39 | modeling.data.create_window, 40 | sentence, position) 41 | 42 | def test_create_window_position_after_sentence(self): 43 | sentence = np.arange(1, 12) 44 | position = 12 45 | self.assertRaises( 46 | ValueError, 47 | modeling.data.create_window, 48 | sentence, position) 49 | 50 | -------------------------------------------------------------------------------- /tests/testdifference.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import unittest 5 | import numpy as np 6 | from theano import function 7 | import theano.tensor as T 8 | 9 | from keras.layers.core import Layer 10 | 11 | class TemporalDifference(Layer): 12 | """ 13 | Given a 3-tensor with shape (nb_samples, maxlen, output_dim), outputs 14 | the difference X[ 15 | """ 16 | def _get_output(self, X): 17 | return X[:, 1:, :] - X[:, 0:X.shape[1]-1, :] 18 | 19 | def get_output(self, train): 20 | return self._get_output(self.get_input(train)) 21 | 22 | def get_config(self): 23 | return {"name": self.__class__.__name__} 24 | 25 | class TestTemporalDifference(unittest.TestCase): 26 | def testForward(self): 27 | nb_examples = 2 28 | maxlen = 7 29 | output_dim = nb_word_dim = 5 30 | x = np.random.normal(size=(nb_examples, maxlen, output_dim)).astype(np.float32) 31 | expected = x[:, 1:, :] - x[:, 0:x.shape[1]-1, :] 32 | X = T.tensor3('X') 33 | retval = TemporalDifference()._get_output(X) 34 | f = function([X], retval) 35 | actual = f(x) 36 | self.assertTrue(np.allclose(actual, expected)) 37 | -------------------------------------------------------------------------------- /tests/testlasagne.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import six 5 | import sys 6 | import os 7 | import numpy as np 8 | 9 | import unittest 10 | import modeling.lasagne_model 11 | import modeling.utils 12 | 13 | import theano.tensor as T 14 | import lasagne 15 | 16 | # From Lasagne/examples/mnist.py 17 | def load_mnist(): 18 | # We first define a download function, supporting both Python 2 and 3. 19 | if sys.version_info[0] == 2: 20 | from urllib import urlretrieve 21 | else: 22 | from urllib.request import urlretrieve 23 | 24 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 25 | print("Downloading %s" % filename) 26 | urlretrieve(source + filename, filename) 27 | 28 | # We then define functions for loading MNIST images and labels. 29 | # For convenience, they also download the requested files if needed. 30 | import gzip 31 | 32 | def load_mnist_images(filename): 33 | if not os.path.exists(filename): 34 | download(filename) 35 | # Read the inputs in Yann LeCun's binary format. 36 | with gzip.open(filename, 'rb') as f: 37 | data = np.frombuffer(f.read(), np.uint8, offset=16) 38 | # The inputs are vectors now, we reshape them to monochrome 2D images, 39 | # following the shape convention: (examples, channels, rows, columns) 40 | data = data.reshape(-1, 1, 28, 28) 41 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 42 | # (Actually to range [0, 255/256], for compatibility to the version 43 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 44 | return data / np.float32(256) 45 | 46 | def load_mnist_labels(filename): 47 | if not os.path.exists(filename): 48 | download(filename) 49 | # Read the labels in Yann LeCun's binary format. 50 | with gzip.open(filename, 'rb') as f: 51 | data = np.frombuffer(f.read(), np.uint8, offset=8) 52 | # The labels are vectors of integers now, that's exactly what we want. 53 | return data 54 | 55 | # We can now download and read the training and test set images and labels. 56 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 57 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 58 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 59 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 60 | 61 | # We reserve the last 10000 training examples for validation. 62 | X_train, X_val = X_train[:-10000], X_train[-10000:] 63 | y_train, y_val = y_train[:-10000], y_train[-10000:] 64 | 65 | # We just return all the arrays in order, as expected in main(). 66 | # (It doesn't matter how we do this as long as we can read them again.) 67 | return X_train, y_train, X_val, y_val, X_test, y_test 68 | 69 | class TestModel(modeling.lasagne_model.Classifier): 70 | def build_input_var(self): 71 | return T.tensor4('inputs') 72 | 73 | def build_target_var(self): 74 | return T.ivector('targets') 75 | 76 | def build_updates(self): 77 | return lasagne.updates.nesterov_momentum( 78 | self.train_loss, self.params, learning_rate=0.01, momentum=0.9) 79 | 80 | def build_model(self, input_var): 81 | l_in = lasagne.layers.InputLayer( 82 | shape=(None, 1, 28, 28), input_var=input_var) 83 | 84 | l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2) 85 | 86 | # Add a fully-connected layer of 800 units, using the linear rectifier, and 87 | # initializing weights with Glorot's scheme (which is the default anyway). 
88 | l_hid1 = lasagne.layers.DenseLayer( 89 | l_in_drop, num_units=800, 90 | nonlinearity=lasagne.nonlinearities.rectify, 91 | W=lasagne.init.GlorotUniform()) 92 | 93 | # Finally, we'll add the fully-connected output layer, of 10 softmax units: 94 | l_out = lasagne.layers.DenseLayer( 95 | l_hid1, num_units=10, 96 | nonlinearity=lasagne.nonlinearities.softmax) 97 | 98 | # Each layer is linked to its incoming layer(s), so we only need to pass 99 | # the output layer to give access to a network in Lasagne: 100 | return l_out 101 | 102 | class TestLasagneClassifier(unittest.TestCase): 103 | def test_mnist(self): 104 | args = {} 105 | config = modeling.utils.ModelConfig(**args) 106 | model = TestModel(config) 107 | X_train, y_train, X_val, y_val, X_test, y_test = load_mnist() 108 | n_epochs = 5 109 | batch_size = 256 110 | for epoch in six.moves.range(n_epochs): 111 | for j in six.moves.range(0, len(X_train), batch_size): 112 | model.fit(X_train[j:j+batch_size], y_train[j:j+batch_size]) 113 | val_loss, val_acc = model.evaluate(X_val, y_val) 114 | self.assertTrue(val_acc > 0.9) 115 | 116 | def test_save_load(self): 117 | weights_file = '/tmp/model.npz' 118 | 119 | args = {} 120 | config = modeling.utils.ModelConfig(**args) 121 | rng1 = np.random.RandomState(17) 122 | lasagne.random.set_rng(rng1) 123 | model1 = TestModel(config) 124 | model1.save_weights(weights_file) 125 | 126 | rng2 = np.random.RandomState(23) 127 | lasagne.random.set_rng(rng2) 128 | model2 = TestModel(config) 129 | model2.load_weights(weights_file) 130 | 131 | weights1 = lasagne.layers.get_all_param_values(model1.model) 132 | weights2 = lasagne.layers.get_all_param_values(model2.model) 133 | 134 | for i in six.moves.range(len(weights1)): 135 | w1 = weights1[i] 136 | w2 = weights2[i] 137 | self.assertTrue(np.allclose(w1, w2)) 138 | -------------------------------------------------------------------------------- /tests/testlayers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | #import theano.tensor.nnet 6 | 7 | import keras.models 8 | import keras.layers.core 9 | 10 | from modeling.layers import HierarchicalSoftmax 11 | import modeling.utils 12 | import modeling.builders 13 | 14 | class TestHierarchicalSoftmax(unittest.TestCase): 15 | def setUp(self): 16 | self.batch_size = 1 17 | self.input_dim = 4 18 | self.n_hsm_classes = 5 19 | self.n_outputs_per_class = 3 20 | self.output_size = self.n_hsm_classes * self.n_outputs_per_class 21 | 22 | def test_hierarchical_softmax_integrated(self): 23 | net = keras.models.Sequential() 24 | net.add(keras.layers.core.Dense(100, input_dim=self.input_dim, activation='relu')) 25 | net.add(HierarchicalSoftmax( 26 | self.output_size, self.n_hsm_classes, 27 | #self.n_hsm_classes, self.n_outputs_per_class, 28 | batch_size=self.batch_size)) 29 | net.compile(loss='categorical_crossentropy', optimizer='Adam') 30 | x = np.random.normal(size=(self.batch_size, self.input_dim)) 31 | target = net.predict_proba(x, verbose=0) 32 | n_classes = self.n_hsm_classes * self.n_outputs_per_class 33 | self.assertEqual((self.batch_size, n_classes), target.shape) 34 | 35 | def test_hierarchical_softmax_isolated(self): 36 | layer = HierarchicalSoftmax(self.output_size, self.n_hsm_classes, 37 | #self.n_outputs_per_class, 38 | batch_size=self.batch_size, 39 | input_dim=self.input_dim) 40 | layer.build() 41 | 42 | xt = T.matrix('x') 43 | f = theano.function([xt], layer._get_output(xt)) 44 | x = 
np.random.normal(size=(self.batch_size, self.input_dim)).astype(np.float32) 45 | 46 | output = f(x) 47 | self.assertTrue(output.shape == (self.batch_size, self.output_size)) 48 | self.assertTrue(np.allclose(1.0, output.sum())) 49 | 50 | #@unittest.skip('') 51 | def test_theano_h_softmax(self): 52 | """ 53 | Tests the output dimensions of the h_softmax when a target is provided or 54 | not. 55 | 56 | This test came from 57 | """ 58 | 59 | ############# 60 | # Initialize shared variables 61 | ############# 62 | 63 | floatX = theano.config.floatX 64 | shared = theano.shared 65 | 66 | # Class softmax. 67 | W1 = np.asarray(np.random.normal( 68 | size=(self.input_dim, self.n_hsm_classes)), dtype=floatX) 69 | W1 = shared(W1) 70 | b1 = np.asarray(np.zeros((self.n_hsm_classes,)), dtype=floatX) 71 | b1 = shared(b1) 72 | 73 | # Class member softmax. 74 | W2 = np.asarray(np.random.normal( 75 | size=(self.n_hsm_classes, self.input_dim, self.n_outputs_per_class)), 76 | dtype=floatX) 77 | W2 = shared(W2) 78 | b2 = np.asarray( 79 | np.zeros((self.n_hsm_classes, self.n_outputs_per_class)), dtype=floatX) 80 | b2 = shared(b2) 81 | 82 | ############# 83 | # Build graph 84 | ############# 85 | x = T.matrix('x') 86 | y = T.ivector('y') 87 | 88 | # This only computes the output corresponding to the target 89 | y_hat_tg = theano.tensor.nnet.h_softmax(x, 90 | self.batch_size, self.output_size, self.n_hsm_classes, self.n_outputs_per_class, 91 | W1, b1, W2, b2, y) 92 | 93 | # This computes all the outputs 94 | y_hat_all = theano.tensor.nnet.h_softmax(x, 95 | self.batch_size, self.output_size, self.n_hsm_classes, self.n_outputs_per_class, 96 | W1, b1, W2, b2) 97 | 98 | ############# 99 | # Compile functions 100 | ############# 101 | fun_output_tg = theano.function([x, y], y_hat_tg) 102 | fun_output = theano.function([x], y_hat_all) 103 | 104 | ############# 105 | # Test 106 | ############# 107 | x_mat = np.random.normal(size=(self.batch_size, self.input_dim)).astype(floatX) 108 | y_mat = np.random.randint(0, self.output_size, self.batch_size).astype('int32') 109 | 110 | self.assertTrue(fun_output_tg(x_mat, y_mat).shape == (self.batch_size,)) 111 | self.assertTrue(fun_output(x_mat).shape == (self.batch_size, self.output_size)) 112 | 113 | if __name__ == '__main__': 114 | unittest.main() 115 | -------------------------------------------------------------------------------- /tests/testnonconvnet.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import random 3 | import numpy as np 4 | import theano 5 | import theano.tensor as T 6 | 7 | from keras import models 8 | from keras.layers import embeddings 9 | from keras.layers import core 10 | 11 | from modeling.nonconvnet import ZeroFillDiagonals, \ 12 | SplitOutputByFilter, \ 13 | SlidingWindowL2MaxPooling 14 | 15 | class TestNonConvNet(unittest.TestCase): 16 | def setUp(self): 17 | self.n_vocab = 100 18 | self.n_word_dims = 5 19 | self.filter_width = 4 20 | self.n_filters = 3 21 | self.max_seq_len = 9 22 | self.batch_size = 3 23 | 24 | def setSeeds(self): 25 | np.random.seed(1) 26 | 27 | def testNonConvNet(self): 28 | self.setSeeds() 29 | 30 | x = np.random.randint(self.n_vocab, size=(self.batch_size, 31 | self.max_seq_len)) 32 | 33 | model = models.Sequential() 34 | 35 | # input: (batch_size, max_seq_len) 36 | # output: (batch_size, max_seq_len, n_word_dims) 37 | model.add(embeddings.Embedding(self.n_vocab, self.n_word_dims)) 38 | model.compile(loss='mse', optimizer='sgd') 39 | expected_shape_l1 = 
(self.batch_size, self.max_seq_len, 40 | self.n_word_dims) 41 | output_l1 = model.predict(x) 42 | self.assertEqual(expected_shape_l1, output_l1.shape) 43 | 44 | # input: (batch_size, max_seq_len, n_word_dims) 45 | # output: (batch_size, max_seq_len, n_filters * filter_width) 46 | model.add(core.TimeDistributedDense( 47 | self.n_word_dims, self.n_filters * self.filter_width)) 48 | model.compile(loss='mse', optimizer='sgd') 49 | expected_shape_l2 = (self.batch_size, self.max_seq_len, 50 | self.n_filters * self.filter_width) 51 | output_l2 = model.predict(x) 52 | self.assertEqual(expected_shape_l2, output_l2.shape) 53 | 54 | # input: (batch_size, max_seq_len, n_filters * filter_width) 55 | # output: (batch_size, n_filters, max_seq_len, filter_width) 56 | model.add(SplitOutputByFilter(self.n_filters, self.filter_width)) 57 | model.compile(loss='mse', optimizer='sgd') 58 | expected_shape_l3 = (self.batch_size, self.n_filters, 59 | self.max_seq_len, self.filter_width) 60 | output_l3 = model.predict(x) 61 | self.assertEqual(expected_shape_l3, output_l3.shape) 62 | 63 | # input: (batch_size, n_filters, max_seq_len, filter_width) 64 | # output: (batch_size, n_filters, filter_width, filter_width) 65 | model.add(SlidingWindowL2MaxPooling( 66 | self.batch_size, self.n_filters, 67 | self.filter_width, self.max_seq_len)) 68 | model.compile(loss='mse', optimizer='sgd') 69 | expected_shape_l4 = (self.batch_size, self.n_filters, 70 | self.filter_width, self.filter_width) 71 | output_l4 = model.predict(x) 72 | self.assertEqual(expected_shape_l4, output_l4.shape) 73 | 74 | # input: (batch_size, n_filters, filter_width, filter_width) 75 | # output: (batch_size, n_filters, filter_width, filter_width) 76 | model.add(ZeroFillDiagonals( 77 | self.batch_size, self.n_filters, self.filter_width)) 78 | model.compile(loss='mse', optimizer='sgd') 79 | expected_shape_l5 = (self.batch_size, self.n_filters, 80 | self.filter_width, self.filter_width) 81 | output_l5 = model.predict(x) 82 | self.assertEqual(expected_shape_l5, output_l5.shape) 83 | 84 | def testSplitOutputByFilter(self): 85 | self.setSeeds() 86 | 87 | input_shape = (self.batch_size, self.max_seq_len, 88 | self.n_filters * self.filter_width) 89 | output_shape = (self.batch_size, self.n_filters, 90 | self.max_seq_len, self.filter_width) 91 | 92 | x = np.arange(np.prod(input_shape)) 93 | x = x.reshape(input_shape).astype(np.int32) 94 | y = np.zeros_like(x) 95 | y = np.reshape(y, output_shape) 96 | 97 | for i in range(self.n_filters): 98 | s = x[:, :, i*self.filter_width:(i+1)*self.filter_width] 99 | y[:, i, :, :] = s 100 | 101 | xt = T.itensor3('xt') 102 | layer = SplitOutputByFilter(self.n_filters, self.filter_width) 103 | yt = layer._get_output(xt) 104 | 105 | f = theano.function(inputs=[xt], outputs=yt) 106 | y_theano = f(x) 107 | 108 | self.assertEquals(y.shape, y_theano.shape) 109 | self.assertTrue(np.all(y == y_theano)) 110 | 111 | def testSlidingWindowL2MaxPooling(self): 112 | self.assertTrue( 113 | self.max_seq_len - self.filter_width > self.n_filters) 114 | 115 | self.setSeeds() 116 | 117 | input_shape = (self.batch_size, self.n_filters, 118 | self.max_seq_len, self.filter_width) 119 | output_shape = (self.batch_size, self.n_filters, 120 | self.filter_width, self.filter_width) 121 | 122 | x = np.zeros(shape=input_shape) 123 | expected = np.zeros(shape=output_shape) 124 | 125 | max_input_shape = (self.batch_size, self.filter_width, self.filter_width) 126 | 127 | # For the i-th filter, make i the offset at which the maximum 128 | # L2 norm occurs. 
129 | for i in np.arange(self.n_filters): 130 | start = i 131 | end = i+self.filter_width 132 | values = i + np.arange(np.prod(max_input_shape)) 133 | values = values.reshape(max_input_shape) 134 | x[:, i, start:end, :] = values 135 | expected[:, i, :, :] = values 136 | 137 | it = T.iscalar() 138 | x3d = T.dtensor3('x3d') 139 | x4d = T.dtensor4('x4d') 140 | 141 | layer = SlidingWindowL2MaxPooling( 142 | self.batch_size, self.n_filters, self.filter_width, 143 | self.max_seq_len) 144 | 145 | ''' 146 | Use the first sample and first filter to test `filter_dimension`. 147 | ''' 148 | yt_filter_dim = layer.filter_dimension(it, x3d) 149 | f_filter_dim = theano.function(inputs=[it, x3d], outputs=yt_filter_dim) 150 | y_filter_dim_out = f_filter_dim(0, x[0]) 151 | self.assertEquals((self.filter_width, self.filter_width), 152 | y_filter_dim_out.shape) 153 | self.assertTrue(np.all(expected[0, 0, :, :] == y_filter_dim_out)) 154 | 155 | ''' 156 | Use the first sample to test `filter_dimension`. 157 | ''' 158 | yt_sample_dim = layer.sample_dimension(it, x4d) 159 | f_sample_dim = theano.function(inputs=[it, x4d], outputs=yt_sample_dim) 160 | y_sample_dim_out = f_sample_dim(0, x) 161 | self.assertEquals((self.n_filters, self.filter_width, self.filter_width), 162 | y_sample_dim_out.shape) 163 | self.assertTrue(np.all(expected[0, :, :, :] == y_sample_dim_out)) 164 | 165 | ''' 166 | Use all of `x` to test `_get_output`. 167 | ''' 168 | yt_output = layer._get_output(x4d) 169 | f_output = theano.function(inputs=[x4d], outputs=yt_output) 170 | yt_out = f_output(x) 171 | self.assertEquals( 172 | (self.batch_size, self.n_filters, self.filter_width, 173 | self.filter_width), yt_out.shape) 174 | self.assertTrue(np.all(expected == yt_out)) 175 | 176 | def testZeroFillDiagonals(self): 177 | input_shape = (self.batch_size, self.n_filters, 178 | self.filter_width, self.filter_width) 179 | mask = np.ones(input_shape) 180 | diag_indices = np.arange(self.filter_width) 181 | for i in np.arange(self.batch_size): 182 | for j in np.arange(self.n_filters): 183 | mask[i, j, diag_indices, diag_indices] = 0 184 | 185 | x = np.arange(np.prod(input_shape)).reshape(input_shape) 186 | expected = x * mask 187 | 188 | x4d = T.dtensor4('x4d') 189 | layer = ZeroFillDiagonals( 190 | self.batch_size, self.n_filters, self.filter_width) 191 | yt_output = layer._get_output(x4d) 192 | f_output = theano.function(inputs=[x4d], outputs=yt_output) 193 | 194 | yt_out = f_output(x) 195 | self.assertEquals(expected.shape, yt_out.shape) 196 | self.assertTrue(np.all(expected == yt_out)) 197 | -------------------------------------------------------------------------------- /train_chainer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | import sys 7 | import six 8 | import argparse 9 | import progressbar 10 | import copy 11 | import cPickle 12 | 13 | import numpy as np 14 | import pandas as pd 15 | 16 | import chainer 17 | from chainer import cuda 18 | from modeling.chainer_model import Classifier 19 | from modeling.utils import ( 20 | load_model_data, load_model_json, build_model_id, build_model_path, 21 | setup_model_dir, setup_logging, ModelConfig) 22 | import modeling.parser 23 | 24 | def main(args): 25 | if args.gpu >= 0: 26 | cuda.check_cuda_available() 27 | xp = cuda.cupy if args.gpu >= 0 else np 28 | 29 | model_id = build_model_id(args) 30 | model_path = build_model_path(args, model_id) 31 
| setup_model_dir(args, model_path) 32 | sys.stdout, sys.stderr = setup_logging(args) 33 | 34 | x_train, y_train = load_model_data(args.train_file, 35 | args.data_name, args.target_name, 36 | n=args.n_train) 37 | x_validation, y_validation = load_model_data( 38 | args.validation_file, 39 | args.data_name, args.target_name, 40 | n=args.n_validation) 41 | 42 | rng = np.random.RandomState(args.seed) 43 | 44 | N = len(x_train) 45 | N_validation = len(x_validation) 46 | 47 | n_classes = max(np.unique(y_train)) + 1 48 | json_cfg = load_model_json(args, x_train, n_classes) 49 | 50 | print('args.model_dir', args.model_dir) 51 | sys.path.append(args.model_dir) 52 | from model import Model 53 | model_cfg = ModelConfig(**json_cfg) 54 | model = Model(model_cfg) 55 | setattr(model, 'stop_training', False) 56 | 57 | if args.gpu >= 0: 58 | cuda.get_device(args.gpu).use() 59 | model.to_gpu() 60 | 61 | best_accuracy = 0. 62 | best_epoch = 0 63 | 64 | def keep_training(epoch, best_epoch): 65 | if model_cfg.n_epochs is not None and epoch > model_cfg.n_epochs: 66 | return False 67 | if epoch > 1 and epoch - best_epoch > model_cfg.patience: 68 | return False 69 | return True 70 | 71 | epoch = 1 72 | 73 | while True: 74 | if not keep_training(epoch, best_epoch): 75 | break 76 | 77 | if args.shuffle: 78 | perm = np.random.permutation(N) 79 | else: 80 | perm = np.arange(N) 81 | 82 | sum_accuracy = 0 83 | sum_loss = 0 84 | 85 | pbar = progressbar.ProgressBar(term_width=40, 86 | widgets=[' ', progressbar.Percentage(), 87 | ' ', progressbar.ETA()], 88 | maxval=N).start() 89 | 90 | for j, i in enumerate(six.moves.range(0, N, model_cfg.batch_size)): 91 | pbar.update(j+1) 92 | x_batch = xp.asarray(x_train[perm[i:i + model_cfg.batch_size]].flatten()) 93 | y_batch = xp.asarray(y_train[perm[i:i + model_cfg.batch_size]]) 94 | pred, loss, acc = model.fit(x_batch, y_batch) 95 | sum_loss += float(loss.data) * len(y_batch) 96 | sum_accuracy += float(acc.data) * len(y_batch) 97 | 98 | pbar.finish() 99 | print('train epoch={}, mean loss={}, accuracy={}'.format( 100 | epoch, sum_loss / N, sum_accuracy / N)) 101 | 102 | # Validation set evaluation 103 | sum_accuracy = 0 104 | sum_loss = 0 105 | 106 | pbar = progressbar.ProgressBar(term_width=40, 107 | widgets=[' ', progressbar.Percentage(), 108 | ' ', progressbar.ETA()], 109 | maxval=N_validation).start() 110 | 111 | for i in six.moves.range(0, N_validation, model_cfg.batch_size): 112 | pbar.update(i+1) 113 | x_batch = xp.asarray(x_validation[i:i + model_cfg.batch_size].flatten()) 114 | y_batch = xp.asarray(y_validation[i:i + model_cfg.batch_size]) 115 | pred, loss, acc = model.predict(x_batch, target=y_batch) 116 | sum_loss += float(loss.data) * len(y_batch) 117 | sum_accuracy += float(acc.data) * len(y_batch) 118 | 119 | pbar.finish() 120 | validation_accuracy = sum_accuracy / N_validation 121 | validation_loss = sum_loss / N_validation 122 | 123 | if validation_accuracy > best_accuracy: 124 | best_accuracy = validation_accuracy 125 | best_epoch = epoch 126 | if model_path is not None: 127 | if args.gpu >= 0: 128 | model.to_cpu() 129 | store = { 130 | 'args': args, 131 | 'model': model, 132 | } 133 | cPickle.dump(store, open(model_path + '.store', 'w')) 134 | if args.gpu >= 0: 135 | model.to_gpu() 136 | 137 | print('validation epoch={}, mean loss={}, accuracy={} best=[accuracy={} epoch={}]'.format( 138 | epoch, validation_loss, validation_accuracy, 139 | best_accuracy, 140 | best_epoch)) 141 | 142 | epoch += 1 143 | 144 | if __name__ == '__main__': 145 | parser = 
modeling.parser.build_chainer() 146 | sys.exit(main(parser.parse_args())) 147 | -------------------------------------------------------------------------------- /train_keras_simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | import os, sys, shutil 7 | import logging 8 | import json 9 | import uuid 10 | import json 11 | import itertools 12 | 13 | import numpy as np 14 | 15 | import theano 16 | import h5py 17 | import six 18 | from sklearn.metrics import accuracy_score 19 | 20 | from keras.utils import np_utils 21 | from keras.optimizers import SGD 22 | import keras.callbacks 23 | from keras.callbacks import ModelCheckpoint, EarlyStopping 24 | import keras.models 25 | 26 | sys.path.append('.') 27 | 28 | from modeling.callbacks import (ClassificationReport, 29 | ConfusionMatrix, PredictionCallback, 30 | DelegatingMetricCallback, 31 | SingleStepLearningRateSchedule) 32 | from modeling.utils import (count_parameters, callable_print, 33 | setup_logging, setup_model_dir, save_model_info, 34 | load_model_data, load_model_json, load_target_data, 35 | build_model_id, build_model_path, 36 | ModelConfig) 37 | import modeling.preprocess 38 | import modeling.parser 39 | 40 | def main(args): 41 | model_id = build_model_id(args) 42 | model_path = build_model_path(args, model_id) 43 | setup_model_dir(args, model_path) 44 | 45 | rng = np.random.RandomState(args.seed) 46 | 47 | json_cfg = load_model_json(args, x_train=None, n_classes=None) 48 | model_cfg = ModelConfig(**json_cfg) 49 | if args.verbose: 50 | print("model_cfg " + str(model_cfg)) 51 | 52 | sys.path.append(args.model_dir) 53 | import model 54 | from model import build_model, fit_model, load_train, load_validation 55 | 56 | train_data = load_train(args, model_cfg) 57 | validation_data = load_validation(args, model_cfg) 58 | 59 | if args.verbose: 60 | print("loading model") 61 | model = build_model(model_cfg, train_data, validation_data) 62 | fit_model(model, train_data, validation_data, args) 63 | 64 | if __name__ == '__main__': 65 | parser = modeling.parser.build_keras() 66 | sys.exit(main(parser.parse_args())) 67 | -------------------------------------------------------------------------------- /train_lasagne.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | import sys 7 | import six 8 | import argparse 9 | import progressbar 10 | import copy 11 | import cPickle 12 | import itertools 13 | 14 | import numpy as np 15 | import pandas as pd 16 | 17 | from modeling.lasagne_model import Classifier 18 | from modeling.utils import ( 19 | load_model_data, load_model_json, build_model_id, build_model_path, 20 | setup_model_dir, setup_logging, ModelConfig) 21 | import modeling.parser 22 | 23 | def keep_training(epoch, best_epoch, model_cfg): 24 | if model_cfg.n_epochs is not None and epoch > model_cfg.n_epochs: 25 | return False 26 | if epoch > 1 and epoch - best_epoch > model_cfg.patience: 27 | return False 28 | return True 29 | 30 | def train_one_epoch(model, x_train, y_train, args, model_cfg, progress=False): 31 | n = len(x_train) 32 | 33 | if args.shuffle: 34 | perm = np.random.permutation(n) 35 | else: 36 | perm = np.arange(n) 37 | 38 | if progress: 39 | pbar = progressbar.ProgressBar(term_width=40, 40 | widgets=[' ', 
progressbar.Percentage(), 41 | ' ', progressbar.ETA()], 42 | maxval=n).start() 43 | else: 44 | pbar = None 45 | 46 | train_loss = 0 47 | 48 | for j, i in enumerate(six.moves.range(0, n, model_cfg.batch_size)): 49 | if progress: 50 | pbar.update(j+1) 51 | x = x_train[perm[i:i + model_cfg.batch_size]] 52 | y = y_train[perm[i:i + model_cfg.batch_size]] 53 | if len(x) != model_cfg.batch_size: 54 | # TODO: how do other frameworks solve this? 55 | continue 56 | train_loss += model.fit(x, y) 57 | 58 | if progress: 59 | pbar.finish() 60 | 61 | return train_loss/float(n) 62 | 63 | def validate(model, x_valid, y_valid, args, model_cfg, progress=False): 64 | n = len(x_valid) 65 | 66 | if progress: 67 | pbar = progressbar.ProgressBar(term_width=40, 68 | widgets=[' ', progressbar.Percentage(), 69 | ' ', progressbar.ETA()], 70 | maxval=n).start() 71 | else: 72 | pbar = None 73 | 74 | val_accuracy = 0. 75 | val_loss = 0. 76 | 77 | for i in six.moves.range(0, n, model_cfg.batch_size): 78 | if progress: 79 | pbar.update(i+1) 80 | x = x_valid[i:i + model_cfg.batch_size] 81 | y = y_valid[i:i + model_cfg.batch_size] 82 | loss, acc = model.evaluate(x, y) 83 | val_loss += loss 84 | val_accuracy += acc 85 | 86 | if progress: 87 | pbar.finish() 88 | 89 | return val_loss/float(n), val_accuracy/float(n) 90 | 91 | def main(args): 92 | model_id = build_model_id(args) 93 | model_path = build_model_path(args, model_id) 94 | setup_model_dir(args, model_path) 95 | sys.stdout, sys.stderr = setup_logging(args) 96 | 97 | rng = np.random.RandomState(args.seed) 98 | 99 | x_train, y_train = load_model_data(args.train_file, 100 | args.data_name, args.target_name, 101 | n=args.n_train) 102 | 103 | x_valid, y_valid = load_model_data( 104 | args.validation_file, 105 | args.data_name, args.target_name, 106 | n=args.n_validation) 107 | 108 | train_files = args.extra_train_file + [args.train_file] 109 | train_files_iter = itertools.cycle(train_files) 110 | 111 | n_classes = max(np.unique(y_train)) + 1 112 | json_cfg = load_model_json(args, x_train, n_classes) 113 | 114 | sys.path.append(args.model_dir) 115 | from model import Model 116 | model_cfg = ModelConfig(**json_cfg) 117 | model = Model(model_cfg) 118 | setattr(model, 'stop_training', False) 119 | 120 | best_accuracy = 0. 
121 | best_epoch = 0 122 | 123 | epoch = 1 124 | iteration = 0 125 | 126 | while True: 127 | if not keep_training(epoch, best_epoch, model_cfg): 128 | break 129 | 130 | train_loss = train_one_epoch(model, x_train, y_train, 131 | args, model_cfg, progress=args.progress) 132 | 133 | val_loss, val_accuracy = validate(model, x_valid, y_valid, 134 | args, model_cfg, progress=args.progress) 135 | 136 | if val_accuracy > best_accuracy: 137 | best_accuracy = val_accuracy 138 | best_epoch = epoch 139 | if model_path is not None: 140 | model.save_weights(model_path + '.npz') 141 | cPickle.dump(model, open(model_path + '.pkl', 'w')) 142 | 143 | print('epoch={epoch:05d}, iteration={iteration:05d}, loss={loss:.04f}, val_loss={val_loss:.04f}, val_acc={val_acc:.04f} best=[accuracy={best_accuracy:.04f} epoch={best_epoch:05d}]'.format( 144 | epoch=epoch, iteration=iteration, 145 | loss=train_loss, val_loss=val_loss, val_acc=val_accuracy, 146 | best_accuracy=best_accuracy, best_epoch=best_epoch)) 147 | 148 | iteration += 1 149 | if iteration % len(train_files) == 0: 150 | epoch += 1 151 | 152 | x_train, y_train = load_model_data( 153 | next(train_files_iter), 154 | args.data_name, args.target_name, 155 | n=args.n_train) 156 | 157 | if __name__ == '__main__': 158 | parser = modeling.parser.build_lasagne() 159 | sys.exit(main(parser.parse_args())) 160 | --------------------------------------------------------------------------------