├── .gitignore
├── README.md
├── bin
│   ├── run-spelling-convnet-exp01-convparams-final.sh
│   ├── run-spelling-convnet-exp01-convparams.sh
│   ├── run-spelling-convnet-exp02-embeddings.sh
│   ├── run-spelling-convnet-exp03-inputs.sh
│   ├── run-spelling-convnet-exp04-real-errors.sh
│   ├── run-spelling-convnet-exp05-multiclass.sh
│   ├── run-spelling-convnet-residual.sh
│   ├── run-spelling-convnet.sh
│   ├── run-spelling-correction-isolated-binary.sh
│   ├── run-spelling-correction-isolated-multiclass.sh
│   └── run-spelling-lstm.sh
├── contrasting_cases.py
├── modeling
│   ├── __init__.py
│   ├── autograd_examples.py
│   ├── builders.py
│   ├── callbacks.py
│   ├── chainer_model.py
│   ├── data.py
│   ├── dataset.py
│   ├── difference.py
│   ├── fbeta.py
│   ├── fbeta_predict.py
│   ├── lasagne_model.py
│   ├── layers.py
│   ├── nonconvnet.py
│   ├── outliers.py
│   ├── parser.py
│   ├── preprocess.py
│   ├── residual.py
│   ├── spelling.py
│   └── utils.py
├── models
│   ├── keras
│   │   ├── attention
│   │   │   ├── model.json
│   │   │   └── model.py
│   │   ├── preposition
│   │   │   ├── convnet
│   │   │   │   ├── 4e0ae5dc683611e5950afcaa149e39ea
│   │   │   │   │   ├── model.py
│   │   │   │   │   └── model_old_keras.py
│   │   │   │   ├── model-word2vec.json
│   │   │   │   ├── model.json
│   │   │   │   ├── model.py
│   │   │   │   ├── run-medium.sh
│   │   │   │   ├── run-small.sh
│   │   │   │   └── small
│   │   │   │       └── find-best-filter-size
│   │   │   │           ├── find-best.sh
│   │   │   │           └── find-best.txt
│   │   │   └── lstm
│   │   │       ├── model.json
│   │   │       └── model.py
│   │   └── spelling
│   │       ├── convnet
│   │       │   ├── exp03-inputs
│   │       │   │   └── op_transpose_n_ops_1_n_errors_per_word_3
│   │       │   │       └── analysis.py
│   │       │   ├── model.json
│   │       │   └── model.py
│   │       ├── correction
│   │       │   └── isolated
│   │       │       ├── binary
│   │       │       │   ├── model.json
│   │       │       │   └── model.py
│   │       │       └── multiclass
│   │       │           ├── model.json
│   │       │           └── model.py
│   │       ├── data
│   │       │   └── nietzsche.txt
│   │       └── toksents.py
│   └── lasagne
│       └── spelling
│           └── convnet
│               ├── model.json
│               └── model.py
├── notebooks
│   ├── ConvnetAnalysis.ipynb
│   ├── ConvnetAnalysisHumanJudgments.ipynb
│   ├── ConvnetSensitivityAnalysis.ipynb
│   ├── Spelling.ipynb
│   └── notes.txt
├── requirements.txt
├── setup.py
├── tests
│   ├── testdata.py
│   ├── testdifference.py
│   ├── testlasagne.py
│   ├── testlayers.py
│   └── testnonconvnet.py
├── train_chainer.py
├── train_keras.py
├── train_keras_simple.py
└── train_lasagne.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # modeling
2 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp01-convparams-final.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 | distance=1
6 | errors=3
7 |
8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
9 | experiment_dir=$model_dir/$experiment_name
10 | mkdir -p $experiment_dir
11 |
12 | for operation in delete
13 | do
14 | for n_embed_dims in 10
15 | do
16 | for n_filters in 3000
17 | do
18 | for filter_width in 6
19 | do
20 | for n_fully_connected in 1
21 | do
22 | for n_residual_blocks in 0
23 | do
24 | for n_hidden in 1000
25 | do
26 | model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden}
27 | if [ -d $model_dest ]
28 | then
29 | continue
30 | fi
31 | ./train_keras.py $model_dir \
32 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \
33 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \
34 | chars \
35 | --target-name binary_target \
36 | --model-dest $model_dest \
37 | --n-embeddings 61 \
38 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 batch_size=32 \
39 | --shuffle \
40 | --confusion-matrix \
41 | --classification-report \
42 | --class-weight-auto \
43 | --class-weight-exponent 3 \
44 | --early-stopping-metric f2 \
45 | --verbose \
46 | --log
47 | done
48 | done
49 | done
50 | done
51 | done
52 | done
53 | done
54 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp01-convparams.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/old/
5 | distance=1
6 | errors=3
7 |
8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
9 | experiment_dir=$model_dir/$experiment_name
10 | mkdir -p $experiment_dir
11 |
12 | for operation in delete
13 | do
14 | for n_embed_dims in 10 30 100
15 | do
16 | for n_filters in 100 200 300
17 | do
18 | for filter_width in 2 4 6 8
19 | do
20 | for n_fully_connected in 1
21 | do
22 | for n_residual_blocks in 0
23 | do
24 | for n_hidden in 100 200 300
25 | do
26 | model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden}
27 | if [ -d $model_dest ]
28 | then
29 | continue
30 | fi
31 | echo ./train_keras.py $model_dir \
32 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
33 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
34 | word \
35 | --target-name target \
36 | --model-dest $model_dest \
37 | --n-embeddings 61 \
38 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=5 \
39 | --shuffle \
40 | --confusion-matrix \
41 | --classification-report \
42 | --class-weight-auto \
43 | --class-weight-exponent 3 \
44 | --early-stopping-metric f2 \
45 | --verbose \
46 | --log
47 | done
48 | done
49 | done
50 | done
51 | done
52 | done
53 | done | parallel --gnu -j 2
54 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp02-embeddings.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 | distance=1
6 | errors=3
7 |
8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
9 | experiment_dir=$model_dir/$experiment_name
10 | mkdir -p $experiment_dir
11 |
12 | operation=delete
13 | n_embed_dims=56
14 | n_filters=10
15 | filter_width=6
16 | n_fully_connected=0
17 | n_hidden=0
18 |
19 | for embedding_init in identity orthogonal uniform normal
20 | do
21 | for train_embeddings in false true
22 | do
23 | model_dest=$experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_hidden_${n_hidden}_embedding_init_${embedding_init}_train_embeddings_${train_embeddings}
24 | #--model-dest $model_dest \
25 | echo $model_dest
26 | ./train_keras.py $model_dir \
27 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \
28 | $data_dir/op-${operation}-distance-${distance}-errors-per-word-${errors}.h5 \
29 | chars \
30 | --target-name binary_target \
31 | --n-embeddings 56 \
32 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_hidden=$n_hidden embedding_init=$embedding_init train_embeddings=$train_embeddings optimizer=SGD learning_rate=0.001 momentum=0.0 decay=0.0 \
33 | --shuffle \
34 | --confusion-matrix \
35 | --classification-report \
36 | --class-weight-auto \
37 | --class-weight-exponent 3 \
38 | --verbose \
39 | --n-train 50000 \
40 | --n-epochs 3 \
41 | --no-save
42 | #--log \
43 | done
44 | done
45 | #| parallel --gnu -j 2
46 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp03-inputs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 |
6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
7 | experiment_dir=$model_dir/$experiment_name
8 | mkdir -p $experiment_dir
9 |
10 | n_embed_dims=10
11 | n_filters=3000
12 | filter_width=6
13 | n_fully_connected=1
14 | n_residual_blocks=0
15 | n_hidden=1000
16 |
17 | for operation in delete insert substitute transpose
18 | do
19 | for n_operations in 1 2
20 | do
21 | for n_errors_per_word in 3 10
22 | do
23 | model_dest=$experiment_dir/op_${operation}_n_ops_${n_operations}_n_errors_per_word_${n_errors_per_word}
24 | if [ -d $model_dest ]
25 | then
26 | continue
27 | fi
28 | echo ./train_keras.py $model_dir \
29 | $data_dir/op-${operation}-distance-${n_operations}-errors-per-word-${n_errors_per_word}.h5 \
30 | $data_dir/op-${operation}-distance-${n_operations}-errors-per-word-${n_errors_per_word}.h5 \
31 | marked_chars \
32 | --target-name binary_target \
33 | --model-dest $model_dest \
34 | --n-embeddings 61 \
35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \
36 | --shuffle \
37 | --confusion-matrix \
38 | --classification-report \
39 | --class-weight-auto \
40 | --class-weight-exponent 3 \
41 | --early-stopping-metric val_f2 \
42 | --checkpoint-metric val_f2 \
43 | --verbose \
44 | --log
45 | break
46 | done
47 | done
48 | done | parallel --gnu -j 2
49 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp04-real-errors.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 |
6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
7 | experiment_dir=$model_dir/$experiment_name
8 | mkdir -p $experiment_dir
9 |
10 | n_embed_dims=10
11 | n_filters=3000
12 | filter_width=6
13 | n_fully_connected=1
14 | n_residual_blocks=0
15 | n_hidden=1000
16 |
17 | # Train two models, one with random artificial errors, one with artificial
18 | # errors learned from a corpus of real errors.
19 |
20 | corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5"
21 |
22 | for corpus in $corpora
23 | do
24 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,')
25 | if [ -d $model_dest ]
26 | then
27 | continue
28 | fi
29 | ./train_keras.py $model_dir \
30 | $data_dir/$corpus \
31 | $data_dir/$corpus \
32 | marked_chars \
33 | --target-name binary_target \
34 | --model-dest $model_dest \
35 | --n-embeddings 255 \
36 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \
37 | --shuffle \
38 | --confusion-matrix \
39 | --classification-report \
40 | --class-weight-auto \
41 | --class-weight-exponent 3 \
42 | --early-stopping-metric val_f2 \
43 | --checkpoint-metric val_f2 \
44 | --save-all-checkpoints \
45 | --verbose \
46 | --log
47 | done
48 | #| parallel --gnu -j 2
49 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-exp05-multiclass.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -ex
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 | distance=1
6 | errors=3
7 |
8 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
9 | experiment_dir=$model_dir/$experiment_name
10 | mkdir -p $experiment_dir
11 |
12 | for operation in delete
13 | do
14 | for n_embed_dims in 100
15 | do
16 | for n_filters in 300
17 | do
18 | for filter_width in 8
19 | do
20 | for n_fully_connected in 2
21 | do
22 | for n_residual_blocks in 1
23 | do
24 | for n_hidden in 300
25 | do
26 | ./train_keras.py $model_dir \
27 | $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/000.h5 \
28 | $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/000.h5 \
29 | chars \
30 | --target-name multiclass_target \
31 | --model-dest $experiment_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden}_n_hsm_classes_5000 \
32 | --n-embeddings 61 \
33 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=240 n_hsm_classes=5000 batch_size=8 \
34 | --shuffle \
35 | --class-weight-auto \
36 | --class-weight-exponent 3 \
37 | --early-stopping-metric f1 \
38 | --verbose \
39 | --target-data $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}.json \
40 | --extra-train-file $(ls $data_dir/op-$operation-distance-$distance-errors-per-word-${errors}/* | egrep -v '000.h5') \
41 | --n-classes 119773 \
42 | --log
43 | done
44 | done
45 | done
46 | done
47 | done
48 | done
49 | done
50 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet-residual.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 | distance=1
6 | errors=3
7 | nonce_interval=
8 |
9 | crossval_dir=$model_dir/crossval
10 | mkdir -p $crossval_dir
11 |
12 | #for operation in delete insert substitute transpose
13 | for operation in delete
14 | do
15 | for n_embed_dims in 100
16 | do
17 | for n_filters in 1000
18 | do
19 | for filter_width in 5
20 | do
21 | for n_fully_connected in 1 2 3 4 5 6 7
22 | do
23 | for n_residual_blocks in 0
24 | do
25 | for n_hidden in 100
26 | do
27 | echo ./train_keras.py $model_dir \
28 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
29 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
30 | word \
31 | --model-dest $crossval_dir/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_residual_blocks_${n_residual_blocks}_n_hidden_${n_hidden} \
32 | --target-name target \
33 | --n-embeddings 61 \
34 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=${n_fully_connected} n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \
35 | --shuffle \
36 | --confusion-matrix \
37 | --classification-report \
38 | --class-weight-auto \
39 | --class-weight-exponent 3 \
40 | --early-stopping-metric f2 \
41 | --n-validation 100000 \
42 | --log \
43 | --verbose
44 | done
45 | done
46 | done
47 | done
48 | done
49 | done
50 | done | parallel --gnu -j 2
51 |
--------------------------------------------------------------------------------
/bin/run-spelling-convnet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -e
2 |
3 | model_dir=models/keras/spelling/convnet
4 | data_dir=data/spelling/experimental/
5 | distance=1
6 | errors=3
7 | nonce_interval=-nonce-interval-3
8 |
9 | mkdir -p $model_dir/crossval
10 |
11 | #for operation in delete insert substitute transpose
12 | #for nonce in "" "-nonce-interval-3"
13 | #do
14 | for operation in delete
15 | do
16 | for n_embed_dims in 100
17 | do
18 | for n_filters in 1000
19 | do
20 | for filter_width in 5
21 | do
22 | for n_hidden in 100
23 | do
24 | for n_fully_connected in 1 2 3 4
25 | do
26 | echo ./train_keras.py $model_dir \
27 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
28 | $data_dir/$operation-${errors}errors1word-distance-$distance${nonce}.h5 \
29 | word \
30 | --model-dest $model_dir/crossval/op_${operation}_n_embed_dims_${n_embed_dims}_n_filters_${n_filters}_filter_width_${filter_width}_n_fully_connected_${n_fully_connected}_n_hidden_${n_hidden} \
31 | --target-name target \
32 | --n-embeddings 61 \
33 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_hidden=$n_hidden n_fully_connected=${n_fully_connected} patience=3 \
34 | --shuffle \
35 | --confusion-matrix \
36 | --classification-report \
37 | --class-weight-auto \
38 | --class-weight-exponent 3 \
39 | --early-stopping-metric f2 \
40 | --n-validation 100000 \
41 | --log
42 | done
43 | done
44 | done
45 | done
46 | done
47 | done | parallel --gnu -j 2
48 |
--------------------------------------------------------------------------------
/bin/run-spelling-correction-isolated-binary.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=models/keras/spelling/correction/isolated/binary/
4 | data_dir=data/spelling/experimental/
5 |
6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
7 | experiment_dir=$model_dir/$experiment_name
8 | mkdir -p $experiment_dir
9 |
10 | n_embed_dims=10
11 | n_filters=3000
12 | filter_width=6
13 | n_fully_connected=2
14 | n_residual_blocks=2
15 | n_hidden=1000
16 |
17 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5"
18 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5"
19 | corpora="non-word-error-detection-experiment-04-generated-negative-examples.h5"
20 |
21 | for corpus in $corpora
22 | do
23 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,')
24 | if [ -d $model_dest ]
25 | then
26 | continue
27 | fi
28 | ./train_keras_simple.py $model_dir \
29 | $data_dir/$corpus \
30 | $data_dir/$corpus \
31 | non_word_marked_chars real_word_marked_chars \
32 | --target-name binary_target \
33 | --model-dest $model_dest \
34 | --n-embeddings 255 \
35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \
36 | --class-weight-exponent 3 \
37 | --verbose \
38 | --no-save
39 | done
40 | #--log
41 | #| parallel --gnu -j 2
42 |
--------------------------------------------------------------------------------
/bin/run-spelling-correction-isolated-multiclass.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | model_dir=models/keras/spelling/correction/isolated/multiclass/
4 | data_dir=data/spelling/experimental/
5 |
6 | experiment_name=$(echo $0 | sed -r 's/.*-(exp[0-9][0-9]-..*).sh/\1/')
7 | experiment_dir=$model_dir/$experiment_name
8 | mkdir -p $experiment_dir
9 |
10 | n_embed_dims=10
11 | n_filters=3000
12 | filter_width=6
13 | n_fully_connected=2
14 | n_residual_blocks=2
15 | n_hidden=1000
16 |
17 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5 non-word-error-detection-experiment-04-generated-negative-examples.h5"
18 | #corpora="non-word-error-detection-experiment-04-random-negative-examples.h5"
19 | corpora="non-word-error-detection-experiment-04-generated-negative-examples.h5"
20 |
21 | for corpus in $corpora
22 | do
23 | model_dest=$experiment_dir/$(echo $corpus | sed -e 's,-,_,g' -e 's,.h5,,')
24 | if [ -d $model_dest ]
25 | then
26 | continue
27 | fi
28 | ./train_keras_simple.py $model_dir \
29 | $data_dir/$corpus \
30 | $data_dir/$corpus \
31 | non_word_marked_chars \
32 | --target-name multiclass_correction_target \
33 | --model-dest $model_dest \
34 | --n-embeddings 255 \
35 | --model-cfg n_embed_dims=$n_embed_dims n_filters=$n_filters filter_width=$filter_width n_fully_connected=$n_fully_connected n_residual_blocks=$n_residual_blocks n_hidden=$n_hidden patience=10 \
36 | --class-weight-exponent 3 \
37 | --verbose \
38 | --n-epochs 3 \
39 | --no-save
40 | done
41 | #--log
42 | #| parallel --gnu -j 2
43 |
--------------------------------------------------------------------------------
/bin/run-spelling-lstm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 |
3 | ./train_keras.py models/keras/spelling/lstm \
4 | data/spelling/birbeck-train.h5 \
5 | data/spelling/birbeck-valid.h5 \
6 | word \
7 | --target-name is_real_word \
8 | --n-embeddings 56 \
9 | --model-cfg n_units=20 n_embed_dims=25 patience=1000 train_embeddings=true embedding_init=uniform optimizer=Adam \
10 | --shuffle \
11 | --log \
12 | --confusion-matrix \
13 | --classification-report \
14 | --class-weight-auto \
15 | --class-weight-exponent 5 \
16 | --n-epochs 350
17 |
--------------------------------------------------------------------------------
/contrasting_cases.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 | import numpy as np
6 | np.random.seed(1337) # for reproducibility
7 |
8 | import sys
9 | import argparse
10 | import h5py
11 |
12 | from keras.datasets import mnist
13 | from keras.models import Sequential
14 | from keras.layers.core import Dense, Dropout, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.optimizers import SGD, Adadelta
17 | from keras.utils import np_utils
18 |
19 | from outliers import PMeansMultivariateNormal
20 |
21 | def create_dataset(n, train_size, valid_size):
22 | means = np.arange(100)
23 | cov = [range(1, 101)] * 100
24 | mvn = PMeansMultivariateNormal(means, cov, (n,))
25 | X = mvn.generate()
26 |
27 | assert n % 2 == 0
28 | assert n > train_size + valid_size
29 |
30 | # Make the data different along one dimension.
31 | even = np.arange(0, n, step=2)
32 | X[even, 0] = np.random.uniform(-.25, 1.75, size=n/2)
33 |     # Draw each odd-numbered row from a mirrored range so it contrasts with its preceding row.
34 | X[even+1, 0] = np.random.uniform(-1.75, .25, size=n/2)
35 |
36 | X += np.random.uniform(0.01, size=X.shape)
37 | X = X.astype(np.float32)
38 |
39 | y = np.array([[0,1] * (n/2)]).reshape((n,1))
40 | y = y.astype(np.int32)
41 |
42 | X_train = X[0:train_size, :]
43 | X_valid = X[train_size:train_size+valid_size, :]
44 | X_test = X[train_size+valid_size:, :]
45 |
46 | y_train = y[0:train_size]
47 | y_valid = y[train_size:train_size+valid_size]
48 | y_test = y[train_size+valid_size:]
49 |
50 | return X_train, X_valid, X_test, \
51 | y_train, y_valid, y_test
52 |
53 |
54 | def build_model(n_inputs, n_hidden, n_classes):
55 | model = Sequential()
56 | model.add(Dense(n_inputs, n_hidden))
57 | model.add(BatchNormalization((n_hidden,)))
58 | model.add(Activation('relu'))
59 | model.add(Dense(n_hidden, n_hidden))
60 | model.add(BatchNormalization((n_hidden,)))
61 | model.add(Activation('relu'))
62 | model.add(Dense(n_hidden, n_hidden))
63 | model.add(BatchNormalization((n_hidden,)))
64 | model.add(Activation('relu'))
65 | model.add(Dense(n_hidden, n_hidden))
66 | model.add(BatchNormalization((n_hidden,)))
67 | model.add(Activation('relu'))
68 | model.add(Dense(n_hidden, n_classes))
69 | model.add(Activation('softmax'))
70 |
71 | optimizer = Adadelta()
72 | model.compile(loss='categorical_crossentropy', optimizer=optimizer)
73 |
74 | return model
75 |
76 | def get_parser():
77 | parser = argparse.ArgumentParser(
78 | description='train a model to demonstrate contrasting cases')
79 | parser.add_argument(
80 | '--shuffle', action='store_true',
81 | help='shuffle the training examples after each epoch (i.e. do not use contrasting cases)')
82 | parser.add_argument(
83 | '--n', type=int, default=10000,
84 | help='the size of the data set to create')
85 | parser.add_argument(
86 | '--train-size', type=int, default=7000,
87 | help='the number of examples from the data set to allocate to training')
88 | parser.add_argument(
89 | '--valid-size', type=int, default=1500,
90 | help='the number of examples from the data set to allocate to validation')
91 | parser.add_argument(
92 | '--batch-size', type=int, default=10,
93 | help='mini-batch size')
94 | parser.add_argument(
95 | '--n-epochs', type=int, default=20,
96 | help='number of epochs to train')
97 | parser.add_argument(
98 | '--verbose', action='store_true',
99 | help='print progress')
100 |
101 | return parser.parse_args()
102 |
103 | def main(args):
104 | x_train, x_valid, x_test, \
105 | y_train, y_valid, y_test = create_dataset(
106 | args.n, args.train_size, args.valid_size)
107 |
108 | y_train = y_train.reshape((y_train.shape[0], 1))
109 | y_valid = y_valid.reshape((y_valid.shape[0], 1))
110 | y_test = y_test.reshape((y_test.shape[0], 1))
111 |
112 | n_classes = len(np.unique(y_train))
113 |
114 | '''
115 | print('y_train', y_train.shape)
116 | print('y_valid', y_valid.shape)
117 | print('y_test', y_test.shape)
118 | print('n_classes', n_classes, np.unique(y_train))
119 | '''
120 |
121 | # convert class vectors to binary class matrices
122 | y_train = np_utils.to_categorical(
123 | y_train, n_classes).astype(np.int32)
124 | y_valid = np_utils.to_categorical(
125 | y_valid, n_classes).astype(np.int32)
126 | y_test = np_utils.to_categorical(
127 | y_test, n_classes).astype(np.int32)
128 |
129 | if args.shuffle:
130 | print('Training (shuffled)')
131 | # Leave odd-numbered rows where they are; shuffle only
132 | # even-numbered ones. This ensures that each minibatch has one
133 | # example from each class.
134 | perm = np.arange(x_train.shape[0])
135 | evens = np.arange(0, x_train.shape[0], 2)
136 | perm[evens] = np.random.permutation(evens)
137 | else:
138 | print('Training (contrasting cases)')
139 |
140 | model = build_model(100, 20, n_classes)
141 |
142 | print('x_train', x_train.dtype)
143 | print('y_train', y_train.dtype)
144 |
145 | model.fit(x_train, y_train,
146 | batch_size=args.batch_size,
147 | shuffle=False,
148 | nb_epoch=args.n_epochs,
149 | show_accuracy=True,
150 | verbose=2 if args.verbose else 0,
151 | validation_data=(x_valid, y_valid))
152 |
153 | score = model.evaluate(x_test, y_test,
154 | show_accuracy=True,
155 | verbose=1 if args.verbose else 0)
156 |
157 | if args.shuffle:
158 | print('Test accuracy (shuffled)', score[1])
159 | else:
160 | print('Test accuracy (contrasting cases)', score[1])
161 |
162 | if __name__ == '__main__':
163 | sys.exit(main(get_parser()))
164 |
--------------------------------------------------------------------------------
/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import (split_data, mask_zero_for_rnn, balance_datasets)
2 |
--------------------------------------------------------------------------------
/modeling/autograd_examples.py:
--------------------------------------------------------------------------------
1 | import autograd.numpy as np
2 | from autograd import grad
3 |
4 | def sigmoid(x):
5 | return 0.5*(np.tanh(x) + 1)
6 |
7 | def logistic_predictions(weights, inputs):
8 | # Outputs probability of a label being true according to logistic model.
9 | return sigmoid(np.dot(inputs, weights))
10 |
11 | def training_loss(weights):
12 | # Training loss is the negative log-likelihood of the training labels.
13 | preds = logistic_predictions(weights, inputs)
14 | label_probabilities = preds * targets + (1 - preds) * (1 - targets)
15 | return -np.sum(np.log(label_probabilities))
16 |
17 | # Build a toy dataset.
18 | inputs = np.array([[0.52, 1.12, 0.77],
19 | [0.88, -1.08, 0.15],
20 | [0.52, 0.06, -1.30],
21 | [0.74, -2.49, 1.39]])
22 | targets = np.array([True, True, False, True])
23 |
24 | # Define a function that returns gradients of training loss using autograd.
25 | training_gradient_fun = grad(training_loss)
26 |
27 | # Optimize weights using gradient descent.
28 | weights = np.array([0.0, 0.0, 0.0])
29 | print "Initial loss:", training_loss(weights)
30 | for i in xrange(100):
31 | weights -= training_gradient_fun(weights) * 0.01
32 | print "Trained loss:", training_loss(weights)
33 |
34 | def taylor_sine(x):
35 | ans = currterm = x
36 | i = 0
37 | while np.abs(currterm) > 0.001:
38 | currterm = -currterm * x**2 / ((2 * i + 3) * (2 * i + 2))
39 | ans = ans + currterm
40 | i += 1
41 | return ans
42 |
43 | grad_sine = grad(taylor_sine)
44 | print "Gradient of sin(pi) is", grad_sine(np.pi)
45 |
--------------------------------------------------------------------------------
/modeling/builders.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from keras.layers.core import Dense
4 | from keras.layers.embeddings import Embedding
5 | from keras.layers.convolutional import (Convolution1D, MaxPooling1D)
6 | from keras.optimizers import (SGD, Adam, Adadelta, Adagrad, RMSprop)
7 | from keras.constraints import maxnorm
8 | from keras.regularizers import l2
9 |
10 | from modeling.layers import ImmutableEmbedding, HierarchicalSoftmax
11 |
12 | def build_embedding_layer(config, input_width=None):
13 | try:
14 | n_embeddings = config.n_vocab
15 | except AttributeError:
16 | n_embeddings = config.n_embeddings
17 |
18 | try:
19 | input_width = config.input_width
20 | except AttributeError:
21 |         pass  # fall back to the input_width argument
22 |
23 | try:
24 | mask_zero = config.mask_zero
25 | except AttributeError:
26 | mask_zero = False
27 |
28 | if hasattr(config, 'embedding_weights') and config.embedding_weights is not None:
29 | W = np.load(config.embedding_weights)
30 | if config.train_embeddings is True or config.train_embeddings == 'true':
31 | return Embedding(n_embeddings, config.n_embed_dims,
32 | weights=[W], input_length=input_width,
33 | W_constraint=maxnorm(config.embedding_max_norm),
34 | mask_zero=mask_zero)
35 | else:
36 | return ImmutableEmbedding(n_embeddings, config.n_embed_dims,
37 | weights=[W], mask_zero=mask_zero,
38 | input_length=input_width)
39 | else:
40 | if config.train_embeddings is True:
41 | return Embedding(n_embeddings, config.n_embed_dims,
42 | init=config.embedding_init,
43 | W_constraint=maxnorm(config.embedding_max_norm),
44 | mask_zero=mask_zero,
45 | input_length=input_width)
46 | else:
47 | return ImmutableEmbedding(n_embeddings, config.n_embed_dims,
48 | init=config.embedding_init,
49 | mask_zero=mask_zero,
50 | input_length=input_width)
51 |
52 | def build_convolutional_layer(config):
53 | return Convolution1D(config.n_filters, config.filter_width,
54 | W_constraint=maxnorm(config.filter_max_norm),
55 | border_mode=config.border_mode,
56 | W_regularizer=l2(config.l2_penalty))
57 |
58 | def build_pooling_layer(config, input_width=None, filter_width=None):
59 | try:
60 | input_width = config.input_width
61 | except AttributeError:
62 | assert input_width is not None
63 |
64 | try:
65 | filter_width = config.filter_width
66 | except AttributeError:
67 | assert filter_width is not None
68 |
69 | return MaxPooling1D(
70 | pool_length=input_width - filter_width + 1,
71 | stride=1)
72 |
73 | def build_dense_layer(config, n_hidden=None, activation='linear'):
74 | if n_hidden is None:
75 | n_hidden = config.n_hidden
76 | return Dense(n_hidden,
77 | W_regularizer=l2(config.l2_penalty),
78 | W_constraint=maxnorm(config.dense_max_norm),
79 | activation=activation)
80 |
81 | def build_hierarchical_softmax_layer(config):
82 | # This n_classes is different from the number of unique target values in
83 | # the training set. Hierarchical softmax assigns each word to a class
84 | # and decomposes the softmax into a prediction that's conditioned on
85 | # class membership.
86 | return HierarchicalSoftmax(config.n_classes, config.n_hsm_classes,
87 | batch_size=config.batch_size)
88 |
89 | def load_weights(config, model):
90 | if hasattr(config, 'model_weights') and config.model_weights is not None:
91 | print('Loading weights from %s' % config.model_weights)
92 | model.load_weights(config.model_weights)
93 |
94 | def build_optimizer(config):
95 | if config.optimizer == 'SGD':
96 | optimizer = SGD(lr=config.learning_rate,
97 | decay=config.decay, momentum=config.momentum,
98 | clipnorm=config.clipnorm)
99 | elif config.optimizer == 'Adam':
100 | optimizer = Adam(clipnorm=config.clipnorm)
101 | elif config.optimizer == 'RMSprop':
102 | optimizer = RMSprop(clipnorm=config.clipnorm)
103 | elif config.optimizer == 'Adadelta':
104 | optimizer = Adadelta(clipnorm=config.clipnorm)
105 | elif config.optimizer == 'Adagrad':
106 | optimizer = Adagrad(clipnorm=config.clipnorm)
107 | else:
108 | raise ValueError("don't know how to use optimizer {0}".format(config.optimizer))
109 |
110 | return optimizer
111 |
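112 | if __name__ == '__main__':
113 |     # Minimal, hypothetical sketch of how these builders compose from a single
114 |     # config object. The attribute values below are made up for illustration;
115 |     # real configs come from train_keras.py and the model.json files under
116 |     # models/, so treat this as the shape of the API, not a recipe.
117 |     from argparse import Namespace
118 |     config = Namespace(
119 |         n_embeddings=61, n_embed_dims=10, embedding_init='uniform',
120 |         train_embeddings=True, embedding_max_norm=1000,
121 |         n_filters=100, filter_width=6, filter_max_norm=1000,
122 |         border_mode='valid', l2_penalty=0.0,
123 |         n_hidden=100, dense_max_norm=1000,
124 |         optimizer='SGD', learning_rate=0.1, momentum=0.9, decay=0.0,
125 |         clipnorm=0.0)
126 |     embedding = build_embedding_layer(config, input_width=25)
127 |     conv = build_convolutional_layer(config)
128 |     pool = build_pooling_layer(config, input_width=25)
129 |     dense = build_dense_layer(config, activation='relu')
130 |     optimizer = build_optimizer(config)
131 |     print([type(x).__name__ for x in (embedding, conv, pool, dense, optimizer)])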
--------------------------------------------------------------------------------
/modeling/callbacks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import keras
4 | from keras.callbacks import Callback, EarlyStopping
5 | import keras.callbacks
6 | import numpy as np
7 | import six
8 | from sklearn.metrics import (classification_report,
9 | confusion_matrix, f1_score, fbeta_score)
10 |
11 | def predict(model, x, marshaller, batch_size=128):
12 | if isinstance(model, keras.models.Graph):
13 | if marshaller is None:
14 | raise ValueError("a marshaller is required with Graphs")
15 | x = marshaller.marshal(x)
16 | output = model.predict(x, batch_size=batch_size)
17 | y_hat = marshaller.unmarshal(output)
18 | y_hat = np.argmax(y_hat, axis=1)
19 | else:
20 | y_hat = model.predict_classes(x, verbose=0, batch_size=batch_size)
21 | return y_hat
22 |
23 | class PredictionCallback(Callback):
24 | def __init__(self, x, logger, marshaller=None, iteration_freq=10, batch_size=128):
25 | self.__dict__.update(locals())
26 | self.callbacks = []
27 |
28 | def add(self, callback):
29 | self.callbacks.append(callback)
30 |
31 | def _set_model(self, model):
32 | self.model = model
33 | for cb in self.callbacks:
34 | cb._set_model(model)
35 |
36 | def on_batch_begin(self, batch, logs={}):
37 | pass
38 |
39 | def on_batch_end(self, batch, logs={}):
40 | pass
41 |
42 | def on_epoch_begin(self, epoch, logs={}):
43 | pass
44 |
45 | def on_epoch_end(self, epoch, logs={}):
46 | if 'iteration' in logs.keys() and logs['iteration'] % self.iteration_freq != 0:
47 | # If we've broken a large training set into smaller chunks, we don't
48 | # need to run the classification report after every chunk.
49 | return
50 |
51 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size)
52 | logs['y_hat'] = y_hat
53 | for cb in self.callbacks:
54 | cb.on_epoch_end(epoch, logs)
55 |
56 | def on_train_begin(self, logs={}):
57 | pass
58 |
59 | def on_train_end(self, logs={}):
60 | pass
61 |
62 | class DelegatingMetricCallback(Callback):
63 | def __init__(self, x, y, logger, metric_name, delegate, marshaller=None, batch_size=128):
64 | self.__dict__.update(locals())
65 | del self.self
66 |
67 | def _set_model(self, model):
68 | self.model = model
69 | self.delegate._set_model(model)
70 |
71 | def on_epoch_end(self, epoch, logs={}):
72 | try:
73 | y_hat = logs['y_hat']
74 | except KeyError:
75 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size)
76 | metric = self.build_metric(logs)
77 | logs[self.metric_name] = metric(self.y, y_hat)
78 | self.logger('%s %.03f' % (self.metric_name, logs[self.metric_name]))
79 | self.delegate.on_epoch_end(epoch, logs)
80 |
81 | def build_metric(self, logs):
82 | return {
83 | 'val_loss': lambda y,y_hat: logs['val_loss'],
84 | 'val_acc': lambda y,y_hat: logs['val_acc'],
85 | 'val_f1': f1_score,
86 |             'val_f0.5': lambda y,y_hat: fbeta_score(y, y_hat, beta=0.5),
87 | 'val_f2': lambda y,y_hat: fbeta_score(y, y_hat, beta=2)
88 | }[self.metric_name]
89 |
90 | class ConfusionMatrix(Callback):
91 | def __init__(self, x, y, logger, marshaller=None, batch_size=128):
92 | self.__dict__.update(locals())
93 | del self.self
94 |
95 | def on_epoch_end(self, epoch, logs={}):
96 | try:
97 | y_hat = logs['y_hat']
98 | except KeyError:
99 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size)
100 | self.logger('\nConfusion matrix')
101 | self.logger(confusion_matrix(self.y, y_hat))
102 |
103 | class ClassificationReport(Callback):
104 | def __init__(self, x, y, logger, target_names=None, marshaller=None, batch_size=128):
105 | self.__dict__.update(locals())
106 | del self.self
107 |
108 | self.labels = np.arange(max(y)+1)
109 |
110 | if target_names is None:
111 | self.target_names = [str(t) for t in self.labels]
112 | else:
113 | self.target_names = [str(tn) for tn in target_names]
114 |
115 | def on_epoch_end(self, epoch, logs={}):
116 | try:
117 | y_hat = logs['y_hat']
118 | except KeyError:
119 | y_hat = predict(self.model, self.x, self.marshaller, batch_size=self.batch_size)
120 |
121 | self.logger('\nClassification report')
122 | self.logger(classification_report(
123 | self.y, y_hat,
124 | labels=self.labels, target_names=self.target_names))
125 |
126 | class OptimizerMonitor(Callback):
127 | def __init__(self, logger):
128 | self.logger = logger
129 |
130 | def on_epoch_end(self, epoch, logs={}):
131 | if not hasattr(self.model.optimizer, 'lr'):
132 | return
133 |
134 | lr = self.model.optimizer.lr.get_value()
135 | optimizer_state = str({ 'lr': lr })
136 |
137 | if 'iteration' in logs.keys():
138 | self.logger("epoch {epoch} iteration {iteration} - optimizer state {optimizer_state}".format(
139 | epoch=epoch, iteration=logs['iteration'], optimizer_state=optimizer_state))
140 | else:
141 | self.logger("epoch {epoch} - optimizer state {optimizer_state}".format(
142 | epoch=epoch, optimizer_state=optimizer_state))
143 |
144 | class VersionedModelCheckpoint(Callback):
145 | def __init__(self, filepath, max_epochs=10000, **kwargs):
146 | kwargs['save_best_only'] = False
147 | self.delegate = keras.callbacks.ModelCheckpoint(filepath, **kwargs)
148 | self.filepath = filepath
149 | self.basepath, self.ext = os.path.splitext(filepath)
150 | self.epoch = 0
151 | width = int(np.log10(max_epochs)) + 1
152 | self.fmt_string = '{basepath}-{epoch:0' + str(width) + 'd}{ext}'
153 |
154 | def on_epoch_end(self, epoch, logs={}):
155 |         logs['val_loss'] = -self.epoch  # strictly decreasing, so the delegate checkpoints every epoch
156 | self.delegate.on_epoch_end(epoch, logs)
157 |
158 | if os.path.exists(self.filepath):
159 | newpath = self.fmt_string.format(
160 | basepath=self.basepath, epoch=self.epoch, ext=self.ext)
161 | os.rename(self.filepath, newpath)
162 | self.epoch += 1
163 |
164 | def _set_model(self, model):
165 | self.model = model
166 | self.delegate._set_model(model)
167 |
168 | class SingleStepLearningRateSchedule(keras.callbacks.Callback):
169 | def __init__(self, patience=5, learning_rate_divisor=10.):
170 | self.patience = patience
171 | self.learning_rate_divisor = learning_rate_divisor
172 | self.best_loss = np.inf
173 | self.best_epoch = 0
174 | self.updated_lr = False
175 |
176 | def on_epoch_end(self, epoch, logs={}):
177 | if self.updated_lr:
178 | return
179 |
180 | if logs['val_loss'] < self.best_loss:
181 | self.best_loss = logs['val_loss']
182 | self.best_epoch = epoch
183 |
184 | if epoch - self.best_epoch > self.patience:
185 | old_lr = self.model.optimizer.lr.get_value()
186 | new_lr = (old_lr / self.learning_rate_divisor).astype(np.float32)
187 | print('old_lr', old_lr, 'new_lr', new_lr)
188 | self.model.optimizer.lr.set_value(new_lr)
189 | self.learning_rate_divisor = 1.
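190 | 
191 | if __name__ == '__main__':
192 |     # Illustrative sketch (not part of the training code) of how these callbacks
193 |     # compose: PredictionCallback computes y_hat once per epoch and hands it to
194 |     # the reporting callbacks added to it. The validation arrays here are
195 |     # placeholder zeros; in practice they come from the validation HDF5 file.
196 |     x_val = np.zeros((4, 10), dtype='int32')
197 |     y_val = np.array([0, 1, 0, 1])
198 |     logger = six.print_
199 |     pc = PredictionCallback(x_val, logger, batch_size=2)
200 |     pc.add(ConfusionMatrix(x_val, y_val, logger, batch_size=2))
201 |     pc.add(ClassificationReport(x_val, y_val, logger, batch_size=2))
202 |     # `pc` would then go in the callbacks list passed to model.fit(...).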
190 |
--------------------------------------------------------------------------------
/modeling/chainer_model.py:
--------------------------------------------------------------------------------
1 | import chainer
2 | import chainer.functions as F
3 | from chainer import optimizers
4 | import numpy as np
5 | class Model(object):
6 | def __init__(self, args):
7 | for k,v in vars(args).iteritems():
8 | self.__dict__[k] = v
9 | self.init_params()
10 | self.init_optimizer()
11 | self.optimizer.setup(self.params)
12 |
13 | def init_optimizer(self):
14 | if self.optimizer == 'SGD':
15 | self.optimizer = optimizers.MomentumSGD(
16 | lr=self.learning_rate, momentum=self.momentum)
17 | elif self.optimizer == 'AdaDelta':
18 | self.optimizer = optimizers.AdaDelta()
19 | elif self.optimizer == 'AdaGrad':
20 | self.optimizer = optimizers.AdaGrad()
21 | elif self.optimizer == 'Adam':
22 | self.optimizer = optimizers.Adam()
23 | elif self.optimizer == 'RMSprop':
24 | self.optimizer = optimizers.RMSprop()
25 |
26 | def update(self):
27 | if hasattr(self, 'weight_decay'):
28 | if self.weight_decay > 0:
29 | self.optimizer.weight_decay(self.weight_decay)
30 | self.optimizer.update()
31 |
32 | def iteration(self, data, target, train=False):
33 | if train:
34 | self.optimizer.zero_grads()
35 | pred = self.forward(data)
36 | loss, metric = self.loss(pred, target)
37 | if train:
38 | loss.backward()
39 | self.update()
40 | return pred, loss, metric
41 |
42 | def fit(self, data, target):
43 | pred, loss, metric = self.iteration(data, target, train=True)
44 | return pred, loss, metric
45 |
46 | def evaluate(self, data, target):
47 | pred, loss, metric = self.iteration(data, target)
48 | return pred, loss, metric
49 |
50 | def init_params(self):
51 | raise NotImplementedError()
52 |
53 | def forward(self):
54 | raise NotImplementedError()
55 |
56 | def loss(self, pred, target):
57 | raise NotImplementedError()
58 |
59 | def predict(self, data, target=None):
60 | raise NotImplementedError()
61 |
62 | def predict_proba(self, data):
63 | raise NotImplementedError()
64 |
65 | def to_gpu(self):
66 | self.params.to_gpu()
67 |
68 | def to_cpu(self):
69 | self.params.to_cpu()
70 |
71 | class Classifier(Model):
72 | def loss(self, pred, target):
73 | target = chainer.Variable(target)
74 | loss = F.softmax_cross_entropy(pred, target)
75 | metric = F.accuracy(pred, target)
76 | return loss, metric
77 |
78 | def predict(self, data, target=None):
79 | pred = self.forward(data, train=False)
80 | if target is None:
81 | return np.argmax(F.softmax(pred).data, axis=1)
82 | else:
83 | loss, metric = self.loss(pred, target)
84 | return pred, loss, metric
85 |
86 | def predict_proba(self, data):
87 | pred = self.forward(data, train=False)
88 | return F.softmax(pred).data
89 |
90 | class Regressor(Model):
91 | def loss(self, pred, target):
92 | target = chainer.Variable(target)
93 | loss = F.mean_squared_error(pred, target)
94 | return loss, loss
95 |
96 | def predict(self, data, target=None):
97 | pred = self.forward(data, train=False)
98 | if target is None:
99 | return pred
100 | else:
101 | loss, metric = self.loss(pred, target)
102 | return pred, loss, metric
103 |
--------------------------------------------------------------------------------
/modeling/data.py:
--------------------------------------------------------------------------------
1 | import os
2 | import h5py
3 | import numpy as np
4 |
5 | class GraphMarshaller(object):
6 | """
7 | Interface for classes that handle preparing inputs and unpacking
8 | outputs of Keras Graph models.
9 | """
10 | def marshal(self, data, target=None):
11 | raise NotImplementedError()
12 |
13 |     def unmarshal(self, output):
14 | raise NotImplementedError()
15 |
16 | def split_data(hdf5_path, split_size, output_dir=None):
17 | """
18 | Split the datasets in an HDF5 file into smaller sets and save them
19 | to new files. By default the files are put into a subdirectory of
20 | the directory containing `hdf5_path`. The subdirectory is created
21 | if it does not exist; the name of the directory is `hdf5_path` with
22 | the file suffix removed. To write to a different directory, provide
23 | the path to the existing directory in `output_dir`.
24 |
25 | Parameters
26 | -------
27 | hdf5_path : str
28 | The path to the HDF5 file.
29 | split_size : int
30 |         The number of examples to write to each smaller file.
31 | """
32 | f = h5py.File(hdf5_path)
33 | n = 0
34 | # Find the largest n.
35 | for k,v in f.iteritems():
36 | n = max(n, v.value.shape[0])
37 |
38 | if output_dir is None:
39 | output_dir = os.path.splitext(hdf5_path)[0]
40 | os.mkdir(output_dir)
41 |
42 | # Copy subsequences of the data to smaller files.
43 | width = int(np.ceil(np.log10(n / split_size)))
44 | for i,j in enumerate(range(0, n, split_size)):
45 | outfile = '{dir}/{num:{fill}{width}}.h5'.format(
46 | dir=output_dir, num=i, fill='0', width=width)
47 | print(outfile)
48 | fout = h5py.File(outfile, 'w')
49 | for k,v in f.iteritems():
50 | subset = v[j:j+split_size]
51 | fout.create_dataset(k, data=subset, dtype=v.dtype)
52 | fout.close()
53 |
54 | def balance_classes(target):
55 | """
56 | Get a subset of the indices in the target variable of an imbalanced dataset
57 | such that each class has the same number of occurrences. This is to be used
58 | in conjunction with `balance_datasets` to create a balanced dataset.
59 |
60 | Parameters
61 | ---------
62 | target : array-like of int
63 | The target variable from which to sample.
64 | """
65 | n = min(np.bincount(target))
66 | n_even = n/2
67 | indices = []
68 |
69 | for code in np.arange(max(target)+1):
70 | mask = target == code
71 | idx = np.sort(np.where(mask)[0])
72 | # Only sample from the even indices so the downsampled dataset
73 | # still consists of pairs of positive and negative examples.
74 | even_idx = idx[idx % 2 == 0]
75 | sampled_even_idx = np.sort(np.random.choice(even_idx, size=n_even, replace=False))
76 | # Add the odd-numbered examples of errors.
77 | sampled_idx = np.concatenate([sampled_even_idx, sampled_even_idx+1])
78 | sampled_idx = np.sort(sampled_idx)
79 | indices.extend(sampled_idx)
80 |
81 | return np.sort(indices)
82 |
83 | def balance_datasets(hdf5_file, key='original_word_code'):
84 | """
85 | Balance the datasets in an HDF5 file. A balanced sample of
86 | the dataset denoted by `key` is taken. The corresponding
87 | examples from all other datasets are sampled, too.
88 |
89 | Parameters
90 | -----------
91 | hdf5_file : h5py.File
92 | An open HDF5 file.
93 | key : str
94 | The key of the target variable in `hdf5_file` to balance.
95 | """
96 | idx = balance_classes(hdf5_file[key].value)
97 | for key in hdf5_file.keys():
98 | value = hdf5_file[key].value
99 | del hdf5_file[key]
100 | hdf5_file.create_dataset(key, data=value[idx], dtype=value.dtype)
101 |
102 | def mask_zero_for_rnn(hdf5_fh, n_vocab):
103 | """
104 | Given an HDF5 data set with inputs `X` (the entire sentence),
105 | `Xwindow` (the window of words around e.g. a preposition), and
106 | `XwindowNULL` (the window of words as in `Xwindow` with the center
107 | word replaced by a nonce), transform the inputs as follows:
108 |
109 | a) Change 0 in every position before the end of the sentence to
110 | vocab_size + 1.
111 | b) Change 0 in every position after the beginning of the sentence
112 | to vocab_size + 1.
113 |
114 | Unmodified, the inputs `X`, etc., use 0 to indicate both that the
115 | word is unknown and that the sentence has ended (i.e. for padding
116 | a variable-length input like a sentence to fill all of the columns
117 |     of a matrix). The reasons to change this are that (1) some models,
118 |     like recurrent neural networks, pay attention to every detail of
119 |     their input and (2) some frameworks, like Keras, allow you to mask
120 |     out 0's, so the model gets less confused.
121 |
122 | The `len` key has the offset at which the sentence ends in `X`.
123 |
124 | The `window_position` key in the data set has the offset at which
125 | the preposition occurs in `X`.
126 |
127 | Parameters
128 | ------------
129 | hdf5_fh :
130 | A open, writable HDF5 file.
131 | n_vocab : int
132 | The number of words in the model's vocabulary.
133 | """
134 | XRNN = renumber_unknowns_in_sentence(
135 | hdf5_fh['X'].value,
136 | hdf5_fh['len'].value,
137 | n_vocab)
138 | hdf5_fh.create_dataset('XRNN', data=XRNN, dtype=XRNN.dtype)
139 |
140 | XwindowRNN = renumber_unknowns_in_window(
141 | hdf5_fh['Xwindow'].value,
142 | hdf5_fh['window_position'].value,
143 | n_vocab)
144 | hdf5_fh.create_dataset('XwindowRNN', data=XwindowRNN, dtype=XwindowRNN.dtype)
145 |
146 | XwindowNULLRNN = renumber_unknowns_in_window(
147 | hdf5_fh['XwindowNULL'].value,
148 | hdf5_fh['window_position'].value,
149 | n_vocab)
150 | hdf5_fh.create_dataset('XwindowNULLRNN', data=XwindowNULLRNN, dtype=XwindowNULLRNN.dtype)
151 |
152 | return hdf5_fh
153 |
154 | def renumber_unknowns_in_sentence(X, lengths, n_vocab):
155 | """
156 | So, to transform `X` as described in item (a) above,
157 |
158 | * Find every occurrence of a 0 before the end of a sentence,
159 | using `len` to determine where the sentence ends.
160 | * Replace those occurences with `n_vocab`.
161 | """
162 |
163 | X = X.copy()
164 | for i,length in enumerate(lengths):
165 | sent = X[i]
166 | zeros_in_sent = [False] * X.shape[1]
167 |         # Add 2 for the leading and trailing sentence-boundary markers.
168 | zeros_in_sent[:length+2] = sent[:length+2] == 0
169 | if np.any(zeros_in_sent):
170 | X[i, zeros_in_sent] = n_vocab
171 | return X
172 |
173 | def renumber_unknowns_in_window(Xwindow, window_positions, n_vocab):
174 | """
175 | And to transform `Xwindow` and `XwindowNULL` for item (b),
176 |
177 | * Find every occurrence of a 0 after the beginning of a sentence
178 | using `window_position` to determine where in the window the
179 | sentence begins. If `window_position` is 0, the first two
180 | positions in the window will be 0, because the preposition in
181 | that case is the first word in the sentence and it appears at
182 | the center of the window (index 2, with windows of length 5).
183 | Those first two words must remain 0, as they indicate the
184 | absence of words. If `window_position` is 1, only the first
185 | word must remain 0; the word in the second position of the
186 | window could be 0 because it is out of vocabulary. And if
187 | `window_position` is 2, then the first two words, if 0, are
188 | 0 because they're out of vocabulary. Thus, the indices in the
189 | window that should be checked for the "zero because out of
190 | vocabulary" case start at max(0, 2-`window_position`). (NB:
191 | I didn't find any occurrences of `window_position` > `len`,
192 | just some occurrences of `window_position` == `len` - 2,
193 | which with sentence-terminating punctuation and the
194 | padding character at the end of each sentence just means
195 | that there are several sentences that end with a preposition.
196 | So we only need to deal with the beginning of the window.)
197 | * Replace those occurrences with `n_vocab`.
198 | """
199 | Xwindow = Xwindow.copy()
200 | for i,window_position in enumerate(window_positions):
201 | window = Xwindow[i]
202 | start = max(0, 2 - window_position)
203 | zeros_in_window = window == 0
204 | zeros_in_window[0:start] = False
205 | if np.any(zeros_in_window):
206 | Xwindow[i, zeros_in_window] = n_vocab
207 | return Xwindow
208 |
209 | def create_window(sentence, position, size=7, nonce=None):
210 | """
211 | Create a fixed-width window onto a sentence centered at some position.
212 |     The sentence is assumed not to contain sentence-initial and -terminating
213 |     markup (i.e. no sentence-start marker immediately before the start of the
214 |     sentence and no sentence-end marker immediately after its end). (If they
215 |     were included in `sentence`, we would exclude them for backward compatibility
216 |     with other preprocessing code.) It is also assumed not to be padded with trailing zeros.
217 |
218 | Parameters
219 | ---------
220 | sentence : np.ndarray
221 | An array of integers that represents a sentence. The integers
222 | are indices in a model's vocabulary.
223 | position : int
224 | The 0-based index of the word in the sentence on which the window
225 | should be centered.
226 | size : int
227 | The size of the window. Must be odd.
228 | nonce : int or None
229 | The index in the vocabulary of the nonce word to put at the
230 | center of the window, replacing the index of the existing word.
231 | When None, this does not occur.
232 | """
233 | if position < 0 or position >= len(sentence):
234 | raise ValueError("`position` (%d) must lie within sentence (len=%d)" %
235 | (position, len(sentence)))
236 |
237 | # Get exactly the positions in `sentence` to copy to `window`.
238 | window_start = position - size/2
239 | window_end = position + size/2
240 | sent_range = np.arange(window_start, window_end+1)
241 | sent_mask = (sent_range >= 0) & (sent_range < len(sentence))
242 | sent_indices = sent_range[sent_mask]
243 |
244 | window_range = np.arange(0, size)
245 | window_indices = window_range[sent_mask]
246 |
247 | #print('window_start', window_start, 'window_end', window_end, 'sent_range', sent_range, 'sent_mask', sent_mask, 'sent_indices', sent_indices, 'window_range', window_range, 'window_indices', window_indices, 'sentence', sentence, 'position', position)
248 |
249 | window = np.zeros(size)
250 | window[window_indices] = sentence[sent_indices]
251 |
252 | if nonce is not None:
253 | window[size/2] = nonce
254 |
255 | return window
256 |
257 | def create_windows(sentences, lengths, positions, size, nonce=None):
258 | windows = np.zeros((len(sentences), size))
259 | for i, sentence in enumerate(sentences):
260 | length = lengths[i]
261 | position = positions[i]
262 | sentence_without_zero_padding = sentence[0:length+2]
263 | sentence_without_markup = sentence_without_zero_padding[1:-1]
264 | windows[i] = create_window(
265 | sentence_without_markup,
266 | position=position,
267 | size=size,
268 | nonce=nonce)
269 | return windows
270 |
271 | def add_window_dataset(hdf5_file, name, size, nonce=None, sentences_name='X'):
272 | sentences = hdf5_file[sentences_name].value
273 | lengths = hdf5_file['len'].value
274 | positions = hdf5_file['window_position'].value
275 |
276 | windows = create_windows(sentences, lengths, positions, size, nonce)
277 | hdf5_file.create_dataset(name, data=windows, dtype=np.int32)
278 |
279 | def create_contrasting_cases(X, seed=17, values=[7,8,10,12,13,17,18,19,27]):
280 | center_idx = int(X.shape[1]/2)
281 | rng = np.random.RandomState(seed)
282 | Xcc = np.zeros((X.shape[0]*2, X.shape[1]), dtype=X.dtype)
283 |
284 | for i in np.arange(len(X)):
285 |
286 | # Original example
287 | j = i * 2
288 | Xcc[j, :] = X[i, :]
289 |
290 | # Contrasting case
291 | cc = X[i, :].copy()
292 |
293 | while True:
294 | replacement_value = rng.choice(values)
295 | if replacement_value != cc[center_idx]:
296 | break
297 |
298 | cc[center_idx] = replacement_value
299 | Xcc[j+1, :] = cc
300 |
301 | return Xcc
302 |
303 | def duplicate_values(values):
304 | new_values = np.zeros(len(values)*2)
305 | for i,value in enumerate(values):
306 | j = i * 2
307 | new_values[j] = value
308 | new_values[j+1] = value
309 | return new_values
310 |
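311 | if __name__ == '__main__':
312 |     # Small, made-up demonstration of create_window (runs under Python 2 like
313 |     # the rest of this module). The word indices are arbitrary; 0 marks window
314 |     # positions that fall outside the sentence.
315 |     sentence = np.array([5, 9, 2, 7, 3, 11, 4])
316 |     print(create_window(sentence, position=1, size=5))
317 |     # -> [ 0.  5.  9.  2.  7.]
318 |     print(create_window(sentence, position=1, size=5, nonce=99))
319 |     # -> [  0.   5.  99.   2.   7.]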
--------------------------------------------------------------------------------
/modeling/dataset.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | from sklearn.utils import check_random_state
3 | import numpy as np
4 | from modeling.utils import balanced_class_weights
5 | from keras.utils import np_utils
6 |
7 | class HDF5FileDataset(object):
8 | def __init__(self, file_path, data_name, target_name, batch_size, one_hot=True, random_state=17):
9 | assert isinstance(data_name, (list,tuple))
10 | assert isinstance(target_name, (list,tuple))
11 |
12 | random_state = check_random_state(random_state)
13 |
14 | self.__dict__.update(locals())
15 | del self.self
16 |
17 | self._load_data()
18 | self._check_data()
19 |
20 | def _load_data(self):
21 | self.hdf5_file = h5py.File(self.file_path)
22 | self.n_classes = {}
23 | for target_name in self.target_name:
24 | self.n_classes[target_name] = np.max(self.hdf5_file[target_name])+1
25 |
26 | def _check_data(self):
27 | self.n = None
28 | for data_name in self.data_name:
29 | if self.n is None:
30 | self.n = len(self.hdf5_file[data_name])
31 | else:
32 | assert len(self.hdf5_file[data_name]) == self.n
33 | for target_name in self.target_name:
34 | assert len(self.hdf5_file[target_name]) == self.n
35 |
36 | def __getitem__(self, name):
37 | return self.hdf5_file[name].value
38 |
39 | def class_weights(self, class_weight_exponent, target):
40 | return balanced_class_weights(
41 | self.hdf5_file[target],
42 | 2,
43 | class_weight_exponent)
44 |
45 | def generator(self, one_hot=None, batch_size=None):
46 | if one_hot is None: one_hot = self.one_hot
47 | if batch_size is None: batch_size = self.batch_size
48 |
49 | while 1:
50 | idx = self.random_state.choice(self.n, size=batch_size, replace=False)
51 | batch = {}
52 | for data_name in self.data_name:
53 | batch[data_name] = self.hdf5_file[data_name].value[idx]
54 | for target_name in self.target_name:
55 | target = self.hdf5_file[target_name].value[idx]
56 | if one_hot:
57 | batch[target_name] = np_utils.to_categorical(target,
58 | self.n_classes[target_name])
59 | else:
60 | batch[target_name] = target
61 |
62 | yield batch
63 |
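64 | # Usage sketch (illustrative; the file name and the 'X'/'y' dataset names are
65 | # assumptions, not fixed by this class):
66 | #
67 | #   dataset = HDF5FileDataset('train.h5', ['X'], ['y'], batch_size=128)
68 | #   batch = next(dataset.generator())
69 | #   x, y_one_hot = batch['X'], batch['y']   # targets are one-hot by default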
--------------------------------------------------------------------------------
/modeling/difference.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import unittest
5 | import numpy as np
6 | from theano import function
7 | import theano.tensor as T
8 |
9 | from keras.layers.core import Layer
10 |
11 | class TemporalDifference(Layer):
12 | """
13 | Given a 3-tensor with shape (nb_samples, maxlen, output_dim), outputs
14 |     the difference X[:, 1:, :] - X[:, :-1, :], a 3-tensor of shape (nb_samples, maxlen - 1, output_dim).
15 |     """
16 | def _get_output(self, X):
17 | return X[:, 1:, :] - X[:, 0:X.shape[1]-1, :]
18 |
19 | def get_output(self, train):
20 | return self._get_output(self.get_input(train))
21 |
22 | def get_config(self):
23 | return {"name": self.__class__.__name__}
24 |
25 | class TestTemporalDifference(unittest.TestCase):
26 | def testForward(self):
27 | nb_examples = 2
28 | maxlen = 7
29 | output_dim = nb_word_dim = 5
30 | x = np.random.normal(size=(nb_examples, maxlen, output_dim)).astype(np.float32)
31 | expected = x[:, 1:, :] - x[:, 0:x.shape[1]-1, :]
32 | X = T.tensor3('X')
33 | retval = TemporalDifference()._get_output(X)
34 | f = function([X], retval)
35 | actual = f(x)
36 | self.assertTrue(np.allclose(actual, expected))
37 |
--------------------------------------------------------------------------------
/modeling/fbeta.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import theano.tensor as tt
3 | from theano import function
4 |
5 | eps = 1e-20
6 |
7 | def support(y):
8 | return y.sum(axis=0)
9 |
10 | def true_positive(y, y_hat):
11 | return (tt.eq(y_hat, y) & tt.eq(y, 1)).sum(axis=0)
12 |
13 | def make_y_diff(y, y_hat):
14 | return y_hat - y
15 |
16 | def false_positive(y_diff):
17 | return tt.eq(y_diff, 1).sum(axis=0)
18 |
19 | def true_negative(y_diff):
20 |     return tt.eq(y_diff, 0).sum(axis=0)  # y_hat == y here, so this counts TP as well as TN
21 |
22 | def false_negative(y_diff):
23 | return tt.eq(y_diff, -1).sum(axis=0)
24 |
25 | def precision(y, y_hat, eps=1e-9, y_diff=None):
26 | tp = true_positive(y, y_hat)
27 | if y_diff is None:
28 | y_diff = make_y_diff(y, y_hat)
29 | fp = false_positive(y_diff)
30 | return tp/(tp+fp+eps)
31 |
32 | def recall(y, y_hat, eps=1e-9, y_diff=None):
33 | tp = true_positive(y, y_hat)
34 | if y_diff is None:
35 | y_diff = make_y_diff(y, y_hat)
36 | fn = false_negative(y_diff)
37 | return tp/(tp+fn+eps)
38 |
39 | def fbeta_loss(y, y_hat, beta=0.5, eps=1e-9, average=None):
40 | """
41 | Returns the negative of the F_beta measure, because the
42 | optimizer is trying to minimize the objective.
43 | """
44 | y_diff = make_y_diff(y, y_hat)
45 | pr = precision(y, y_hat, eps=eps, y_diff=y_diff)
46 | rc = recall(y, y_hat, eps=eps, y_diff=y_diff)
47 |
48 | f_per_class = ( (1 + beta**2) * (pr * rc) ) / (beta**2 * pr + rc + eps)
49 |
50 | if average is None:
51 | f = f_per_class
52 | elif average == 'macro':
53 | f = f_per_class.mean()
54 | elif average == 'weighted':
55 | s = support(y)
56 | f = ((f_per_class * s) / s.sum()).sum()
57 |
58 | return -f
59 |
60 |
61 | y = tt.matrix('y', dtype='int64')
62 | y_hat = tt.matrix('y_hat', dtype='int64')
63 |
64 | floss = fbeta_loss(y, y_hat, average='weighted')
65 | f = function([y, y_hat], floss)
66 |
67 | loss = f(np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0]]),
68 | np.array([[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]]))
69 |
70 | print("loss", loss)
71 | print("grad", tt.grad(loss, floss))
72 |
73 | import numpy
74 | import theano
75 | import theano.tensor as T
76 | rng = numpy.random
77 |
78 | N = 400
79 | feats = 784
80 | D = (rng.randn(N, feats), rng.randint(size=N, low=0, high=2))
81 | training_steps = 10000
82 |
83 | ###########################################################################
84 | # Declare Theano symbolic variables
85 | ###########################################################################
86 |
87 | x = T.matrix("x")
88 | y = T.vector("y")
89 | w = theano.shared(rng.randn(feats), name="w")
90 | b = theano.shared(0., name="b")
91 |
92 | print("Initial model:")
93 | print(w.get_value())
94 | print(b.get_value())
95 |
96 | ###########################################################################
97 | # Construct Theano expression graph
98 | ###########################################################################
99 |
100 | # Probability that target = 1
101 | p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))
102 |
103 | # The prediction thresholded
104 | prediction = p_1 > 0.5
105 |
106 | # Cross-entropy loss function
107 | xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1)
108 |
109 | # The cost to minimize
110 | cost = xent.mean() + 0.01 * (w ** 2).sum()
111 |
112 | # Compute the gradient of the cost (we shall return to this in a following
113 | # section of this tutorial).
114 | gw, gb = T.grad(cost, [w, b])
115 |
116 | # Compile
117 | train = theano.function(
118 | inputs=[x,y],
119 | outputs=[prediction, xent],
120 | updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
121 | predict = theano.function(inputs=[x], outputs=prediction)
122 |
123 | # Train
124 | for i in range(training_steps):
125 | pred, err = train(D[0], D[1])
126 |
127 | print("Final model:")
128 | print(w.get_value())
129 | print(b.get_value())
130 | print("target values for D:")
131 | print(D[1])
132 | print("prediction on D:")
133 | print(predict(D[0]))
134 |
--------------------------------------------------------------------------------
/modeling/fbeta_predict.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import fbeta_score
3 |
4 | def make_default_targets(y, target_names):
5 | default_targets = []
6 | for target in target_names[y]:
7 | # at-on => on-on
8 | # from-about => about-about
9 | s,t = target.split('-')
10 | default = '-'.join([t, t])
11 | default_targets.append(
12 | np.where(target_names == default)[0][0])
13 | return default_targets
14 |
15 | def predict_for_fbeta(y_hat_proba, default_targets, threshold=0.5, threshold_type='margin'):
16 | n = y_hat_proba.shape[0]
17 | y_hat_for_fbeta = np.zeros(n, dtype=np.int)
18 |
19 | if threshold_type not in ['margin', 'value']:
20 | raise ValueError('threshold_type must be either "margin" or "value"')
21 |
22 | for i in np.arange(n):
23 |         most, next_most = np.argsort(y_hat_proba[i, :])[[-1, -2]]  # most and second-most probable classes
24 | if threshold_type == 'margin':
25 | if y_hat_proba[i, most] - y_hat_proba[i, next_most] > threshold:
26 | y_hat_for_fbeta[i] = most
27 | else:
28 | y_hat_for_fbeta[i] = default_targets[most]
29 | elif threshold_type == 'value':
30 | if y_hat_proba[i, most] > threshold:
31 | y_hat_for_fbeta[i] = most
32 | else:
33 | y_hat_for_fbeta[i] = default_targets[most]
34 |
35 | return y_hat_for_fbeta
36 |
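37 | # Usage sketch (illustrative; `target_names` is assumed to be an array of
38 | # confusion-pair labels such as 'at-on', indexed by class code):
39 | #
40 | #   defaults = make_default_targets(y, target_names)
41 | #   y_hat = predict_for_fbeta(y_hat_proba, defaults,
42 | #                             threshold=0.9, threshold_type='margin')
43 | #   fbeta_score(y, y_hat, beta=0.5, average='weighted')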
--------------------------------------------------------------------------------
/modeling/lasagne_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import lasagne
3 | import theano.tensor as T
4 | import theano
5 |
6 | class Model(object):
7 | def __init__(self, config):
8 | self.config = config
9 |
10 | self.input_var = self.build_input_var()
11 | self.target_var = self.build_target_var()
12 |
13 | self.model = self.build_model()
14 |
15 | self.train_output = lasagne.layers.get_output(self.model)
16 | self.train_loss = self.build_loss(self.train_output)
17 | self.params = lasagne.layers.get_all_params(self.model, trainable=True)
18 | self.updates = self.build_updates()
19 |
20 | self.test_output = lasagne.layers.get_output(self.model,
21 | deterministic=True)
22 | self.test_loss = self.build_loss(self.test_output)
23 | self.test_accuracy = T.eq(
24 | T.argmax(self.test_output, axis=1), self.target_var)
25 | self.test_accuracy = T.mean(
26 | self.test_accuracy, dtype=theano.config.floatX)
27 |
28 | self.train_fn = theano.function(
29 | [self.input_var, self.target_var],
30 | self.train_loss,
31 | updates=self.updates)
32 |
33 | self.val_fn = theano.function(
34 | [self.input_var, self.target_var],
35 | [self.test_loss, self.test_accuracy])
36 |
37 |         self.pred_fn = theano.function([self.input_var], self.test_output)
38 |
39 | def build_input_var(self):
40 | raise NotImplementedError()
41 |
42 | def build_target_var(self):
43 | raise NotImplementedError()
44 |
45 | def build_updates(self):
46 | raise NotImplementedError()
47 |
48 | def build_model(self):
49 | raise NotImplementedError()
50 |
51 | def fit(self, data, target):
52 | return self.train_fn(data, target)
53 |
54 | def evaluate(self, data, target):
55 | output = self.val_fn(data, target)
56 | return output[0], output[1]
57 |
58 | def predict(self, data):
59 | pred = self.pred_fn(data)
60 | return pred
61 |
62 | def save_weights(self, path):
63 | np.savez(path, *lasagne.layers.get_all_param_values(self.model))
64 |
65 | def load_weights(self, path):
66 | with np.load(path) as f:
67 | params = [f['arr_%d' % i] for i in range(len(f.files))]
68 | lasagne.layers.set_all_param_values(self.model, params)
69 |
70 | class Classifier(Model):
71 | def build_loss(self, output):
72 | loss = lasagne.objectives.categorical_crossentropy(
73 | output, self.target_var)
74 | return loss.mean()
75 |
76 | class Regressor(Model):
77 | def build_loss(self, output):
78 | loss = lasagne.objectives.squared_error(
79 | output, self.target_var)
80 | return loss.mean()
81 |
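82 | # Subclassing sketch (illustrative; a concrete model supplies the build_* hooks
83 | # that the base constructor calls):
84 | #
85 | #   class MLPClassifier(Classifier):
86 | #       def build_input_var(self):  return T.matrix('inputs')
87 | #       def build_target_var(self): return T.ivector('targets')
88 | #       def build_model(self):
89 | #           l_in = lasagne.layers.InputLayer((None, 100), self.input_var)
90 | #           l_hid = lasagne.layers.DenseLayer(l_in, 256)
91 | #           return lasagne.layers.DenseLayer(
92 | #               l_hid, 10, nonlinearity=lasagne.nonlinearities.softmax)
93 | #       def build_updates(self):
94 | #           return lasagne.updates.adam(self.train_loss, self.params)
95 | #
96 | #   model = MLPClassifier(config={})
97 | #   train_loss = model.fit(X_batch, y_batch)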
--------------------------------------------------------------------------------
/modeling/layers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import logging
3 | import numpy as np
4 | import theano.tensor as T
5 | import theano.tensor.nnet
6 |
7 | from keras.layers.embeddings import Embedding
8 | from keras.layers.convolutional import Convolution1D
9 | from keras.layers.core import Layer
10 | from keras import activations, initializations, regularizers, constraints
11 |
12 | from keras import backend as K
13 |
14 | logger = logging.getLogger()
15 |
16 | class ImmutableEmbedding(Embedding):
17 | '''
18 | Same as Embedding except the weights are not parameters of the
19 | network. This can be useful when the layer is initialized with
20 | pre-trained embeddings, such as Word2Vec.
21 |
22 | @input_dim: size of vocabulary (highest input integer + 1)
23 |     @output_dim: size of the dense representation
24 | '''
25 | def __init__(self, input_dim, output_dim, **kwargs):
26 | super(ImmutableEmbedding, self).__init__(
27 | input_dim, output_dim, **kwargs)
28 | self.params = []
29 |
30 | def build(self):
31 | super(ImmutableEmbedding, self).build()
32 | self.params = []
33 |
34 | class ImmutableConvolution1D(Convolution1D):
35 | '''
36 | Same as Convolution1D except the convolutional filters are not
37 | parameters of the network. This can be useful when the layer
38 | is initialized with pre-trained convolutional filters.
39 |
40 | @nb_filters: the number of convolutional filters
41 | @filter_width: the width of each filter
42 | '''
43 | def __init__(self, nb_filters, filter_width, **kwargs):
44 | super(ImmutableConvolution1D, self).__init__(
45 | nb_filters, filter_width, **kwargs)
46 | self.params = []
47 |
48 | def build(self):
49 | super(ImmutableConvolution1D, self).build()
50 | self.params = []
51 |
52 | class Transpose(Layer):
53 | def __init__(self):
54 | super(Transpose, self).__init__()
55 | self.input = T.matrix()
56 |
57 | def _get_output(self, X):
58 | return X.T
59 |
60 | def get_output(self, train):
61 | return self._get_output(self.get_input(train))
62 |
63 | def get_config(self):
64 | return {"name": self.__class__.__name__}
65 |
66 | class HierarchicalSoftmax(Layer):
67 | def __init__(self, output_dim, nb_hsm_classes, batch_size,
68 | init='glorot_uniform',
69 | W1_weights=None, W1_regularizer=None, W1_constraint=None,
70 | W2_weights=None, W2_regularizer=None, W2_constraint=None,
71 | b1_regularizer=None, b1_constraint=None,
72 | b2_regularizer=None, b2_constraint=None,
73 | input_dim=None, **kwargs):
74 |
75 | self.__dict__.update(locals())
76 | del self.self
77 |
78 | self.init = initializations.get(init)
79 | #self.output_dim = nb_classes * nb_outputs_per_class
80 | self.nb_outputs_per_class = int(np.ceil(output_dim / float(nb_hsm_classes)))
81 |
82 | self.W1_regularizer = regularizers.get(W1_regularizer)
83 | self.b1_regularizer = regularizers.get(b1_regularizer)
84 | self.W2_regularizer = regularizers.get(W2_regularizer)
85 | self.b2_regularizer = regularizers.get(b2_regularizer)
86 |
87 | self.W1_constraint = constraints.get(W1_constraint)
88 | self.b1_constraint = constraints.get(b1_constraint)
89 | self.W2_constraint = constraints.get(W2_constraint)
90 | self.b2_constraint = constraints.get(b2_constraint)
91 |
92 | self.constraints = [self.W1_constraint, self.b1_constraint,
93 | self.W2_constraint, self.b2_constraint]
94 |
95 | #self.initial_weights = weights
96 | self.input_dim = input_dim
97 | if self.input_dim:
98 | kwargs['input_shape'] = (self.input_dim,)
99 | self.input = T.matrix()
100 | super(HierarchicalSoftmax, self).__init__(**kwargs)
101 |
102 | def build(self):
103 | #print('self.input_shape', self.input_shape)
104 | n_features = self.input_shape[1]
105 |
106 | self.W1 = self.init((n_features, self.nb_hsm_classes))
107 | self.b1 = K.zeros((self.nb_hsm_classes,))
108 |
109 | self.W2 = self.init((self.nb_hsm_classes, n_features, self.nb_outputs_per_class))
110 | self.b2 = K.zeros((self.nb_hsm_classes, self.nb_outputs_per_class))
111 |
112 | self.trainable_weights = [self.W1, self.b1,
113 | self.W2, self.b2]
114 |
115 | self.regularizers = []
116 | if self.W1_regularizer:
117 | self.W1_regularizer.set_param(self.W1)
118 | self.regularizers.append(self.W1_regularizer)
119 |
120 | if self.b1_regularizer:
121 | self.b1_regularizer.set_param(self.b1)
122 | self.regularizers.append(self.b1_regularizer)
123 |
124 | if self.W2_regularizer:
125 | self.W2_regularizer.set_param(self.W2)
126 | self.regularizers.append(self.W2_regularizer)
127 |
128 | if self.b2_regularizer:
129 | self.b2_regularizer.set_param(self.b2)
130 | self.regularizers.append(self.b2_regularizer)
131 |
132 | @property
133 | def output_shape(self):
134 | print('HierarchicalSoftmax.output_shape', self.input_shape[0], self.output_dim)
135 | return (self.input_shape[0], self.output_dim)
136 |
137 | def _get_output(self, X):
138 | output = theano.tensor.nnet.h_softmax(X,
139 | #self.input_shape[1], self.output_dim,
140 | self.batch_size, self.output_dim,
141 | self.nb_hsm_classes, self.nb_outputs_per_class,
142 | self.W1, self.b1,
143 | self.W2, self.b2)
144 | return output
145 |
146 | def get_output(self, train=False):
147 | return self._get_output(self.get_input(train))
148 |
149 |
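150 | # HierarchicalSoftmax usage sketch (sizes are illustrative assumptions): a
151 | # 50,000-way output is factored into 224 classes with ceil(50000/224) = 224
152 | # outputs per class (a class softmax followed by a within-class softmax).
153 | #
154 | #   hsm = HierarchicalSoftmax(output_dim=50000, nb_hsm_classes=224,
155 | #                             batch_size=128, input_dim=512)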
--------------------------------------------------------------------------------
/modeling/nonconvnet.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import theano
4 | import theano.tensor as T
5 | import unittest
6 | import logging
7 |
8 | logger = logging.getLogger()
9 |
10 | from keras.layers.core import Layer
11 | from keras.utils.theano_utils import sharedX
12 |
13 | class SplitOutputByFilter(Layer):
14 | """
15 | input: (batch_size, max_seq_len, n_filters * filter_width)
16 | output: (batch_size, n_filters, max_seq_len, filter_width)
17 | """
18 | def __init__(self, n_filters, filter_width):
19 | super(SplitOutputByFilter, self).__init__()
20 | self.n_filters = n_filters
21 | self.filter_width = filter_width
22 | self.input = T.tensor3()
23 |
24 | def slice(self, i, X):
25 | start = i * self.filter_width
26 | end = (i+1) * self.filter_width
27 | return X[:, :, start:end]
28 |
29 | def _get_output(self, X):
30 | outputs, updates = theano.scan(
31 | fn=self.slice,
32 | outputs_info=None,
33 | sequences=[T.arange(self.n_filters)],
34 | non_sequences=X)
35 | return outputs.dimshuffle(1, 0, 2, 3)
36 |
37 | def get_output(self, train):
38 | return self._get_output(self.get_input(train))
39 |
40 | def get_config(self):
41 | return {"name": self.__class__.__name__}
42 |
43 | class SlidingWindowL2MaxPooling(Layer):
44 | '''
45 | input: (batch_size, n_filters, max_seq_len, filter_width)
46 | output: (batch_size, n_filters, filter_width, filter_width)
47 | '''
48 | def __init__(self, batch_size, n_filters, filter_width, max_seq_len):
49 | super(SlidingWindowL2MaxPooling, self).__init__()
50 | self.batch_size = batch_size
51 | self.n_filters = n_filters
52 | self.filter_width = filter_width
53 | self.max_seq_len = max_seq_len
54 |
55 | def get_output(self, train):
56 | return self._get_output(self.get_input(train))
57 |
58 | def _get_output(self, X):
59 | outputs, updates = theano.scan(
60 | fn=self.sample_dimension,
61 | sequences=[T.arange(self.batch_size)],
62 | non_sequences=X)
63 | return outputs
64 |
65 | def sample_dimension(self, i, X):
66 | '''
67 | Takes a 4-tensor of shape `(batch_size, n_filters, max_seq_len,
68 | filter_width)` and an index into its first dimension. Returns the
69 |         `(n_filters, filter_width, filter_width)` subtensor for that sample
70 | with the greatest L2 norm along the third dimension.
71 |
72 | Parameters
73 | ----------
74 | X : a 4-tensor
75 |             A `(batch_size, n_filters, max_seq_len, filter_width)` tensor.
76 | i : int
77 | An index into the first dimension of `X`.
78 |
79 | Returns
80 | ----------
81 | A 3-tensor of shape `(n_filters, filter_width, filter_width)`
82 | consisting of the subtensor of `X` with the greatest L2 norm along
83 | `X`'s third dimension (where `max_seq_len` lies).
84 | '''
85 | outputs, updates = theano.scan(
86 | fn=self.filter_dimension,
87 | sequences=[T.arange(self.n_filters)],
88 | non_sequences=X[i, :, :, :])
89 |
90 | return outputs
91 |
92 | def filter_dimension(self, i, X):
93 | '''
94 | Takes a 3-tensor of shape `(n_filters, max_seq_len, filter_width)`
95 | and an index into its first dimension. Returns the
96 | `(filter_width, filter_width)` subtensor of `X` with the greatest
97 | L2 norm along the second dimension.
98 |
99 | Parameters
100 | ----------
101 | X : a 3-tensor
102 |             A `(n_filters, max_seq_len, filter_width)` tensor.
103 | i : int
104 | An index into the first dimension of `X`.
105 |
106 | Returns
107 | ----------
108 | A 2-tensor of shape `(filter_width, filter_width)` consisting
109 | of the subtensor of the i-th element along the first dimension
110 | of `X` with the greatest L2 norm along `X`'s second dimension
111 | (where `max_seq_len` lies).
112 | '''
113 | norms, updates = theano.scan(
114 | fn=self.norm,
115 | sequences=[T.arange(self.max_seq_len)],
116 | non_sequences=X[i, :, :])
117 | start_window = T.argmax(norms)
118 | end_window = start_window + self.filter_width
119 | return X[i, start_window:end_window, :]
120 |
121 | def norm(self, i, X):
122 | return (X[i:i+self.filter_width, :] ** 2).sum()
123 |
124 | class ZeroFillDiagonals(Layer):
125 | '''
126 | input: (batch_size, n_filters, filter_width, filter_width)
127 | output: (batch_size, n_filters, filter_width, filter_width) with the
128 | diagonal of the last two `(filter_width, filter_width)` dimensions
129 | zeroed out.
130 | '''
131 | def __init__(self, batch_size, n_filters, filter_width):
132 | super(ZeroFillDiagonals, self).__init__()
133 | self.batch_size = batch_size
134 | self.n_filters = n_filters
135 | self.filter_width = filter_width
136 |
137 | # Construct a shared boolean matrix by which to multiply the input
138 | # element-wise. It should be 0 everywhere except on the diagonals
139 | # of the last two dimensions.
140 | input_shape = (batch_size, n_filters, filter_width, filter_width)
141 | mask = np.ones(input_shape)
142 | diag_indices = np.arange(filter_width)
143 | for i in np.arange(batch_size):
144 | for j in np.arange(n_filters):
145 | mask[i, j, diag_indices, diag_indices] = 0
146 | self.mask = sharedX(mask, dtype='int32')
147 |
148 | def get_output(self, train):
149 | return self._get_output(self.get_input(train))
150 |
151 | def _get_output(self, X):
152 | return X * self.mask
153 |
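154 | # Shape walk-through of the layers above when chained (sizes are illustrative;
155 | # this composition is a sketch, not taken from a model definition here):
156 | #
157 | #   (batch, max_seq_len, n_filters * filter_width)          input
158 | #   -> SplitOutputByFilter(n_filters, filter_width)
159 | #   (batch, n_filters, max_seq_len, filter_width)
160 | #   -> SlidingWindowL2MaxPooling(batch, n_filters, filter_width, max_seq_len)
161 | #   (batch, n_filters, filter_width, filter_width)
162 | #   -> ZeroFillDiagonals(batch, n_filters, filter_width)
163 | #   (batch, n_filters, filter_width, filter_width)          diagonals zeroed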
--------------------------------------------------------------------------------
/modeling/outliers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import argparse
5 | import os.path
6 | import cPickle
7 | from itertools import product
8 |
9 | import theano
10 | import pylearn2
11 | from pylearn2.config import yaml_parse
12 |
13 | import numpy as np
14 | from numpy.random import multivariate_normal as mvnormal
15 | from numpy.random import uniform
16 | from scipy.spatial.distance import pdist, squareform
17 | from scipy.stats import pearsonr
18 |
19 | import matplotlib.pyplot as plt
20 | from mpl_toolkits.mplot3d import axes3d
21 |
22 | from sklearn.covariance import MinCovDet, EmpiricalCovariance
23 | from sklearn.decomposition import PCA
24 |
25 | ###########################################################################
26 | # This class was useful for simulating data sets while developing
27 | # this script.
28 | # means = [0, 0]
29 | # cov = [[2, 1], [1, 2]]
30 | # n = 5000
31 | # mvn = PMeansMultivariateNormal(means, cov, n)
32 | # X = mvn.generate()
33 | # X.shape
34 | # np.savetxt('simulated.csv', X, delimiter=',')
35 | ###########################################################################
36 | class PMeansMultivariateNormal(object):
37 | def __init__(self, means, cov, size):
38 | self.__dict__.update(locals())
39 | del self.self
40 | #self.n = n
41 | #self.means = means
42 | #self.cov = cov
43 |
44 | def generate(self):
45 | return mvnormal(self.means, self.cov, self.size)
46 | '''
47 | X = np.empty(shape=(self.n*len(self.means), 2))
48 | for i, mean in enumerate(self.means):
49 | idx = range(i*self.n, i*self.n+self.n)
50 | x, y = mvnormal(mean, self.cov, self.n).T
51 | X[idx, 0] = x
52 | X[idx, 1] = y
53 | return X
54 | '''
55 |
56 | def reconstruction_error(a, b):
57 | return ((a - b)**2).sum(axis=1)
58 |
59 | def train_autoencoder(dataset_path, nvis=2, nhid=2, act_enc=None, act_dec=None):
60 | yaml = open('outliers.yaml', 'r').read()
61 | if act_enc is None:
62 | act_enc = 'null'
63 | else:
64 | act_enc = "'" + act_enc + "'"
65 |
66 | if act_dec is None:
67 | act_dec = 'null'
68 | else:
69 | act_dec = "'" + act_dec + "'"
70 |
71 | params = {
72 | 'dataset_path': dataset_path,
73 | 'nvis': nvis,
74 | 'nhid': nhid,
75 | 'act_enc': act_enc,
76 | 'act_dec': act_dec,
77 | 'learning_rate': 0.05,
78 | 'save_path': 'outliers.pkl'
79 | }
80 |
81 | yaml = yaml % (params)
82 |
83 | train = yaml_parse.load(yaml)
84 | train.main_loop()
85 |
86 | pkl = open('outliers.pkl')
87 | return cPickle.load(pkl)
88 |
89 | class NullTransformer(object):
90 | def fit(self, X):
91 | pass
92 |
93 | def fit_transform(self, X):
94 | return X
95 |
96 | def transform(self, X):
97 | return X
98 |
99 | def main():
100 | parser = argparse.ArgumentParser(
101 | description='Plot outlier-like distances for a 2-dimensional dataset')
102 | parser.add_argument(
103 | 'dataset', type=argparse.FileType('r'),
104 | help='a CSV file containing the dataset')
105 | parser.add_argument(
106 | '--plot', type=str, choices=['train', 'grid'], default='grid',
107 | help='plot the dataset or a grid evenly distributed over its span')
108 | parser.add_argument(
109 | '--plotdims', type=int, choices=[2, 3], default=2,
110 | help='the number of dimensions to plot')
111 |
112 | args = parser.parse_args()
113 |
114 | X = np.loadtxt(args.dataset, delimiter=',')
115 | fig = plt.figure()
116 |
117 | xformer = NullTransformer()
118 |
119 | if X.shape[1] > 2:
120 | xformer = PCA(n_components=2)
121 | X = xformer.fit_transform(X)
122 |
123 | if args.plotdims == 2:
124 | plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
125 | else:
126 | plt.scatter(X[:, 0], X[:, 1])
127 | plt.show(block=False)
128 |
129 | path_to_script = os.path.realpath(__file__)
130 | dir_of_script = os.path.dirname(path_to_script)
131 | dataset_path = dir_of_script + '/outliers.npy'
132 | np.save(dataset_path, X)
133 |
134 | ###########################################################################
135 | # Train autoencoder with the n samples until convergence. Run
136 | # evenly distributed samples through the autoencoder and compute
137 | # their reconstruction error.
138 | ###########################################################################
139 |
140 | maxseq_orig = np.max(X)
141 | minseq_orig = np.min(X)
142 | seqrange = np.abs(maxseq_orig - minseq_orig)
143 | maxseq = maxseq_orig + 0.5 * seqrange
144 | minseq = minseq_orig - 0.5 * seqrange
145 | print("minseq", minseq, "maxseq", maxseq)
146 | if args.plot == 'grid':
147 | seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
148 | Xplot = np.array([_ for _ in product(seq, seq)])
149 | else:
150 | Xplot = X
151 |
152 | robust_cov = MinCovDet().fit(X)
153 | robust_md = robust_cov.mahalanobis(Xplot)
154 |
155 | empirical_cov = EmpiricalCovariance().fit(X)
156 | empirical_md = empirical_cov.mahalanobis(Xplot)
157 |
158 |     # Xplot always has two columns at this point (X is 2-D or was reduced by the PCA above); the bh_sne fallback would require the external `tsne` package.
159 | if Xplot.shape[1] > 2:
160 | Xplot2d = bh_sne(Xplot)
161 | else:
162 | Xplot2d = Xplot
163 |
164 | robust_md01 = robust_md - np.nanmin(robust_md)
165 | robust_md01 = robust_md01 / np.nanmax(robust_md01)
166 |
167 | empirical_md01 = empirical_md - np.nanmin(empirical_md)
168 | empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)
169 |
170 | fig = plt.figure()
171 | if args.plotdims == 2:
172 | ax = fig.add_subplot(1, 1, 1)
173 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
174 | cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
175 | else:
176 | ax = fig.add_subplot(1, 1, 1, projection='3d')
177 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
178 | cmap=plt.cm.jet, color=robust_md01)
179 | ax.set_zlabel('Mahalanobis distance')
180 | ax.set_xlabel('x')
181 | ax.set_ylabel('y')
182 | ax.set_title('Mahalanobis distance (robust covariance)')
183 |
184 | fig = plt.figure()
185 | if args.plotdims == 2:
186 | ax = fig.add_subplot(1, 1, 1)
187 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
188 | cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
189 | else:
190 | ax = fig.add_subplot(1, 1, 1, projection='3d')
191 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
192 | cmap=plt.cm.jet, color=empirical_md01)
193 | ax.set_zlabel('Mahalanobis distance')
194 |
195 | ax.set_xlabel('x')
196 | ax.set_ylabel('y')
197 | ax.set_title('Mahalanobis distance (empirical covariance)')
198 |
199 | enc_dec = [
200 | # tanh encoder, linear decoder
201 | ['tanh', 'linear'],
202 | # sigmoid encoder, linear decoder
203 | ['sigmoid', 'linear'],
204 | #######################################################################
205 | # The reconstruction error of the autoencoders trained with the
206 | # remaining commented-out pairs don't seem to match Mahalanobis
207 | # distance very well. Feel free to uncomment them to see for
208 | # yourself.
209 | # linear encoder, linear decoder
210 | # ['linear', 'linear'],
211 | # tanh encoder, tanh decoder
212 | # ['tanh', 'tanh'],
213 | # tanh encoder, sigmoid decoder
214 | # ['tanh', 'sigmoid'],
215 | # sigmoid encoder, tanh decoder
216 | # ['sigmoid', 'tanh'],
217 | # sigmoid encoder, sigmoid decoder
218 | # ['sigmoid', 'sigmoid']
219 | #######################################################################
220 | ]
221 |
222 | for i, act in enumerate(enc_dec):
223 | enc, dec = act
224 | if dec == 'linear':
225 | dec = None
226 | model = train_autoencoder(dataset_path,
227 | act_enc=enc, act_dec=dec, nvis=X.shape[1], nhid=16)
228 |
229 | Xshared = theano.shared(
230 | np.asarray(Xplot, dtype=theano.config.floatX), borrow=True)
231 | f = theano.function([], outputs=model.reconstruct(Xshared))
232 | fit = f()
233 | error = reconstruction_error(Xplot, fit)
234 |
235 | error01 = error - np.nanmin(error)
236 | error01 = error01 / np.nanmax(error01)
237 |
238 | fig = plt.figure()
239 | if args.plotdims == 2:
240 | ax = fig.add_subplot(1, 1, 1)
241 | ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
242 | cmap=plt.cm.jet, c=error, s=60, linewidth='0')
243 | else:
244 | ax = fig.add_subplot(1, 1, 1, projection='3d')
245 | ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], error,
246 | cmap=plt.cm.jet, color=error01)
247 | ax.set_zlabel('Reconstruction error')
248 |
249 | ax.set_xlabel('x')
250 | ax.set_ylabel('y')
251 | encdec_type = ', '.join(act)
252 | ax.set_title('Reconstruction error (' + encdec_type + ')')
253 |
254 | print("Correlation of robust MD and reconstruction error (" +
255 | str(encdec_type) + ") " + str(pearsonr(robust_md, error)))
256 | print("Correlation of empirical MD and reconstruction error (" +
257 | str(encdec_type) + ") " + str(pearsonr(empirical_md, error)))
258 |
259 | print("Correlation of robust MD and empirical MD " +
260 | str(pearsonr(robust_md, empirical_md)))
261 |
262 | os.remove(dataset_path)
263 | os.remove('outliers.pkl')
264 |
265 | plt.show(block=True)
266 |
267 | if __name__ == '__main__':
268 | sys.exit(main())
269 |
--------------------------------------------------------------------------------
/modeling/parser.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import numpy
4 |
5 | def kvpair(s):
6 | try:
7 | k,v = s.split('=')
8 | if '.' in v:
9 | try:
10 | v = float(v)
11 | except ValueError:
12 | pass
13 | else:
14 | try:
15 | v = int(v)
16 | except ValueError:
17 | pass
18 | return k,v
19 |     except ValueError:
20 | raise argparse.ArgumentTypeError(
21 | '--model-cfg arguments must be KEY=VALUE pairs')
22 |
23 | def build_chainer():
24 | parser = build()
25 | parser.add_argument('--gpu', '-g', default=-1, type=int,
26 | help='GPU ID (negative value indicates CPU)')
27 | return parser
28 |
29 | def build_keras():
30 | parser = build()
31 | return parser
32 |
33 | def build_lasagne():
34 | parser = build()
35 | parser.add_argument('--progress', action='store_true',
36 |         help='Whether to display a progress bar during training and validation')
37 | return parser
38 |
39 | def build():
40 | parser = argparse.ArgumentParser(
41 | description='Train a model.')
42 | parser.add_argument('model_dir', metavar="MODEL_DIR", type=str,
43 | help='The base directory of this model. Must contain a model.py (model code) and a model.json (hyperparameters). Model configuration and weights are saved to model_dir/UUID.')
44 | parser.add_argument('--model-cfg', type=kvpair, nargs='+', default=[],
45 | help='Model hyper-parameters as KEY=VALUE pairs; overrides parameters in MODEL_DIR/model.json')
46 | parser.add_argument('--model-dest', type=str, default='',
47 | help='Directory to which to copy model.py and model.json. This overrides copying to model_dir/UUID.')
48 | parser.add_argument(
49 | '--mode', type=str,
50 | choices=['transient', 'persistent', 'persistent-background'],
51 | default='persistent',
52 | help='How to run the model; in "transient" mode, output goes to the console and the model is not saved; in "persistent" mode, output goes to the console and the model is saved; in "persistent-background" mode, output goes to the model.log file and the model is saved. The default is "persistent"')
53 |
54 | return parser
55 |
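56 | # How kvpair coerces --model-cfg values (illustrative):
57 | #
58 | #   kvpair('n_filters=500')        # -> ('n_filters', 500)       int
59 | #   kvpair('learning_rate=0.01')   # -> ('learning_rate', 0.01)  float
60 | #   kvpair('optimizer=Adam')       # -> ('optimizer', 'Adam')    left as a string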
--------------------------------------------------------------------------------
/modeling/preprocess.py:
--------------------------------------------------------------------------------
1 | class NullPreprocessor(object):
2 | def __init__(self):
3 | pass
4 |
5 | def fit(self, X, y=None):
6 | pass
7 |
8 | def transform(self, X, y=None):
9 | if y is None:
10 | return X
11 | else:
12 | return X, y
13 |
14 | def fit_transform(self, X, y=None):
15 | return self.transform(X, y)
16 |
--------------------------------------------------------------------------------
/modeling/residual.py:
--------------------------------------------------------------------------------
1 | from keras.models import Sequential, Graph
2 | from keras.layers.core import Dense, Activation, Layer, Dropout
3 | from keras.activations import relu
4 |
5 | class Identity(Layer):
6 | def get_output(self, train):
7 | return self.get_input(train)
8 |
9 | def build_residual_block(name, input_shape, n_hidden, n_skip=2):
10 | """
11 | Rough sketch of building blocks of layers for residual learning.
12 | See http://arxiv.org/abs/1512.03385 for motivation.
13 | """
14 | block = Graph()
15 | input_name = 'x'
16 | block.add_input(input_name, input_shape=input_shape)
17 |
18 | # The current keras graph implementation doesn't allow you to connect
19 | # an input node to an output node. Use Identity to work around that.
20 | block.add_node(Identity(), name=name+'identity', input=input_name)
21 |
22 | prev_output = input_name
23 | for i in range(n_skip):
24 | layer_name = 'h' + str(i)
25 | l = Dense(n_hidden, activation='relu')
26 | block.add_node(l, name=layer_name, input=prev_output)
27 | prev_output = layer_name
28 | if i < n_skip:
29 | block.add_node(Dropout(0.5), name=layer_name+'do', input=layer_name)
30 | prev_output = layer_name+'do'
31 |
32 | block.add_output(name=name+'output', inputs=[name+'identity', prev_output], merge_mode='sum')
33 |
34 | return block
35 |
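36 | # Usage sketch (shapes and names are illustrative assumptions):
37 | #
38 | #   block = build_residual_block('res0', input_shape=(128,), n_hidden=128)
39 | #   # `block` is a Graph with input 'x' and output 'res0output' that adds the
40 | #   # identity shortcut to the stack of Dense/Dropout layers built above.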
--------------------------------------------------------------------------------
/models/keras/attention/model.json:
--------------------------------------------------------------------------------
1 |
2 | {
3 | "train_embeddings": true,
4 | "regularization_layer": "",
5 | "dropout_p": 0.5,
6 | "dropout_p_conv": 0.0,
7 | "n_embed_dims": 25,
8 | "loss": "categorical_crossentropy",
9 | "patience": 20,
10 | "batch_size": 128,
11 | "decay": 0.0,
12 | "embedding_max_norm": 1000,
13 | "filter_max_norm": 1000,
14 | "dense_max_norm": 1000,
15 | "l2_penalty": 0.0,
16 | "clipnorm": 0,
17 | "truncate_gradient": -1
18 | }
19 |
--------------------------------------------------------------------------------
/models/keras/attention/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.setrecursionlimit(5000)
3 | import json
4 | import h5py
5 |
6 | import numpy as np
7 |
8 | from keras.models import Sequential, Graph
9 | from keras.layers.core import (Layer, Dense, Activation, Dropout,
10 | TimeDistributedDense, TimeDistributedMerge,
11 | Flatten, Reshape)
12 | from keras.layers.normalization import BatchNormalization
13 | from keras.layers.recurrent import LSTM, GRU
14 | from keras.layers.embeddings import Embedding
15 | from keras.constraints import maxnorm
16 | from keras.regularizers import l2
17 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop
18 |
19 | from modeling.layers import ImmutableEmbedding
20 | from modeling.difference import TemporalDifference
21 | from modeling.builders import (build_embedding_layer,
22 | build_convolutional_layer, build_pooling_layer,
23 | build_dense_layer, build_optimizer, load_weights)
24 |
25 | def error_free_examples(path):
26 | f = h5py.File(path)
27 | # Target_code is 0 when the preposition in the example is the original
28 | # preposition in the corpus and 1 when the preposition has been randomly
29 | # replaced with another one in the confusion set.
30 | idx = f['target_code'].value == 0
31 | f.close()
32 | return idx
33 |
34 | class Identity(Layer):
35 | def get_output(self, train):
36 | return self.get_input(train)
37 |
38 | class Transpose(Layer):
39 | def get_output(self, train):
40 | return self.get_input(train).T
41 |
42 | def build_model(args):
43 | np.random.seed(args.seed)
44 |
45 | graph = Graph()
46 |
47 | graph.add_input('input', input_shape=(args.input_width,), dtype='int')
48 |
49 | graph.add_node(build_embedding_layer(args),
50 | input='input', name='embedding')
51 |
52 | graph.add_node(LSTM(args.n_units,
53 | truncate_gradient=args.truncate_gradient,
54 | return_sequences=True),
55 | input='embedding', name='lstm0')
56 |
57 | graph.add_node(LSTM(args.n_units,
58 | truncate_gradient=args.truncate_gradient,
59 | return_sequences=True),
60 | input='lstm0', name='lstm1')
61 |
62 | # Attention module.
63 | graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
64 | input='lstm1', name='attention0')
65 | graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
66 | input='attention0', name='attention1')
67 | graph.add_node(TimeDistributedDense(args.n_units, activation='softmax'),
68 | input='attention1', name='attention2')
69 |
70 | # Apply mask from output of attention module to LSTM output.
71 | graph.add_node(TimeDistributedMerge(mode='sum'),
72 | inputs=['lstm1', 'attention2'],
73 | name='applyattn',
74 | merge_mode='mul')
75 |
76 | graph.add_node(Dense(args.n_classes, activation='softmax'),
77 | input='applyattn', name='softmax')
78 |
79 | graph.add_output(input='softmax', name='output')
80 |
81 | load_weights(args, graph)
82 |
83 | optimizer = build_optimizer(args)
84 |
85 | graph.compile(loss={'output': args.loss}, optimizer=optimizer)
86 |
87 | return graph
88 |
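89 | # Shape walk-through of the graph above (batch dimension omitted):
90 | #
91 | #   input          (input_width,)              token codes
92 | #   embedding      (input_width, n_embed_dims)
93 | #   lstm0, lstm1   (input_width, n_units)      return_sequences=True
94 | #   attention0-2   (input_width, n_units)      per-timestep weights
95 | #   applyattn      (n_units,)                  lstm1 * attention2, summed over time
96 | #   softmax        (n_classes,)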
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/4e0ae5dc683611e5950afcaa149e39ea/model.py:
--------------------------------------------------------------------------------
1 | import os.path
2 |
3 | import numpy as np
4 |
5 | from keras.models import Sequential
6 | from keras.layers.core import Dense, Dropout, Activation, Flatten
7 | from keras.layers.normalization import BatchNormalization
8 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
9 | from keras.layers.embeddings import Embedding
10 | from keras.constraints import maxnorm
11 | from keras.regularizers import l2
12 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop
13 |
14 | from modeling.layers import ImmutableEmbedding
15 | from modeling.difference import TemporalDifference
16 |
17 | def build_model(args):
18 | print("args", vars(args))
19 |
20 | np.random.seed(args.seed)
21 |
22 | model = Sequential()
23 |
24 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None:
25 | W = np.load(args.embedding_weights)
26 | if args.train_embeddings is True or args.train_embeddings == 'true':
27 | model.add(Embedding(args.n_vocab, args.n_word_dims,
28 | weights=[W], input_length=args.input_width,
29 | W_constraint=maxnorm(args.embedding_max_norm)))
30 | else:
31 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims,
32 | weights=[W], input_length=args.input_width))
33 | else:
34 | model.add(Embedding(args.n_vocab, args.n_word_dims,
35 | W_constraint=maxnorm(args.embedding_max_norm),
36 | input_length=args.input_width))
37 |
38 | if args.use_difference:
39 | model.add(TemporalDifference())
40 |
41 | model.add(Convolution1D(args.n_filters, args.filter_width,
42 | W_constraint=maxnorm(args.filter_max_norm),
43 | border_mode=args.border_mode,
44 | W_regularizer=l2(args.l2_penalty),
45 | activation='relu'))
46 | #if 'normalization' in args.regularization_layer:
47 | # model.add(BatchNormalization(
48 | # (args.input_width-args.filter_width+1, args.n_filters)))
49 | #model.add(Activation('relu'))
50 |
51 | model.add(MaxPooling1D(
52 | pool_length=args.input_width - args.filter_width + 1,
53 | stride=1, ignore_border=False))
54 | model.add(Flatten())
55 |
56 | if 'dropout' in args.regularization_layer:
57 | model.add(Dropout(args.dropout_p_conv))
58 | if 'normalization' in args.regularization_layer:
59 | model.add(BatchNormalization())
60 |
61 | model.add(Dense(2*args.n_filters,
62 | W_regularizer=l2(args.l2_penalty),
63 | activation='relu'))
64 | if 'dropout' in args.regularization_layer:
65 | model.add(Dropout(args.dropout_p))
66 | if 'normalization' in args.regularization_layer:
67 | model.add(BatchNormalization())
68 |
69 | model.add(Dense(2*args.n_filters,
70 | W_regularizer=l2(args.l2_penalty),
71 | activation='relu'))
72 | if 'dropout' in args.regularization_layer:
73 | model.add(Dropout(args.dropout_p))
74 | if 'normalization' in args.regularization_layer:
75 | model.add(BatchNormalization())
76 |
77 | model.add(Dense(2*args.n_filters,
78 | W_regularizer=l2(args.l2_penalty),
79 | activation='relu'))
80 | if 'dropout' in args.regularization_layer:
81 | model.add(Dropout(args.dropout_p))
82 | if 'normalization' in args.regularization_layer:
83 | model.add(BatchNormalization())
84 |
85 | model.add(Dense(args.n_classes,
86 | W_regularizer=l2(args.l2_penalty),
87 | activation='softmax'))
88 | #if 'normalization' in args.regularization_layer:
89 | # model.add(BatchNormalization((args.n_classes,)))
90 |
91 | if args.optimizer == 'SGD':
92 | optimizer = SGD(lr=args.learning_rate,
93 | decay=args.decay, momentum=args.momentum,
94 | clipnorm=args.clipnorm)
95 | elif args.optimizer == 'Adam':
96 | optimizer = Adam(clipnorm=args.clipnorm)
97 | elif args.optimizer == 'RMSprop':
98 | optimizer = RMSprop(clipnorm=args.clipnorm)
99 | elif args.optimizer == 'Adadelta':
100 | optimizer = Adadelta(clipnorm=args.clipnorm)
101 | elif args.optimizer == 'Adagrad':
102 | optimizer = Adagrad(clipnorm=args.clipnorm)
103 | else:
104 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer))
105 |
106 | if hasattr(args, 'model_weights'):
107 | print('Checking for weights file ' + str(args.model_weights))
108 | if os.path.exists(args.model_weights):
109 | print('Loading weights')
110 | model.load_weights(args.model_weights)
111 |
112 | print('Compiling')
113 | model.compile(loss=args.loss, optimizer=optimizer)
114 |
115 | return model
116 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/4e0ae5dc683611e5950afcaa149e39ea/model_old_keras.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from keras.models import Sequential
4 | from keras.layers.core import Dense, Dropout, Activation, Flatten
5 | from keras.layers.normalization import BatchNormalization
6 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
7 | from keras.layers.embeddings import Embedding
8 | from keras.constraints import maxnorm
9 | from keras.regularizers import l2
10 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop
11 |
12 | from modeling.layers import ImmutableEmbedding
13 | from modeling.difference import TemporalDifference
14 |
15 | def build_model(args):
16 | print("args", vars(args))
17 |
18 | np.random.seed(args.seed)
19 |
20 | model = Sequential()
21 |
22 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None:
23 | W = np.load(args.embedding_weights)
24 | if args.train_embeddings:
25 | model.add(Embedding(args.n_vocab, args.n_word_dims,
26 | weights=[W],
27 | W_constraint=maxnorm(args.embedding_max_norm)))
28 | else:
29 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims,
30 | weights=[W]))
31 | else:
32 | model.add(Embedding(args.n_vocab, args.n_word_dims,
33 | W_constraint=maxnorm(args.embedding_max_norm)))
34 |
35 | if args.use_difference:
36 | model.add(TemporalDifference())
37 |
38 | model.add(Convolution1D(args.n_word_dims, args.n_filters, args.filter_width,
39 | W_constraint=maxnorm(args.filter_max_norm),
40 | border_mode=args.border_mode,
41 | W_regularizer=l2(args.l2_penalty)))
42 | #if 'normalization' in args.regularization_layer:
43 | # model.add(BatchNormalization(
44 | # (args.input_width-args.filter_width+1, args.n_filters)))
45 | model.add(Activation('relu'))
46 |
47 | model.add(MaxPooling1D(
48 | pool_length=args.input_width - args.filter_width + 1,
49 | stride=None, ignore_border=False))
50 | model.add(Flatten())
51 | if 'dropout' in args.regularization_layer:
52 | model.add(Dropout(args.dropout_p_conv))
53 | if 'normalization' in args.regularization_layer:
54 | model.add(BatchNormalization((args.n_filters,)))
55 |
56 | model.add(Dense(args.n_filters, 2*args.n_filters,
57 | W_regularizer=l2(args.l2_penalty)))
58 | model.add(Activation('relu'))
59 | if 'dropout' in args.regularization_layer:
60 | model.add(Dropout(args.dropout_p))
61 | if 'normalization' in args.regularization_layer:
62 | model.add(BatchNormalization((2*args.n_filters,)))
63 |
64 | model.add(Dense(2*args.n_filters, 2*args.n_filters))
65 | model.add(Activation('relu'))
66 | if 'dropout' in args.regularization_layer:
67 | model.add(Dropout(args.dropout_p))
68 | if 'normalization' in args.regularization_layer:
69 | model.add(BatchNormalization((2*args.n_filters,)))
70 |
71 | model.add(Dense(2*args.n_filters, 2*args.n_filters,
72 | W_regularizer=l2(args.l2_penalty)))
73 | model.add(Activation('relu'))
74 | if 'dropout' in args.regularization_layer:
75 | model.add(Dropout(args.dropout_p))
76 | if 'normalization' in args.regularization_layer:
77 | model.add(BatchNormalization((2*args.n_filters,)))
78 |
79 | model.add(Dense(2*args.n_filters, args.n_classes,
80 | W_regularizer=l2(args.l2_penalty)))
81 | #if 'normalization' in args.regularization_layer:
82 | # model.add(BatchNormalization((args.n_classes,)))
83 | model.add(Activation('softmax'))
84 |
85 | if args.optimizer == 'SGD':
86 | optimizer = SGD(lr=args.learning_rate,
87 | decay=args.decay, momentum=args.momentum,
88 | clipnorm=args.clipnorm)
89 | elif args.optimizer == 'Adam':
90 | optimizer = Adam(clipnorm=args.clipnorm)
91 | elif args.optimizer == 'RMSprop':
92 | optimizer = RMSprop(clipnorm=args.clipnorm)
93 | elif args.optimizer == 'Adadelta':
94 | optimizer = Adadelta(clipnorm=args.clipnorm)
95 | elif args.optimizer == 'Adagrad':
96 | optimizer = Adagrad(clipnorm=args.clipnorm)
97 | else:
98 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer))
99 |
100 | if hasattr(args, 'model_weights') and args.model_weights is not None:
101 | model.load_weights(args.model_weights)
102 |
103 | model.compile(loss=args.loss, optimizer=optimizer)
104 |
105 | return model
106 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/model-word2vec.json:
--------------------------------------------------------------------------------
1 | {
2 | "embedding_weights": "data/prepositions-weights.npy",
3 | "train_embeddings": false,
4 | "regularization_layer": "normalization",
5 | "n_word_dims": 300,
6 | "border_mode": "valid",
7 | "use_difference": true,
8 | "n_filters": 1000,
9 | "filter_width": 3,
10 | "loss": "categorical_crossentropy",
11 | "patience": 30,
12 | "batch_size": 128,
13 | "optimizer": "SGD",
14 | "learning_rate": 0.001,
15 | "momentum": 0.9,
16 | "decay": 0.0,
17 | "embedding_max_norm": 1000,
18 | "filter_max_norm": 1000,
19 | "dense_max_norm": 1000,
20 | "l2_penalty": 0.0,
21 | "clipnorm": 0
22 | }
23 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "train_embeddings": true,
3 | "regularization_layer": "",
4 | "dropout_p": 0.5,
5 | "dropout_p_conv": 0.0,
6 | "n_word_dims": 50,
7 | "border_mode": "valid",
8 | "use_difference": false,
9 | "n_filters": 500,
10 | "n_hidden": 500,
11 | "filter_width": 4,
12 | "loss": "categorical_crossentropy",
13 | "patience": 20,
14 | "batch_size": 128,
15 | "optimizer": "SGD",
16 | "learning_rate": 0.001,
17 | "momentum": 0.9,
18 | "decay": 0.0,
19 | "embedding_max_norm": 1000,
20 | "filter_max_norm": 1000,
21 | "dense_max_norm": 1000,
22 | "l2_penalty": 0.0,
23 | "clipnorm": 0
24 | }
25 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.setrecursionlimit(5000)
3 | import json
4 | import h5py
5 |
6 | import numpy as np
7 |
8 | from keras.models import Sequential, Graph
9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer
10 | from keras.layers.normalization import BatchNormalization
11 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
12 | from keras.layers.embeddings import Embedding
13 | from keras.constraints import maxnorm
14 | from keras.regularizers import l2
15 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop
16 |
17 | from modeling.layers import ImmutableEmbedding
18 | from modeling.difference import TemporalDifference
19 | from modeling.builders import (build_embedding_layer,
20 | build_convolutional_layer, build_pooling_layer,
21 | build_dense_layer, build_optimizer, load_weights)
22 |
23 | class EncartaExamplesWithOKWindows():
24 | def __init__(self, seed=17):
25 | self.random_state = np.random.RandomState(seed=seed)
26 | self.prepositions = set([7, 8, 10, 12, 13, 17, 18, 19, 27])
27 |
28 | def fit_transform(self, X, y=None):
29 | return self.transform(X, y)
30 |
31 | def transform(self, X, y=None):
32 | # Select the examples where the middle column is in our
33 | # preposition set.
34 | middle_column = X[:, X.shape[1]/2]
35 | ok = np.array([True] * len(X))
36 | for i,val in enumerate(middle_column):
37 | if val not in self.prepositions:
38 | ok[i] = False
39 | print('in %d out %d' % (len(X), len(X[ok])))
40 | if y is not None:
41 | return X[ok], y[ok]
42 | else:
43 | return X[ok]
44 |
45 | class TrainingSetRealExamples():
46 | def __init__(self, seed=17):
47 | self.random_state = np.random.RandomState(seed=seed)
48 |
49 | def fit_transform(self, X, y=None):
50 | evens = [i*2 for i in np.arange(X.shape[0]/2)]
51 | if y is not None:
52 | return X[evens], y[evens]
53 | else:
54 | return X[evens]
55 |
56 | def transform(self, X, y=None):
57 | if y is None:
58 | return X
59 | else:
60 | return X, y
61 |
62 | class RandomPermuter(object):
63 | def __init__(self, seed=17):
64 | self.random_state = np.random.RandomState(seed=seed)
65 |
66 | def fit(self, X, y=None):
67 | pass
68 |
69 | def _transform(self, X, y=None):
70 | X = X.copy()
71 | middle_column_idx = np.int(X.shape[1]/2)
72 | middle_column_values = X[:, middle_column_idx]
73 | random_values = self.random_state.permutation(middle_column_values)
74 | X[:, middle_column_idx] = random_values
75 | if y is None:
76 | return X
77 | else:
78 | return X, y
79 |
80 | class ValidationSetRealExamples(RandomPermuter):
81 | def __init__(self, seed=17):
82 | self.random_state = np.random.RandomState(seed=seed)
83 |
84 | def fit_transform(self, X, y=None):
85 | if y is None:
86 | return X
87 | else:
88 | return X, y
89 |
90 | def transform(self, X, y=None):
91 | evens = [i*2 for i in np.arange(X.shape[0]/2)]
92 | if y is not None:
93 | return X[evens], y[evens]
94 | else:
95 | return X[evens]
96 |
97 | class TrainingSetPrepositionRandomPermuter(RandomPermuter):
98 | def fit_transform(self, X, y=None):
99 | return self._transform(X, y)
100 |
101 | def transform(self, X, y=None):
102 | if y is None:
103 | return X
104 | else:
105 | return X, y
106 |
107 | class ValidationSetPrepositionRandomPermuter(RandomPermuter):
108 | def fit_transform(self, X, y=None):
109 | if y is None:
110 | return X
111 | else:
112 | return X, y
113 |
114 | def transform(self, X, y=None):
115 | return self._transform(X, y)
116 |
117 | class RandomRegularizer(object):
118 | def __init__(self, seed=17):
119 | self.random_state = np.random.RandomState(seed=seed)
120 |
121 | def fit(self, X, y=None):
122 | pass
123 |
124 | def _transform(self, X, y=None):
125 | X = X.copy()
126 | middle_column_idx = np.int(X.shape[1]/2)
127 | middle_column_values = X[:, middle_column_idx]
128 | value_set = list(set(middle_column_values.tolist()))
129 | random_values = []
130 | for i in np.arange(len(X)):
131 | current_value = middle_column_values[i]
132 | while True:
133 | random_value = self.random_state.choice(value_set)
134 | if random_value != current_value:
135 | random_values.append(random_value)
136 | break
137 | X[:, middle_column_idx] = random_values
138 | if y is None:
139 | return X
140 | else:
141 | return X, y
142 |
143 | class TrainingSetPrepositionRandomRegularizer(RandomRegularizer):
144 | """
145 | Takes examples in the form of a vector of indices. Replaces each
146 | middle value in each vector with a value from some other example.
147 | """
148 | def fit_transform(self, X, y=None):
149 | return self._transform(X, y)
150 |
151 | def transform(self, X, y=None):
152 | if y is None:
153 | return X
154 | else:
155 | return X, y
156 |
157 | class ValidationSetPrepositionRandomRegularizer(RandomRegularizer):
158 | def fit_transform(self, X, y=None):
159 | if y is None:
160 | return X
161 | else:
162 | return X, y
163 |
164 | def transform(self, X, y=None):
165 | return self._transform(X, y)
166 |
167 | class UnconstrainedTrainingSetPrepositionPermuter(object):
168 | def __init__(self, seed=17):
169 | self.random_state = np.random.RandomState(seed=seed)
170 |
171 | def fit(self, X, y=None):
172 | pass
173 |
174 | def fit_transform(self, X, y=None):
175 | X = X.copy()
176 | middle_column_idx = np.int(X.shape[1]/2)
177 | middle_column_values = X[:, middle_column_idx]
178 | random_values = self.random_state.permutation(middle_column_values)
179 | X[:, middle_column_idx] = random_values
180 | if y is None:
181 | return X
182 | else:
183 | return X, y
184 |
185 | def transform(self, X, y=None):
186 | if y is None:
187 | return X
188 | else:
189 | return X, y
190 |
191 |
192 | def real_examples(path):
193 | f = h5py.File(path)
194 | # Target_code is 0 when the preposition in the example is the original
195 | # preposition in the corpus and 1 when the preposition has been randomly
196 | # replaced with another one in the confusion set.
197 | idx = f['target_code'].value == 0
198 | f.close()
199 | return idx
200 |
201 | def random_regularization_examples(path):
202 | f = h5py.File(path)
203 | idx = f['target_code'].value == 1
204 | f.close()
205 | return idx
206 |
207 | class Identity(Layer):
208 | def get_output(self, train):
209 | return self.get_input(train)
210 |
211 | def build_residual_model(args):
212 | graph = Graph()
213 |
214 | graph.add_input('input', input_shape=(args.input_width,), dtype='int')
215 |
216 | graph.add_node(build_embedding_layer(args), name='embedding', input='input')
217 |
218 | graph.add_node(build_convolutional_layer(args), name='conv', input='embedding')
219 | prev_layer = 'conv'
220 | if 'normalization' in args.regularization_layer:
221 | graph.add_node(BatchNormalization(), name='conv_bn', input=prev_layer)
222 | prev_layer = 'conv_bn'
223 | graph.add_node(Activation('relu'), name='conv_relu', input=prev_layer)
224 |
225 | graph.add_node(build_pooling_layer(args), name='pool', input='conv_relu')
226 |
227 | graph.add_node(Flatten(), name='flatten', input='pool')
228 | prev_layer = 'flatten'
229 |
230 | # Add two dense layers.
231 | for i in range(2):
232 | layer_name = 'dense%02d' %i
233 | l = build_dense_layer(args, n_hidden=args.n_filters)
234 | graph.add_node(l, name=layer_name, input=prev_layer)
235 | prev_layer = layer_name
236 | if 'normalization' in args.regularization_layer:
237 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
238 | prev_layer = layer_name+'bn'
239 | if 'dropout' in args.regularization_layer:
240 | graph.add_node(Dropout(args.dropout_p), name=layer_name+'do', input=prev_layer)
241 | prev_layer = layer_name+'do'
242 |
243 | # Add sequence of residual blocks.
244 | for i in range(args.n_residual_blocks):
245 | # Add a fixed number of layers per residual block.
246 | block_name = '%02d' % i
247 |
248 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer)
249 | prev_layer = block_input_layer = block_name+'input'
250 |
251 | for layer_num in range(args.n_layers_per_residual_block):
252 | layer_name = 'h%s%02d' % (block_name, layer_num)
253 |
254 | l = build_dense_layer(args, n_hidden=args.n_filters)
255 | graph.add_node(l, name=layer_name, input=prev_layer)
256 | prev_layer = layer_name
257 |
258 | if 'normalization' in args.regularization_layer:
259 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
260 | prev_layer = layer_name+'bn'
261 |
262 |             # Note that this guard tests the block index `i`, not `layer_num`.
263 |             if i < args.n_layers_per_residual_block:
264 |                 graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer)
265 | prev_layer = layer_name+'relu'
266 | if 'dropout' in args.regularization_layer:
267 | graph.add_node(Dropout(args.dropout_p), name=layer_name+'do', input=prev_layer)
268 | prev_layer = layer_name+'do'
269 |
270 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum')
271 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output')
272 | prev_layer = block_input_layer = block_name+'relu'
273 |
274 | graph.add_node(build_dense_layer(args, args.n_classes,
275 | activation='softmax'), name='softmax', input=prev_layer)
276 |
277 | graph.add_output(name='output', input='softmax')
278 |
279 | load_weights(args, graph)
280 |
281 | optimizer = build_optimizer(args)
282 |
283 | graph.compile(loss={'output': args.loss}, optimizer=optimizer)
284 |
285 | return graph
286 |
287 |
288 | def build_ordinary_model(args):
289 | model = Sequential()
290 | model.add(build_embedding_layer(args))
291 | if args.dropout_embedding_p > 0.:
292 | model.add(Dropout(args.dropout_embedding_p))
293 | model.add(build_convolutional_layer(args))
294 | if 'normalization' in args.regularization_layer:
295 | model.add(BatchNormalization())
296 | model.add(Activation('relu'))
297 | if args.dropout_conv_p > 0.:
298 | model.add(Dropout(args.dropout_conv_p))
299 |
300 | model.add(build_pooling_layer(args))
301 | model.add(Flatten())
302 |
303 | for i in range(args.n_fully_connected):
304 | model.add(build_dense_layer(args))
305 | if 'normalization' in args.regularization_layer:
306 | model.add(BatchNormalization())
307 | model.add(Activation('relu'))
308 | if 'dropout' in args.regularization_layer:
309 | model.add(Dropout(args.dropout_p))
310 |
311 | model.add(build_dense_layer(args, args.n_classes,
312 | activation='softmax'))
313 |
314 | load_weights(args, model)
315 |
316 | optimizer = build_optimizer(args)
317 |
318 | model.compile(loss=args.loss, optimizer=optimizer)
319 |
320 | for k,v in json.loads(model.to_json()).items():
321 | print(k)
322 | if k == 'layers':
323 | for l in v:
324 | print(' => %s' % l['name'])
325 |
326 | return model
327 |
328 | def build_model(args):
329 | np.random.seed(args.seed)
330 |
331 | if isinstance(args.n_residual_blocks, int):
332 | return build_residual_model(args)
333 | else:
334 | return build_ordinary_model(args)
335 |
336 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/run-medium.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 |
3 | N=10000000
4 |
5 | #--extra-train-file $(ls data/preposition/prepositions-all-new-train-$N/* | grep -v 00.h5) \
6 |
7 | embedding_weights=data/preposition/prepositions-all-new-weights.npy
8 |
9 | ./train_keras.py \
10 | models/preposition/convnet \
11 | data/preposition/prepositions-all-new-train-$N.h5 \
12 | data/preposition/prepositions-all-new-validate.h5 \
13 | XwindowNULL \
14 | --target-name original_word_code \
15 | --target-data data/preposition/prepositions-all-new-target-data.json \
16 | --description "comparing inputs with convnets - input = XwindowNULL, target = original_word_code, contrasting, $N training examples, Adagrad, n_filters=500 , n_hidden=1000, n_word_dims=300 (pre-trained, frozen), 3 hidden layers, shuffled data" \
17 | --n-vocab 83064 \
18 | --model-cfg optimizer=Adagrad regularization_layer="" patience=10 n_filters=500 n_hidden=1000 n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false \
19 | --n-validation 20000 \
20 | --classification-report \
21 | --shuffle \
22 | --n-epochs 10 \
23 | --log
24 |
25 | ./train_keras.py \
26 | models/preposition/convnet \
27 | data/preposition/prepositions-all-new-train-$N.h5 \
28 | data/preposition/prepositions-all-new-validate.h5 \
29 | XwindowNULL X \
30 | --target-name original_word_code \
31 | --target-data data/preposition/prepositions-all-new-target-data.json \
32 | --description "comparing inputs with convnets - input = XwindowNULL X, target = original_word_code, contrasting, $N training examples, Adagrad, n_filters=500 , n_hidden=1000, n_word_dims=300 (pre-trained, frozen), 3 hidden layers, shuffled data" \
33 | --n-vocab 83064 \
34 | --model-cfg optimizer=Adagrad regularization_layer="" patience=10 n_filters=500 n_hidden=1000 n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false \
35 | --n-validation 20000 \
36 | --classification-report \
37 | --shuffle \
38 | --n-epochs 10 \
39 | --log
40 |
41 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/run-small.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -xe
2 |
3 | N=1000000
4 |
5 | embedding_weights=data/preposition/prepositions-all-new-weights.npy
6 |
7 | function train() {
8 | n_filters=$1
9 | shift
10 | filter_width=$1
11 | shift
12 | features=$@
13 |
14 | features_name=$(echo $features | sed 's, ,-,g')
15 | dest=$features_name-$n_filters-$filter_width
16 |
17 | ./train_keras.py \
18 | models/keras/preposition/convnet \
19 | data/preposition/prepositions-all-new-train-$N-balanced.h5 \
20 | data/preposition/prepositions-all-new-validate-balanced.h5 \
21 | $features \
22 | --model-dest models/keras/preposition/convnet/small/feature-evaluation/$dest \
23 | --target-name original_word_code \
24 | --target-data data/preposition/prepositions-all-new-target-data.json \
25 | --description "comparing inputs with convnets - input = $features, target = original_word_code, contrasting, $N training examples, Adagrad, patience=5, n_filters=$n_filters, filter_width=$filter_width, n_word_dims=300 (pre-trained, frozen), 1 hidden layer, shuffled data" \
26 | --n-vocab 83064 \
27 | --model-cfg optimizer=Adagrad regularization_layer="dropout" n_filters=$n_filters n_word_dims=300 embedding_weights=$embedding_weights train_embeddings=false filter_width=$filter_width patience=5 \
28 | --n-validation 20000 \
29 | --n-epochs 10 \
30 | --shuffle \
31 | --log
32 | }
33 |
34 | function xval5() {
35 | features=$@
36 | for filter_width in 2 3 5
37 | do
38 | for n_filters in 100
39 | do
40 | train $n_filters $filter_width $features
41 | done
42 | done
43 | }
44 |
45 | function xval7() {
46 | features=$@
47 | for filter_width in 2 3 5 7
48 | do
49 | for n_filters in 100
50 | do
51 | train $n_filters $filter_width $features
52 | done
53 | done
54 | }
55 |
56 | function xval9() {
57 | features=$@
58 | for filter_width in 2 3 5 7 9
59 | do
60 | for n_filters in 100
61 | do
62 | train $n_filters $filter_width $features
63 | done
64 | done
65 | }
66 |
67 | xval5 Xwindow
68 | xval7 Xwindow7
69 | xval9 Xwindow9
70 |
71 | xval5 XwindowNULL X
72 | xval7 Xwindow7NULL X
73 | xval9 Xwindow9NULL X
74 |
75 | xval9 X
76 |
77 | xval5 XwindowNULL
78 | xval7 Xwindow7NULL
79 | xval9 Xwindow9NULL
80 |
81 | xval5 Xwindow X
82 | xval7 Xwindow7 X
83 | xval9 Xwindow9 X
84 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/small/find-best-filter-size/find-best.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Get a unique list of the feature names.
4 | for input in $(for file in */model.log; do echo $(dirname $file) | sed 's,-[0-9]$,,'; done | sort | uniq)
5 | do
6 | # For each feature, find the one filter width that yielded the lowest
7 | # validation loss.
8 | for file in ${input}*/model.log
9 | do
10 | echo $input $(dirname $file) $(grep val_acc $file | cat -n | sort -n -r -k17 | tail -1)
11 | done | sort -n -r -k17 | tail -1
12 | done | sort -n -r -k 17
13 |
--------------------------------------------------------------------------------
/models/keras/preposition/convnet/small/find-best-filter-size/find-best.txt:
--------------------------------------------------------------------------------
1 | X-100 X-100-5 10 11-25 18:13 root INFO 170s - loss: 1.2008 - acc: 0.6007 - val_loss: 1.2047 - val_acc: 0.5964
2 |
3 | XwindowNULL-100 XwindowNULL-100-3 10 11-25 19:56 root INFO 160s - loss: 1.1836 - acc: 0.6147 - val_loss: 1.2035 - val_acc: 0.6055
4 | XwindowNULL-X-100 XwindowNULL-X-100-5 10 11-25 12:33 root INFO 177s - loss: 1.0559 - acc: 0.6551 - val_loss: 1.0390 - val_acc: 0.6549
5 | Xwindow-X-100 Xwindow-X-100-5 10 11-26 00:28 root INFO 178s - loss: 1.0339 - acc: 0.6663 - val_loss: 1.0215 - val_acc: 0.6684
6 | Xwindow-100 Xwindow-100-3 10 11-25 08:02 root INFO 161s - loss: 0.9886 - acc: 0.6886 - val_loss: 1.0048 - val_acc: 0.6834
7 |
8 | Xwindow7NULL-100 Xwindow7NULL-100-5 10 11-25 21:07 root INFO 135s - loss: 1.1051 - acc: 0.6404 - val_loss: 1.1552 - val_acc: 0.6223
9 | Xwindow7NULL-X-100 Xwindow7NULL-X-100-5 10 11-25 13:54 root INFO 180s - loss: 1.0307 - acc: 0.6636 - val_loss: 1.0158 - val_acc: 0.6630
10 | Xwindow7-X-100 Xwindow7-X-100-7 10 11-26 02:20 root INFO 189s - loss: 0.9867 - acc: 0.6817 - val_loss: 0.9934 - val_acc: 0.6765
11 | Xwindow7-100 Xwindow7-100-5 10 11-25 09:12 root INFO 135s - loss: 0.9192 - acc: 0.7094 - val_loss: 0.9673 - val_acc: 0.6980
12 |
13 | Xwindow9NULL-100 Xwindow9NULL-100-5 10 11-25 22:30 root INFO 204s - loss: 1.0893 - acc: 0.6448 - val_loss: 1.1373 - val_acc: 0.6236
14 | Xwindow9NULL-X-100 Xwindow9NULL-X-100-9 10 11-25 16:56 root INFO 211s - loss: 0.9888 - acc: 0.6783 - val_loss: 1.0049 - val_acc: 0.6674
15 | Xwindow9-X-100 Xwindow9-X-100-9 10 11-26 04:50 root INFO 211s - loss: 0.9618 - acc: 0.6897 - val_loss: 0.9908 - val_acc: 0.6795
16 | Xwindow9-100 Xwindow9-100-7 10 11-25 10:58 root INFO 135s - loss: 0.8795 - acc: 0.7216 - val_loss: 0.9521 - val_acc: 0.7008
17 |
--------------------------------------------------------------------------------
/models/keras/preposition/lstm/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "regularization_layer": null,
3 | "n_word_dims": 50,
4 | "n_units": 100,
5 | "loss": "categorical_crossentropy",
6 | "patience": 10,
7 | "batch_size": 128,
8 | "optimizer": "SGD",
9 | "learning_rate": 0.001,
10 | "momentum": 0.9,
11 | "decay": 0.0,
12 | "embedding_max_norm": 1000,
13 | "truncate_gradient": -1,
14 | "clipnorm": 0,
15 | "mask_zero": false,
16 | "l2_penalty": 0.0
17 | }
18 |
--------------------------------------------------------------------------------
/models/keras/preposition/lstm/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from keras.models import Sequential
4 | from keras.layers.core import Dense, Dropout, Activation, Flatten
5 | from keras.layers.recurrent import LSTM, GRU
6 | from keras.layers.embeddings import Embedding
7 | from keras.constraints import maxnorm
8 | from keras.regularizers import l2
9 | from keras.optimizers import SGD, Adam, RMSprop, Adadelta, Adagrad
10 |
11 | from modeling.layers import ImmutableEmbedding
12 |
13 | def build_model(args):
14 | print("args", vars(args))
15 |
16 | model = Sequential()
17 |
18 | np.random.seed(args.seed)
19 |
20 | if hasattr(args, 'embedding_weights') and args.embedding_weights is not None:
21 | W = np.load(args.embedding_weights)
22 | if args.train_embeddings:
23 | model.add(Embedding(args.n_vocab, args.n_word_dims,
24 | weights=[W],
25 | W_constraint=maxnorm(args.embedding_max_norm)))
26 | else:
27 | model.add(ImmutableEmbedding(args.n_vocab, args.n_word_dims,
28 | weights=[W]))
29 | else:
30 | model.add(Embedding(args.n_vocab, args.n_word_dims,
31 | mask_zero=args.mask_zero,
32 | W_constraint=maxnorm(args.embedding_max_norm)))
33 |
34 | model.add(LSTM(args.n_word_dims, args.n_units,
35 | truncate_gradient=args.truncate_gradient,
36 | return_sequences=True))
37 | if args.regularization_layer == 'dropout':
38 | model.add(Dropout(0.2))
39 | #elif args.regularization_layer == 'normalization':
40 | # model.add(BatchNormalization((args.n_filters,)))
41 |
42 | model.add(LSTM(args.n_units, args.n_units,
43 | truncate_gradient=args.truncate_gradient,
44 | return_sequences=True))
45 | if args.regularization_layer == 'dropout':
46 | model.add(Dropout(0.2))
47 | #elif args.regularization_layer == 'normalization':
48 | # model.add(BatchNormalization((args.n_filters,)))
49 |
50 | '''
51 | model.add(LSTM(args.n_units, args.n_units,
52 | truncate_gradient=args.truncate_gradient,
53 | return_sequences=True))
54 | if args.regularization_layer == 'dropout':
55 | model.add(Dropout(0.2))
56 | #elif args.regularization_layer == 'normalization':
57 | # model.add(BatchNormalization((args.n_filters,)))
58 | '''
59 |
60 | model.add(LSTM(args.n_units, args.n_units,
61 | truncate_gradient=args.truncate_gradient,
62 | return_sequences=False))
63 | if args.regularization_layer == 'dropout':
64 | model.add(Dropout(0.2))
65 | #elif args.regularization_layer == 'normalization':
66 | # model.add(BatchNormalization((args.n_filters,)))
67 |
68 | model.add(Dense(args.n_units, args.n_classes,
69 | W_regularizer=l2(args.l2_penalty)))
70 | model.add(Activation('softmax'))
71 |
72 | if args.optimizer == 'SGD':
73 | optimizer = SGD(lr=args.learning_rate,
74 | decay=args.decay, momentum=args.momentum,
75 | clipnorm=args.clipnorm)
76 | elif args.optimizer == 'Adam':
77 | optimizer = Adam(clipnorm=args.clipnorm)
78 | elif args.optimizer == 'RMSprop':
79 | optimizer = RMSprop(clipnorm=args.clipnorm)
80 | elif args.optimizer == 'Adadelta':
81 | optimizer = Adadelta(clipnorm=args.clipnorm)
82 | elif args.optimizer == 'Adagrad':
83 | optimizer = Adagrad(clipnorm=args.clipnorm)
84 | else:
85 | raise ValueError("don't know how to use optimizer {0}".format(args.optimizer))
86 |
87 | model.compile(loss=args.loss, optimizer=optimizer)
88 |
89 | return model
90 |
--------------------------------------------------------------------------------
/models/keras/spelling/convnet/exp03-inputs/op_transpose_n_ops_1_n_errors_per_word_3/analysis.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import h5py
3 | import numpy as np
4 | import pandas as pd
5 | import modeling.utils
6 | import spelling.baseline
7 |
8 | def mark(words):
9 | return ['^'+w+'$' for w in words]
10 |
11 | def build_index():
12 | train_hdf5_file = 'data/spelling/experimental/op-transpose-distance-1-errors-per-word-3.h5'
13 | train_h5 = h5py.File(train_hdf5_file)
14 |
15 | train_csv_file = 'data/spelling/experimental/op-transpose-distance-1-errors-per-word-3.csv'
16 | train_df = pd.read_csv(train_csv_file, sep='\t', encoding='utf8')
17 | words = train_df.real_word.tolist()
18 | marked_words = mark(words)
19 |
20 | X_train = train_h5['marked_chars'].value
21 | index_size = np.max(X_train)
22 | i = 0
23 | index = {}
24 |
25 | while len(index) < index_size:
26 | marked_word = marked_words[i]
27 | row = X_train[i]
28 |
29 | for j,idx in enumerate(row):
30 | if idx == 0:
31 | break
32 | index[marked_word[j]] = idx
33 |
34 | i += 1
35 |
36 |     return index, X_train.shape[1]  # also return the padded input width
37 | 
38 | index, input_width = build_index()
39 |
40 | model_dir = 'models/keras/spelling/convnet/exp03-inputs/op_transpose_n_ops_1_n_errors_per_word_3'
41 |
42 | df = pd.read_csv('../spelling/data/aspell-dict.csv.gz', sep='\t', encoding='utf8')
43 | words = df.word.tolist()
44 | vocab = set(words)
45 |
46 | lm = spelling.baseline.CharacterLanguageModel('witten-bell', order=3)
47 | lm.fit(words)
48 |
49 | model, model_cfg = modeling.utils.load_model(model_dir, model_weights=True)
50 |
51 | bins = np.arange(0, 1, .1)
52 | outputs = {}
53 | histograms = {}
54 |
55 | for order in range(1, 4):
56 | print('order %d' % order)
57 | generated = []
58 | # Generate 500k words, controlling for length and excluding those
59 | # that are already in the vocabulary. Only keep the first 100k
60 | # of those that satisfy our requirements.
61 | for g in lm.generate(order, 500000):
62 | if len(g) < 5 or len(g) > 10:
63 | continue
64 | if g in vocab:
65 | continue
66 | generated.append(g)
67 | if len(generated) == 100000:
68 | break
69 |
70 | marked = mark(generated)
71 | X = np.zeros((len(marked), input_width))
72 | for i,word in enumerate(marked):
73 | for j,chr in enumerate(word):
74 | X[i,j] = index[chr]
75 |
76 | output = zip(generated, model.predict(X)[:, 1])
77 | outputs[order] = output
78 | histograms[order] = np.histogram([o[1] for o in output], bins=bins)
79 |
--------------------------------------------------------------------------------
/models/keras/spelling/convnet/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_word_dims": 0,
3 | "n_filters": 0,
4 | "filter_width": 0,
5 | "n_fully_connected": 0,
6 | "n_residual_blocks": 0,
7 |
8 | "train_embeddings": true,
9 | "embedding_init": "uniform",
10 | "batch_normalization": true,
11 |
12 | "optimizer": "Adam",
13 | "loss": "categorical_crossentropy",
14 | "l2_penalty": 0.0,
15 |
16 | "dropout_embedding_p": 0.0,
17 | "dropout_conv_p": 0.0,
18 | "dropout_fc_p": 0.0,
19 |
20 | "patience": 1,
21 | "batch_size": 128,
22 |
23 | "embedding_max_norm": 1000,
24 | "filter_max_norm": 1000,
25 | "dense_max_norm": 1000,
26 | "clipnorm": 0,
27 | "border_mode": "valid"
28 | }
29 |
--------------------------------------------------------------------------------
/models/keras/spelling/convnet/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.setrecursionlimit(5000)
3 | import json
4 | import h5py
5 |
6 | import numpy as np
7 |
8 | from keras.models import Sequential, Graph
9 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer
10 | from keras.layers.normalization import BatchNormalization
11 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
12 | from keras.layers.embeddings import Embedding
13 | from keras.constraints import maxnorm
14 | from keras.regularizers import l2
15 | from keras.optimizers import SGD, Adam, Adadelta, Adagrad, RMSprop
16 |
17 | from modeling.layers import ImmutableEmbedding
18 | from modeling.difference import TemporalDifference
19 | import modeling.data
20 | from modeling.builders import (build_embedding_layer,
21 | build_convolutional_layer, build_pooling_layer,
22 | build_dense_layer, build_optimizer, load_weights)
23 |
24 | class GraphMarshaller(modeling.data.GraphMarshaller):
25 | def marshal(self, data, target=None):
26 | return {
27 | 'input': data,
28 | 'output': target
29 | }
30 |
31 | def unmarshal(self, output):
32 | return output['output']
33 |
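# Illustrative only (assumed usage; the real call sites live in the training
# driver, which is not part of this file): the marshaller converts plain
# arrays into the dict format that keras' Graph API expects and pulls the
# prediction array back out of the dict the Graph returns.
def _marshaller_example(graph, X, y):
    marshaller = GraphMarshaller()
    graph.fit(marshaller.marshal(X, y), batch_size=128, nb_epoch=1)
    return marshaller.unmarshal(graph.predict(marshaller.marshal(X)))
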
34 | class Identity(Layer):
35 | def get_output(self, train):
36 | return self.get_input(train)
37 |
38 | def build_residual_model(args):
39 | graph = Graph()
40 |
41 | graph.add_input('input', input_shape=(args.input_width,), dtype='int')
42 |
43 | graph.add_node(build_embedding_layer(args), name='embedding', input='input')
44 |
45 | graph.add_node(build_convolutional_layer(args), name='conv', input='embedding')
46 | prev_layer = 'conv'
47 | if args.batch_normalization:
48 | graph.add_node(BatchNormalization(), name='conv_bn', input=prev_layer)
49 | prev_layer = 'conv_bn'
50 | graph.add_node(Activation('relu'), name='conv_relu', input=prev_layer)
51 |
52 | graph.add_node(build_pooling_layer(args), name='pool', input='conv_relu')
53 |
54 | graph.add_node(Flatten(), name='flatten', input='pool')
55 | prev_layer = 'flatten'
56 |
57 | # Add some number of fully-connected layers without skip connections.
58 | for i in range(args.n_fully_connected):
59 | layer_name = 'dense%02d' %i
60 | l = build_dense_layer(args, n_hidden=args.n_hidden)
61 | graph.add_node(l, name=layer_name, input=prev_layer)
62 | prev_layer = layer_name
63 | if args.batch_normalization:
64 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
65 | prev_layer = layer_name+'bn'
66 | if args.dropout_fc_p > 0.:
67 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
68 | prev_layer = layer_name+'do'
69 |
70 | # Add sequence of residual blocks.
71 | for i in range(args.n_residual_blocks):
72 | # Add a fixed number of layers per residual block.
73 | block_name = '%02d' % i
74 |
75 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer)
76 | prev_layer = block_input_layer = block_name+'input'
77 |
78 | try:
79 | n_layers_per_residual_block = args.n_layers_per_residual_block
80 | except AttributeError:
81 | n_layers_per_residual_block = 2
82 |
83 | for layer_num in range(n_layers_per_residual_block):
84 | layer_name = 'h%s%02d' % (block_name, layer_num)
85 |
86 | l = build_dense_layer(args, n_hidden=args.n_hidden)
87 | graph.add_node(l, name=layer_name, input=prev_layer)
88 | prev_layer = layer_name
89 |
90 | if args.batch_normalization:
91 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
92 | prev_layer = layer_name+'bn'
93 |
94 |             # ReLU after each layer except the last; the final ReLU comes after the residual merge.
95 |             if layer_num < n_layers_per_residual_block - 1:
96 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer)
97 | prev_layer = layer_name+'relu'
98 | if args.dropout_fc_p > 0.:
99 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
100 | prev_layer = layer_name+'do'
101 |
102 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum')
103 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output')
104 | prev_layer = block_input_layer = block_name+'relu'
105 |
106 | graph.add_node(build_dense_layer(args, args.n_classes,
107 | activation='softmax'), name='softmax', input=prev_layer)
108 |
109 | graph.add_output(name='output', input='softmax')
110 |
111 | load_weights(args, graph)
112 |
113 | optimizer = build_optimizer(args)
114 |
115 | graph.compile(loss={'output': args.loss}, optimizer=optimizer)
116 |
117 | return graph
118 |
119 | def build_ordinary_model(args):
120 | model = Sequential()
121 | model.add(build_embedding_layer(args))
122 | if args.dropout_embedding_p > 0.:
123 | model.add(Dropout(args.dropout_embedding_p))
124 | model.add(build_convolutional_layer(args))
125 | if args.batch_normalization:
126 | model.add(BatchNormalization())
127 | model.add(Activation('relu'))
128 | if args.dropout_conv_p > 0.:
129 | model.add(Dropout(args.dropout_conv_p))
130 |
131 | model.add(build_pooling_layer(args))
132 | model.add(Flatten())
133 |
134 | for i in range(args.n_fully_connected):
135 | model.add(build_dense_layer(args))
136 | if args.batch_normalization:
137 | model.add(BatchNormalization())
138 | model.add(Activation('relu'))
139 | if args.dropout_fc_p > 0.:
140 | model.add(Dropout(args.dropout_fc_p))
141 |
142 | model.add(build_dense_layer(args, args.n_classes,
143 | activation='softmax'))
144 |
145 | load_weights(args, model)
146 |
147 | optimizer = build_optimizer(args)
148 |
149 | model.compile(loss=args.loss, optimizer=optimizer)
150 |
151 | if args.verbose:
152 | for k,v in json.loads(model.to_json()).items():
153 | if k == 'layers':
154 | for l in v:
155 | print(' => %s' % l['name'])
156 |
157 | return model
158 |
159 | def build_model(args):
160 | np.random.seed(args.seed)
161 |
162 | if args.n_residual_blocks > 0:
163 | return build_residual_model(args)
164 | else:
165 | return build_ordinary_model(args)
166 |
167 |
--------------------------------------------------------------------------------
/models/keras/spelling/correction/isolated/binary/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_word_dims": 0,
3 | "n_filters": 0,
4 | "filter_width": 0,
5 | "n_fully_connected": 0,
6 | "n_residual_blocks": 0,
7 |
8 | "train_embeddings": true,
9 | "embedding_init": "uniform",
10 | "batch_normalization": true,
11 |
12 | "optimizer": "Adam",
13 | "loss": "categorical_crossentropy",
14 | "l2_penalty": 0.0,
15 |
16 | "dropout_embedding_p": 0.0,
17 | "dropout_conv_p": 0.0,
18 | "dropout_fc_p": 0.0,
19 |
20 | "patience": 1,
21 | "batch_size": 128,
22 |
23 | "embedding_max_norm": 1000,
24 | "filter_max_norm": 1000,
25 | "dense_max_norm": 1000,
26 | "clipnorm": 0,
27 | "border_mode": "valid"
28 | }
29 |
--------------------------------------------------------------------------------
/models/keras/spelling/correction/isolated/binary/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.setrecursionlimit(5000)
3 | 
4 | import json
5 | import h5py
6 |
7 | from sklearn.utils import check_random_state
8 |
9 | import numpy as np
10 |
11 | from keras.models import Sequential, Graph
12 | from keras.utils import np_utils
13 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer
14 | from keras.layers.normalization import BatchNormalization
15 |
16 | import modeling.data
17 | from modeling.builders import (build_embedding_layer,
18 | build_convolutional_layer, build_pooling_layer,
19 | build_dense_layer, build_optimizer, load_weights,
20 | build_hierarchical_softmax_layer)
21 | from modeling.utils import balanced_class_weights
22 |
23 | class SingleFileDataset(object):
24 | def __init__(self, file_path, data_name, target_name, batch_size, random_state=17):
25 | assert isinstance(data_name, (list,tuple))
26 | assert isinstance(target_name, (list,tuple))
27 |
28 | random_state = check_random_state(random_state)
29 |
30 | self.__dict__.update(locals())
31 | del self.self
32 |
33 | self.load_data()
34 |
35 | def load_data(self):
36 | self.data = {}
37 | self.target = {}
38 | self.target_one_hot = {}
39 |
40 | f = h5py.File(self.file_path)
41 | self.n = None
42 |
43 | for data_name in self.data_name:
44 | self.data[data_name] = f[data_name].value
45 | if self.n is None:
46 | self.n = len(self.data[data_name])
47 | else:
48 | assert len(self.data[data_name]) == self.n
49 | for target_name in self.target_name:
50 | target = f[target_name].value
51 | assert len(target) == self.n
52 |
53 | self.target[target_name] = target
54 | n_classes = np.max(target) + 1
55 | self.target_one_hot[target_name] = np_utils.to_categorical(target, n_classes)
56 | f.close()
57 |
58 | def get_dict(self, one_hot=True):
59 | d = {}
60 | for data_name in self.data_name:
61 | d[data_name] = self.data[data_name]
62 | for target_name in self.target_name:
63 | if one_hot:
64 | d[target_name] = self.target_one_hot[target_name]
65 | else:
66 | d[target_name] = self.target[target_name]
67 | return d
68 |
69 | def class_weights(self, class_weight_exponent):
70 | return balanced_class_weights(
71 | self.target['binary_target'],
72 | 2,
73 | class_weight_exponent)
74 |
75 | def generate(self):
76 | while 1:
77 | idx = self.random_state.choice(self.n, size=self.batch_size, replace=False)
78 | batch = {}
79 | for data_name in self.data_name:
80 | batch[data_name] = self.data[data_name][idx]
81 | for target_name in self.target_name:
82 | batch[target_name] = self.target_one_hot[target_name][idx]
83 | yield batch
84 |
85 | class Identity(Layer):
86 | def get_output(self, train):
87 | return self.get_input(train)
88 |
89 | def add_bn_relu(graph, args, prev_layer):
90 | bn_name = prev_layer + '_bn'
91 | relu_name = prev_layer + '_relu'
92 | if args.batch_normalization:
93 | graph.add_node(BatchNormalization(), name=bn_name, input=prev_layer)
94 | prev_layer = bn_name
95 | graph.add_node(Activation('relu'), name=relu_name, input=prev_layer)
96 | return relu_name
97 |
98 | def build_model(args, train_data, validation_data):
99 | np.random.seed(args.seed)
100 |
101 | graph = Graph()
102 |
103 | non_word_input = 'non_word_marked_chars'
104 | real_word_input = 'real_word_marked_chars'
105 |
106 | non_word_input_width = train_data.data[non_word_input].shape[1]
107 | real_word_input_width = train_data.data[real_word_input].shape[1]
108 |
109 | print('non_word_input_width', non_word_input_width)
110 | print('real_word_input_width', real_word_input_width)
111 |
112 | graph.add_input(non_word_input, input_shape=(non_word_input_width,), dtype='int')
113 | graph.add_node(build_embedding_layer(args, input_width=non_word_input_width),
114 | name='non_word_embedding', input=non_word_input)
115 | graph.add_node(build_convolutional_layer(args), name='non_word_conv', input='non_word_embedding')
116 | non_word_prev_layer = add_bn_relu(graph, args, 'non_word_conv')
117 | graph.add_node(build_pooling_layer(args, input_width=non_word_input_width),
118 | name='non_word_pool', input=non_word_prev_layer)
119 | graph.add_node(Flatten(), name='non_word_flatten', input='non_word_pool')
120 |
121 | graph.add_input(real_word_input, input_shape=(real_word_input_width,), dtype='int')
122 | graph.add_node(build_embedding_layer(args, input_width=real_word_input_width),
123 | name='real_word_embedding', input=real_word_input)
124 | graph.add_node(build_convolutional_layer(args), name='real_word_conv', input='real_word_embedding')
125 | real_word_prev_layer = add_bn_relu(graph, args, 'real_word_conv')
126 | graph.add_node(build_pooling_layer(args, input_width=real_word_input_width),
127 | name='real_word_pool', input=real_word_prev_layer)
128 | graph.add_node(Flatten(), name='real_word_flatten', input='real_word_pool')
129 |
130 |     # Add fully-connected layers; the first one joins the two branches.
131 |     prev_layer = 'join_non_and_real'  # placeholder name; assumes n_fully_connected >= 1
132 | for i in range(args.n_fully_connected):
133 | layer_name = 'dense%02d' %i
134 | l = build_dense_layer(args, n_hidden=args.n_hidden)
135 | if i == 0:
136 | graph.add_node(l, name=layer_name,
137 | inputs=['non_word_flatten', 'real_word_flatten'])
138 | else:
139 | graph.add_node(l, name=layer_name, input=prev_layer)
140 | prev_layer = layer_name
141 | if args.batch_normalization:
142 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
143 | prev_layer = layer_name+'bn'
144 | if args.dropout_fc_p > 0.:
145 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
146 | prev_layer = layer_name+'do'
147 |
148 | # Add sequence of residual blocks.
149 | for i in range(args.n_residual_blocks):
150 | # Add a fixed number of layers per residual block.
151 | block_name = '%02d' % i
152 |
153 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer)
154 | prev_layer = block_input_layer = block_name+'input'
155 |
156 | try:
157 | n_layers_per_residual_block = args.n_layers_per_residual_block
158 | except AttributeError:
159 | n_layers_per_residual_block = 2
160 |
161 | for layer_num in range(n_layers_per_residual_block):
162 | layer_name = 'h%s%02d' % (block_name, layer_num)
163 |
164 | l = build_dense_layer(args, n_hidden=args.n_hidden)
165 | graph.add_node(l, name=layer_name, input=prev_layer)
166 | prev_layer = layer_name
167 |
168 | if args.batch_normalization:
169 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
170 | prev_layer = layer_name+'bn'
171 |
172 |             # ReLU after each layer except the last; the final ReLU comes after the residual merge.
173 |             if layer_num < n_layers_per_residual_block - 1:
174 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer)
175 | prev_layer = layer_name+'relu'
176 | if args.dropout_fc_p > 0.:
177 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
178 | prev_layer = layer_name+'do'
179 |
180 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum')
181 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output')
182 | prev_layer = block_input_layer = block_name+'relu'
183 |
184 | #if hasattr(args, 'n_hsm_classes'):
185 | # graph.add_node(build_hierarchical_softmax_layer(args),
186 | # name='softmax', input=prev_layer)
187 | #else:
188 |
189 | graph.add_node(build_dense_layer(args, 2,
190 | activation='softmax'), name='softmax', input=prev_layer)
191 |
192 | graph.add_output(name='binary_target', input='softmax')
193 |
194 | load_weights(args, graph)
195 |
196 | optimizer = build_optimizer(args)
197 |
198 | graph.compile(loss={'binary_target': args.loss}, optimizer=optimizer)
199 |
200 | return graph
201 |
202 | def load_train(args, model_cfg):
203 | return SingleFileDataset(
204 | args.train_path,
205 | args.data_name, [args.target_name],
206 | model_cfg.batch_size, args.seed)
207 |
208 | def load_validation(args, model_cfg):
209 | return SingleFileDataset(
210 | args.validation_path,
211 | args.data_name, [args.target_name],
212 | model_cfg.batch_size, args.seed)
213 |
214 | def fit_model(graph, train_data, validation_data, args, callbacks=[]):
215 | graph.fit_generator(train_data.generate(),
216 | samples_per_epoch=int(train_data.n/100),
217 | nb_epoch=args.n_epochs,
218 | validation_data=validation_data.get_dict(),
219 | callbacks=callbacks,
220 | class_weight=train_data.class_weights(args.class_weight_exponent))
221 |
222 | #fit_generator(generator, samples_per_epoch, nb_epoch, verbose=1, callbacks=[], validation_data=None,
223 | # nb_val_samples=None, class_weight={}, nb_worker=1, nb_val_worker=None)
224 |
225 |
--------------------------------------------------------------------------------
/models/keras/spelling/correction/isolated/multiclass/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_word_dims": 0,
3 | "n_filters": 0,
4 | "filter_width": 0,
5 | "n_fully_connected": 0,
6 | "n_residual_blocks": 0,
7 |
8 | "train_embeddings": true,
9 | "embedding_init": "uniform",
10 | "batch_normalization": true,
11 |
12 | "optimizer": "Adam",
13 | "loss": "categorical_crossentropy",
14 | "l2_penalty": 0.0,
15 |
16 | "dropout_embedding_p": 0.0,
17 | "dropout_conv_p": 0.0,
18 | "dropout_fc_p": 0.0,
19 |
20 | "n_classes": 119774,
21 | "patience": 1,
22 | "batch_size": 128,
23 |
24 | "embedding_max_norm": 1000,
25 | "filter_max_norm": 1000,
26 | "dense_max_norm": 1000,
27 | "clipnorm": 0,
28 | "border_mode": "valid"
29 | }
30 |
--------------------------------------------------------------------------------
/models/keras/spelling/correction/isolated/multiclass/model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.setrecursionlimit(5000)
3 | 
4 | import json
5 | import h5py
6 |
7 | from sklearn.utils import check_random_state
8 |
9 | import numpy as np
10 |
11 | from keras.models import Sequential, Graph
12 | from keras.utils import np_utils
13 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Layer
14 | from keras.layers.normalization import BatchNormalization
15 |
16 | import modeling.data
17 | from modeling.builders import (build_embedding_layer,
18 | build_convolutional_layer, build_pooling_layer,
19 | build_dense_layer, build_optimizer, load_weights,
20 | build_hierarchical_softmax_layer)
21 | from modeling.utils import balanced_class_weights
22 |
23 | class HDF5FileDataset(object):
24 | def __init__(self, file_path, data_name, target_name, batch_size, one_hot=True, random_state=17):
25 | assert isinstance(data_name, (list,tuple))
26 | assert isinstance(target_name, (list,tuple))
27 |
28 | random_state = check_random_state(random_state)
29 |
30 | self.__dict__.update(locals())
31 | del self.self
32 |
33 | self._load_data()
34 | self._check_data()
35 |
36 | def _load_data(self):
37 | self.hdf5_file = h5py.File(self.file_path)
38 | self.n_classes = {}
39 | for target_name in self.target_name:
40 | self.n_classes[target_name] = np.max(self.hdf5_file[target_name])+1
41 |
42 | def _check_data(self):
43 | self.n = None
44 | for data_name in self.data_name:
45 | if self.n is None:
46 | self.n = len(self.hdf5_file[data_name])
47 | else:
48 | assert len(self.hdf5_file[data_name]) == self.n
49 | for target_name in self.target_name:
50 | assert len(self.hdf5_file[target_name]) == self.n
51 |
52 | def __getitem__(self, name):
53 | return self.hdf5_file[name].value
54 |
55 | def class_weights(self, class_weight_exponent, target='multiclass_correction_target'):
56 | return balanced_class_weights(
57 |             self.hdf5_file[target],
58 |             self.n_classes[target],
59 |             class_weight_exponent)
60 |
61 | def generator(self, one_hot=None, batch_size=None):
62 | if one_hot is None: one_hot = self.one_hot
63 | if batch_size is None: batch_size = self.batch_size
64 |
65 | while 1:
66 | idx = self.random_state.choice(self.n, size=batch_size, replace=False)
67 | batch = {}
68 | for data_name in self.data_name:
69 | batch[data_name] = self.hdf5_file[data_name].value[idx]
70 | for target_name in self.target_name:
71 | target = self.hdf5_file[target_name].value[idx]
72 | if one_hot:
73 | batch[target_name] = np_utils.to_categorical(target,
74 | self.n_classes[target_name])
75 | else:
76 | batch[target_name] = target
77 |
78 | yield batch
79 |
80 | class Identity(Layer):
81 | def get_output(self, train):
82 | return self.get_input(train)
83 |
84 | def add_bn_relu(graph, args, prev_layer):
85 | bn_name = prev_layer + '_bn'
86 | relu_name = prev_layer + '_relu'
87 | if args.batch_normalization:
88 | graph.add_node(BatchNormalization(), name=bn_name, input=prev_layer)
89 | prev_layer = bn_name
90 | graph.add_node(Activation('relu'), name=relu_name, input=prev_layer)
91 | return relu_name
92 |
93 | def build_model(args, train_data):
94 | np.random.seed(args.seed)
95 |
96 | graph = Graph()
97 |
98 | non_word_input = 'non_word_marked_chars'
99 | non_word_input_width = train_data[non_word_input].shape[1]
100 |
101 | graph.add_input(non_word_input, input_shape=(non_word_input_width,), dtype='int')
102 | graph.add_node(build_embedding_layer(args, input_width=non_word_input_width),
103 | name='non_word_embedding', input=non_word_input)
104 | graph.add_node(build_convolutional_layer(args), name='non_word_conv', input='non_word_embedding')
105 | non_word_prev_layer = add_bn_relu(graph, args, 'non_word_conv')
106 | graph.add_node(build_pooling_layer(args, input_width=non_word_input_width),
107 | name='non_word_pool', input=non_word_prev_layer)
108 | graph.add_node(Flatten(), name='non_word_flatten', input='non_word_pool')
109 |
110 | # Add some number of fully-connected layers without skip connections.
111 | prev_layer = 'non_word_flatten'
112 | for i in range(args.n_fully_connected):
113 | layer_name = 'dense%02d' %i
114 | l = build_dense_layer(args, n_hidden=args.n_hidden)
115 | graph.add_node(l, name=layer_name, input=prev_layer)
116 | prev_layer = layer_name
117 | if args.batch_normalization:
118 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
119 | prev_layer = layer_name+'bn'
120 | if args.dropout_fc_p > 0.:
121 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
122 | prev_layer = layer_name+'do'
123 |
124 | # Add sequence of residual blocks.
125 | for i in range(args.n_residual_blocks):
126 | # Add a fixed number of layers per residual block.
127 | block_name = '%02d' % i
128 |
129 | graph.add_node(Identity(), name=block_name+'input', input=prev_layer)
130 | prev_layer = block_input_layer = block_name+'input'
131 |
132 | try:
133 | n_layers_per_residual_block = args.n_layers_per_residual_block
134 | except AttributeError:
135 | n_layers_per_residual_block = 2
136 |
137 | for layer_num in range(n_layers_per_residual_block):
138 | layer_name = 'h%s%02d' % (block_name, layer_num)
139 |
140 | l = build_dense_layer(args, n_hidden=args.n_hidden)
141 | graph.add_node(l, name=layer_name, input=prev_layer)
142 | prev_layer = layer_name
143 |
144 | if args.batch_normalization:
145 | graph.add_node(BatchNormalization(), name=layer_name+'bn', input=prev_layer)
146 | prev_layer = layer_name+'bn'
147 |
148 |             # ReLU after each layer except the last; the final ReLU comes after the residual merge.
149 |             if layer_num < n_layers_per_residual_block - 1:
150 | graph.add_node(Activation('relu'), name=layer_name+'relu', input=prev_layer)
151 | prev_layer = layer_name+'relu'
152 | if args.dropout_fc_p > 0.:
153 | graph.add_node(Dropout(args.dropout_fc_p), name=layer_name+'do', input=prev_layer)
154 | prev_layer = layer_name+'do'
155 |
156 | graph.add_node(Identity(), name=block_name+'output', inputs=[block_input_layer, prev_layer], merge_mode='sum')
157 | graph.add_node(Activation('relu'), name=block_name+'relu', input=block_name+'output')
158 | prev_layer = block_input_layer = block_name+'relu'
159 |
160 | n_classes = np.max(train_data['multiclass_correction_target']) + 1
161 | if hasattr(args, 'n_hsm_classes'):
162 | graph.add_node(build_hierarchical_softmax_layer(args),
163 | name='softmax', input=prev_layer)
164 | else:
165 | graph.add_node(build_dense_layer(args, n_classes,
166 | activation='softmax'), name='softmax', input=prev_layer)
167 |
168 | graph.add_output(name='multiclass_correction_target', input='softmax')
169 |
170 | load_weights(args, graph)
171 |
172 | optimizer = build_optimizer(args)
173 |
174 | graph.compile(loss={'multiclass_correction_target': args.loss}, optimizer=optimizer)
175 |
176 | return graph
177 |
178 | def fit(config, callbacks=[]):
179 | train_data = HDF5FileDataset(
180 | config.train_path,
181 | config.data_name,
182 | [config.target_name],
183 | config.batch_size,
184 |         random_state=config.seed)
185 |
186 | validation_data = HDF5FileDataset(
187 | config.validation_path,
188 | config.data_name,
189 | [config.target_name],
190 | config.batch_size,
191 |         random_state=config.seed)
192 |
193 | graph = build_model(config, train_data)
194 |
195 | graph.fit_generator(train_data.generator(),
196 | samples_per_epoch=int(train_data.n/100),
197 | nb_epoch=config.n_epochs,
198 | validation_data=validation_data.generator(),
199 | nb_val_samples=10000,
200 | callbacks=callbacks,
201 | class_weight=train_data.class_weights(config.class_weight_exponent))
202 |
--------------------------------------------------------------------------------
/models/keras/spelling/toksents.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import os
5 | from data import data
6 | import marshal
7 |
8 | sent_file = sys.argv[1]
9 | d = data.load_data(sent_file)
10 | token_seq = data.tokenize(d)
11 | marshal_file = os.path.splitext(sent_file)[0] + '.marshal'
12 | marshal.dump(token_seq, open(marshal_file, 'wb'))
13 | print('DONE ' + sent_file)
14 |
--------------------------------------------------------------------------------
/models/lasagne/spelling/convnet/model.json:
--------------------------------------------------------------------------------
1 | {
2 | "n_word_dims": 50,
3 | "use_difference": false,
4 | "n_filters": 1000,
5 | "filter_width": 4,
6 | "loss": "categorical_crossentropy",
7 | "patience": 400,
8 | "batch_size": 128,
9 | "optimizer": "Adagrad",
10 | "learning_rate": 0.1,
11 | "momentum": 0.9,
12 | "decay": 0.0,
13 | "embedding_max_norm": 1000,
14 | "filter_max_norm": 1000,
15 | "dense_max_norm": 1000,
16 | "l2_penalty": 0.0,
17 | "clipnorm": 0,
18 | "regularization_layer": "dropout",
19 | "dropout_p_conv": 0.1,
20 | "dropout_p": 0.5
21 | }
22 |
--------------------------------------------------------------------------------
/models/lasagne/spelling/convnet/model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import sys
4 | import os
5 | import time
6 |
7 | import numpy as np
8 | import theano
9 | import theano.tensor as T
10 |
11 | import modeling.lasagne_model
12 | import lasagne
13 |
14 | class Model(modeling.lasagne_model.Classifier):
15 | def build_input_var(self):
16 | return T.imatrix('inputs')
17 |
18 | def build_target_var(self):
19 | return T.ivector('targets')
20 |
21 |     def build_updates(self):
22 |         # NOTE: the update rule and learning rate are hard-coded here; the
23 |         # optimizer and learning_rate settings in model.json are not used.
24 |         return lasagne.updates.nesterov_momentum(self.train_loss, self.params, learning_rate=0.01, momentum=0.9)
25 |
26 | def build_model(self):
27 | # Input layer
28 | input_shape = (self.config.batch_size, self.config.input_width)
29 | print('input_shape', input_shape)
30 | model = lasagne.layers.InputLayer(shape=input_shape,
31 | input_var=self.input_var)
32 |
33 | # Embedding layer
34 | model = lasagne.layers.EmbeddingLayer(model,
35 | self.config.n_vocab, self.config.n_word_dims)
36 |
37 | # Convolutional layer
38 | model = lasagne.layers.Conv1DLayer(model,
39 | num_filters=self.config.n_filters,
40 | filter_size=self.config.filter_width,
41 | nonlinearity=lasagne.nonlinearities.rectify,
42 | W=lasagne.init.GlorotUniform())
43 |
44 |         # Max-pooling layer: pool over the full output of the 'valid'
45 |         # convolution, whose length is input_width - filter_width + 1.
46 |         pool_size = self.config.input_width - self.config.filter_width + 1
47 |         print('pool_size', pool_size)
48 |         model = lasagne.layers.MaxPool1DLayer(model, pool_size=pool_size)
49 |
50 | # Flatten layer
51 | #model = lasagne.layers.FlattenLayer(model)
52 |
53 | # Fully-connected layer
54 | model = lasagne.layers.DenseLayer(
55 | lasagne.layers.dropout(model, p=.0),
56 | num_units=self.config.n_filters*2,
57 | nonlinearity=lasagne.nonlinearities.rectify)
58 |
59 | # Output layer
60 | model = lasagne.layers.DenseLayer(
61 | lasagne.layers.dropout(model, p=.5),
62 | num_units=self.config.n_classes,
63 | nonlinearity=lasagne.nonlinearities.softmax)
64 |
65 | return model
66 |
--------------------------------------------------------------------------------
/notebooks/ConvnetSensitivityAnalysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "np.set_printoptions(precision=3)\n",
13 | "np.set_printoptions(suppress=True)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "import itertools\n",
25 | "\n",
26 | "def powerset(iterable):\n",
27 | " s = list(iterable)\n",
28 | " return itertools.chain.from_iterable(\n",
29 | " itertools.combinations(s, r) for r in range(len(s)+1))"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 4,
35 | "metadata": {
36 | "collapsed": false
37 | },
38 | "outputs": [],
39 | "source": [
40 | "import json\n",
41 | "import pandas as pd\n",
42 | "from sklearn.metrics import precision_recall_fscore_support\n",
43 | "import modeling.utils"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 5,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [],
53 | "source": [
54 | "# For scikit learn metrics.\n",
55 | "precision_recall_average = 'macro'"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 6,
61 | "metadata": {
62 | "collapsed": true
63 | },
64 | "outputs": [],
65 | "source": [
66 | "# Best so far, but imbalanced.\n",
67 | "model_dir = 'models/keras/preposition/convnet/20a7a6b088ee11e5b2b374d435ed6f3a/'\n",
68 | "\n",
69 | "# Balanced.\n",
70 | "# model_dir = 'models/keras/preposition/convnet/balanced/'\n",
71 | "\n",
72 | "# Load the test set for evaluation.\n",
73 | "data_file = 'data/preposition/prepositions-all-new-test.h5'"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 7,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "Loading weights (build_model)\n",
88 | "Loading weights\n"
89 | ]
90 | },
91 | {
92 | "name": "stderr",
93 | "output_type": "stream",
94 | "text": [
95 | "Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "model, model_cfg = modeling.utils.load_model(model_dir, load_weights=True)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 8,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "[(999552, 5), (999552, 52)]\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "model_data = modeling.utils.load_all_model_data(data_file, model_cfg)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 9,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [],
129 | "source": [
130 | "# Load target data or metadata (e.g. mapping between numeric target variable and preposition).\n",
131 | "target_data_file = 'data/preposition/prepositions-all-new-target-data.json'\n",
132 | "target_data = json.load(open(target_data_file))"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": [
143 | "def compute_n_unknown_words():\n",
144 | " n_unknown_words = np.zeros_like(model_data.len)\n",
145 | " for i in np.arange(0, len(model_data.len)):\n",
146 | " n_unknown_words[i] = len(np.where(model_data.data[i, 0:model_data.len[i]] == 0)[0])\n",
147 | " return n_unknown_words"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "Sensitivity analysis of effect of position of unknown words in window around preposition\n",
155 | "=======\n",
156 | "1. Take all examples in which the window around the preposition contains no unknown words.\n",
157 | "2. For each set in the powerset of positions in the window (excluding the center, where the preposition occurs):\n",
158 | " 1. Set the words in that position to be unknown (i.e. assign 0 to that position) for all examples.\n",
159 | " 2. Run the examples through the model.\n",
160 | "3. Evaluate the model's performance."
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [],
170 | "source": [
171 | "def sensitivity_analysis(n=50000):\n",
172 | " n_unknown_words = compute_n_unknown_words()\n",
173 | "\n",
174 | " print('# of examples ' + str(len(model_data.data)))\n",
175 | " print('# of examples with no unknown words ' + str((n_unknown_words==0).sum()))\n",
176 | " \n",
177 | " error_detection_targets = np.ones_like(model_data.current_word_code)\n",
178 | " evens = np.arange(0, len(model_data.target), 2)\n",
179 | " error_detection_targets[evens] = 0\n",
180 | "\n",
181 | " no_unknown_words_data = model_data.data[n_unknown_words == 0]\n",
182 | " no_unknown_words_correction_targets = model_data.target[n_unknown_words == 0]\n",
183 | " no_unknown_words_detection_targets = error_detection_targets[n_unknown_words == 0]\n",
184 | "\n",
185 | " window_size = 5\n",
186 | " center = 2\n",
187 | "\n",
188 | " assert len(np.where(model_data.data[:, center] == 0)[0]) == 0\n",
189 | "\n",
190 | " indices_in_window = [center-2, center-1, center+1, center+2]\n",
191 | "\n",
192 | " masks = [mask for mask in powerset(indices_in_window)]\n",
193 | "\n",
194 | " correction_results = {}\n",
195 | " \n",
196 | " results_df = None\n",
197 | "\n",
198 | " for mask in masks:\n",
199 | " data = no_unknown_words_data.copy()[0:n]\n",
200 | " mask = np.array(mask, dtype=int)\n",
201 | "\n",
202 | " data[:, mask] = 0\n",
203 | "\n",
204 | " for i in np.arange(len(data)):\n",
205 | " data[i, mask + model_data.position[i] + 3] = 0\n",
206 | "\n",
207 | " no_unknown_words_correction_preds = model.predict_classes(data, verbose=0)\n",
208 | "\n",
209 | " unknowns_str = ['_'] * (len(indices_in_window) + 1)\n",
210 | " for x in mask:\n",
211 | " unknowns_str[x] = \"?\"\n",
212 | " unknowns_str[center] = \"P\"\n",
213 | "\n",
214 | " # Error correction\n",
215 | " p, r, f, _ = precision_recall_fscore_support(\n",
216 | " no_unknown_words_correction_targets[0:n],\n",
217 | " no_unknown_words_correction_preds,\n",
218 | " average=precision_recall_average)\n",
219 | " \n",
220 | " row = pd.DataFrame({\n",
221 | " \"pos-2\": [unknowns_str[0]],\n",
222 | " \"pos-1\": [unknowns_str[1]],\n",
223 | " \"pos-0\": [unknowns_str[2]],\n",
224 | " \"pos+1\": [unknowns_str[3]],\n",
225 | " \"pos+2\": [unknowns_str[4]],\n",
226 | " \"precision\": [p],\n",
227 | " \"recall\": [r],\n",
228 | " \"f1\": [f],\n",
229 | " \"n\": [n]\n",
230 | " })\n",
231 | " if results_df is None:\n",
232 | " results_df = row\n",
233 | " else:\n",
234 | " results_df = pd.concat([results_df, row])\n",
235 | "\n",
236 | " results_df = results_df[[\"pos-2\", \"pos-1\", \"pos-0\", \"pos+1\", \"pos+2\", \"precision\", \"recall\", \"f1\", \"n\"]]\n",
237 | " print(results_df.to_latex(index=False, float_format=lambda f: '%.02f' % f))\n",
238 | " \n",
239 | "sensitivity_analysis(n=10000)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {
246 | "collapsed": true
247 | },
248 | "outputs": [],
249 | "source": []
250 | }
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python 2",
255 | "language": "python",
256 | "name": "python2"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 2
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython2",
268 | "version": "2.7.10"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 0
273 | }
274 |
--------------------------------------------------------------------------------
/notebooks/notes.txt:
--------------------------------------------------------------------------------
1 | ---------------------------------------------------------------------------
2 | Given our current architecture, how helpful is increasing the training set size?
3 | ---------------------------------------------------------------------------
4 |
5 | Embedding size: 50
6 | Number of convolutional filters: 500
7 | Filter width: 4
8 | Max filter norm: 1 (might be too small, considering filter width)
9 | Hidden fully-connected layers: 3
10 | Fully-connected layer sizes: 1000, 1000, 500
11 | Learning rate: 0.03
12 | Momentum: 0.9
13 | Weight decay: 0
14 |
15 | Train on 1m, validate on 200k:
16 |
17 | acc: 0.1336 - val_acc: 0.2124 - val_f1: 0.12
18 | acc: 0.2628 - val_acc: 0.3049 - val_f1: 0.21
19 | acc: 0.3093 - val_acc: 0.3476 - val_f1: 0.26
20 | acc: 0.3571 - val_acc: 0.3972 - val_f1: 0.33
21 | acc: 0.3913 - val_acc: 0.4225 - val_f1: 0.36
22 | acc: 0.4091 - val_acc: 0.4312 - val_f1: 0.38
23 |
24 | Train on 2m, validate on 200k:
25 |
26 | acc: 0.1909 - val_acc: 0.3048 - val_f1: 0.22
27 | acc: 0.3421 - val_acc: 0.4070 - val_f1: 0.34
28 | acc: 0.4028 - val_acc: 0.4362 - val_f1: 0.38
29 | acc: 0.4208 - val_acc: 0.4464 - val_f1: 0.40
30 | acc: 0.4293 - val_acc: 0.4522 - val_f1: 0.40
31 |
32 | Train on 4m, validate on 200k:
33 |
34 | acc: 0.2616 - val_acc: 0.4063 - val_f1: 0.25
35 | acc: 0.4095 - val_acc: 0.4456 - val_f1: 0.33
36 | acc: 0.4315 - val_acc: 0.4573 - val_f1: 0.35
37 | acc: 0.4398 - val_acc: 0.4630 - val_f1: 0.37
38 | acc: 0.4448 - val_acc: 0.4658 - val_f1: 0.38
39 | acc: 0.4484 - val_acc: 0.4673 - val_f1: 0.39
40 | acc: 0.4509 - val_acc: 0.4693 - val_f1: 0.39
41 |
42 | Loosen max norm constraint on word embeddings to 2, use class weights
43 | to help model perform better on less frequent classes:
44 |
45 | Embedding size: 50
46 | Number of convolutional filters: 500
47 | Filter width: 4
48 | Hidden fully-connected layers:
49 | Number: 3
50 | Fully-connected layer sizes: 1000, 1000, 500
51 | Learning rate: 0.1
52 | Momentum: 0.9
53 | Decay: 0.000000001
54 |
55 | Train on 4m, validate on 200k:
56 | acc: 0.3606 - val_acc: 0.4497 - val_f1: 0.40
57 | acc: 0.4343 - val_acc: 0.4619 - val_f1: 0.42
58 | acc: 0.4366 - val_acc: 0.4585 - val_f1: 0.42
59 | acc: 0.4362 - val_acc: 0.4624 - val_f1: 0.42
60 | acc: 0.4335 - val_acc: 0.4553 - val_f1: 0.41
61 | acc: 0.4316 - val_acc: 0.4541 - val_f1: 0.41
62 |
63 | ---------------------------------------------------------------------------
64 | What happens when we use an LSTM network instead of a temporal
65 | convolutional network?
66 | ---------------------------------------------------------------------------
67 |
68 | Embedding size: 50
69 | Number of LSTM layers: 3
70 | Number of units in LSTM layers: 64
71 | Dropout after each LSTM layer: 0.2
72 | Learning rate: 0.1
73 | Momentum: 0.9
74 | Decay: 0
75 | Gradient truncation: -1 (classical BPTT)
76 | Norm clipping threshold: 0 (no clipping)
77 |
78 | Train on 4m, validate on 200k:
79 | acc: 0.2791 - val_acc: 0.3589 - val_f1: 0.29
80 | acc: 0.4383 - val_acc: 0.5068 - val_f1: 0.47
81 | acc: 0.4863 - val_acc: 0.5203 - val_f1: 0.49
82 | acc: 0.5251 - val_acc: 0.5438 - val_f1: 0.51
83 | acc: 0.5461 - val_acc: 0.5613 - val_f1: 0.53
84 | acc: 0.5614 - val_acc: 0.5697 - val_f1: 0.55
85 | acc: 0.5736 - val_acc: 0.5744 - val_f1: 0.56
86 | acc: 0.5825 - val_acc: 0.5800 - val_f1: 0.56
87 |
88 | New configuration (with clipping)
89 |
90 | Embedding size: 50
91 | Number of LSTM layers: 3
92 | Number of units in LSTM layers: 64
93 | Dropout after each LSTM layer: 0.2
94 | Learning rate: 0.1
95 | Momentum: 0.9
96 | Decay: 0
97 | Gradient truncation: -1 (classical BPTT)
98 | Norm clipping threshold: 5 (clip gradient norms at 5)
99 |
100 | Train on 8m, validate on 200k (here one epoch is 100k examples; I had
101 | to split up the 8m examples into separate files, and each gets its
102 | own epoch):
103 |
104 | acc: 0.4823 - val_acc: 0.491425
105 |
106 | New configuration (with clipping)
107 |
108 | Embedding size: 50
109 | Number of LSTM layers: 4
110 | Number of units in LSTM layers: 500
111 | Dropout after each LSTM layer: 0.2
112 | Learning rate: 0.1
113 | Momentum: 0.9
114 | Decay: 0
115 | Gradient truncation: -1 (classical BPTT)
116 | Norm clipping threshold: 5 (clip gradient norms at 5)
117 |
118 | Train on 16m, validate on 200k:
119 |
120 | ---------------------------------------------------------------------------
121 | After creating a balanced data set (which excluded 'about', because it
122 | is so much less frequent than the other prepositions), I may have discovered
123 | that training a model with intra-minibatch contrasting cases -- that is,
124 | with every sentence from the corpus being accompanied by an example that
125 | is the same sentence with an error introduced -- is essential to being
126 | able to train this model (see the sketch below).
127 | ---------------------------------------------------------------------------
128 |
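A sketch of how such contrasting minibatches can be built; corrupt() is a
hypothetical stand-in for the error-injection code (cf. contrasting_cases.py):

    import numpy as np

    def contrasting_minibatches(sentences, labels, corrupt, batch_size=32):
        # `corrupt(sentence, label)` is assumed to return the same sentence with
        # an artificial error introduced, plus the corresponding label.
        half = batch_size // 2
        for start in range(0, len(sentences), half):
            xs, ys = [], []
            for sent, label in zip(sentences[start:start + half],
                                   labels[start:start + half]):
                xs.append(sent)
                ys.append(label)
                bad_sent, bad_label = corrupt(sent, label)
                xs.append(bad_sent)           # contrasting case in the same batch
                ys.append(bad_label)
            yield np.asarray(xs), np.asarray(ys)
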
129 | ---------------------------------------------------------------------------
130 | Multi-task learning; change train.py to allow multiple --target
131 | arguments ... or change the architecture so the targets used are
132 | determined by model.json and model.py? (One possible shape is sketched below.)
133 | ---------------------------------------------------------------------------
134 |
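A sketch of that multi-target architecture (Keras functional API; the head
names and sizes are hypothetical):

    from tensorflow import keras
    from tensorflow.keras import layers

    n_vocab, maxlen = 1000, 50               # illustrative sizes
    inputs = keras.Input(shape=(maxlen,), dtype='int32')
    h = layers.Embedding(n_vocab, 50)(inputs)
    h = layers.Conv1D(500, 4, activation='relu')(h)
    h = layers.GlobalMaxPooling1D()(h)
    h = layers.Dense(1000, activation='relu')(h)

    # One softmax head per target; which heads exist could be driven by model.json.
    preposition = layers.Dense(6, activation='softmax', name='preposition')(h)
    has_error = layers.Dense(2, activation='softmax', name='has_error')(h)

    model = keras.Model(inputs, [preposition, has_error])
    model.compile(optimizer='sgd',
                  loss={'preposition': 'sparse_categorical_crossentropy',
                        'has_error': 'sparse_categorical_crossentropy'})
    # model.fit(x, {'preposition': y_prep, 'has_error': y_err}, ...)
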
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | keras==0.3.1
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(name='Modeling package', packages=['modeling'])
4 |
--------------------------------------------------------------------------------
/tests/testdata.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | #import six
5 | import sys
6 | import os
7 | import numpy as np
8 |
9 | import unittest
10 | import modeling.data
11 |
12 | class TestData(unittest.TestCase):
13 | def test_create_window_position_at_beginning(self):
14 | sentence = np.arange(1, 12)
15 | position = 0
16 | expected_window = [0, 0, 0, 1, 2, 3, 4]
17 | window = modeling.data.create_window(sentence, position,
18 | size=7)
19 |
20 | self.assertEqual(7, len(window))
21 | self.assertTrue(np.all(window == expected_window))
22 |
23 | def test_create_window_position_at_end_nonce(self):
24 | sentence = np.arange(1, 12)
25 | position = len(sentence) - 1
26 | nonce = 99
27 | expected_window = [8, 9, 10, nonce, 0, 0, 0]
28 | window = modeling.data.create_window(sentence, position,
29 | size=7, nonce=nonce)
30 |
31 | self.assertEqual(7, len(window))
32 | self.assertTrue(np.all(window == expected_window))
33 |
34 | def test_create_window_position_before_sentence(self):
35 | sentence = np.arange(1, 12)
36 | position = -1
37 | self.assertRaises(
38 | ValueError,
39 | modeling.data.create_window,
40 | sentence, position)
41 |
42 | def test_create_window_position_after_sentence(self):
43 | sentence = np.arange(1, 12)
44 | position = 12
45 | self.assertRaises(
46 | ValueError,
47 | modeling.data.create_window,
48 | sentence, position)
49 |
50 |
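For reference, a sketch of the behavior these tests assume for
modeling.data.create_window -- an assumption reconstructed from the tests above,
not the repository's actual implementation:

    import numpy as np

    def create_window(sentence, position, size=7, nonce=None):
        # Window of `size` tokens centered on `position`, zero-padded past the
        # sentence boundaries; `nonce`, if given, replaces the token at `position`.
        if position < 0 or position >= len(sentence):
            raise ValueError('position must index into sentence')
        half = size // 2
        window = np.zeros(size, dtype=int)
        for offset in range(-half, half + 1):
            i = position + offset
            if 0 <= i < len(sentence):
                window[offset + half] = sentence[i]
        if nonce is not None:
            window[half] = nonce
        return window
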
--------------------------------------------------------------------------------
/tests/testdifference.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import unittest
5 | import numpy as np
6 | from theano import function
7 | import theano.tensor as T
8 |
9 | from keras.layers.core import Layer
10 |
11 | class TemporalDifference(Layer):
12 | """
13 | Given a 3-tensor X with shape (nb_samples, maxlen, output_dim), outputs
14 | the temporal differences X[:, 1:, :] - X[:, :-1, :].
15 | """
16 | def _get_output(self, X):
17 | return X[:, 1:, :] - X[:, 0:X.shape[1]-1, :]
18 |
19 | def get_output(self, train):
20 | return self._get_output(self.get_input(train))
21 |
22 | def get_config(self):
23 | return {"name": self.__class__.__name__}
24 |
25 | class TestTemporalDifference(unittest.TestCase):
26 | def testForward(self):
27 | nb_examples = 2
28 | maxlen = 7
29 | output_dim = nb_word_dim = 5
30 | x = np.random.normal(size=(nb_examples, maxlen, output_dim)).astype(np.float32)
31 | expected = x[:, 1:, :] - x[:, 0:x.shape[1]-1, :]
32 | X = T.tensor3('X')
33 | retval = TemporalDifference()._get_output(X)
34 | f = function([X], retval)
35 | actual = f(x)
36 | self.assertTrue(np.allclose(actual, expected))
37 |
--------------------------------------------------------------------------------
/tests/testlasagne.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 |
4 | import six
5 | import sys
6 | import os
7 | import numpy as np
8 |
9 | import unittest
10 | import modeling.lasagne_model
11 | import modeling.utils
12 |
13 | import theano.tensor as T
14 | import lasagne
15 |
16 | # From Lasagne/examples/mnist.py
17 | def load_mnist():
18 | # We first define a download function, supporting both Python 2 and 3.
19 | if sys.version_info[0] == 2:
20 | from urllib import urlretrieve
21 | else:
22 | from urllib.request import urlretrieve
23 |
24 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
25 | print("Downloading %s" % filename)
26 | urlretrieve(source + filename, filename)
27 |
28 | # We then define functions for loading MNIST images and labels.
29 | # For convenience, they also download the requested files if needed.
30 | import gzip
31 |
32 | def load_mnist_images(filename):
33 | if not os.path.exists(filename):
34 | download(filename)
35 | # Read the inputs in Yann LeCun's binary format.
36 | with gzip.open(filename, 'rb') as f:
37 | data = np.frombuffer(f.read(), np.uint8, offset=16)
38 | # The inputs are vectors now, we reshape them to monochrome 2D images,
39 | # following the shape convention: (examples, channels, rows, columns)
40 | data = data.reshape(-1, 1, 28, 28)
41 | # The inputs come as bytes, we convert them to float32 in range [0,1].
42 | # (Actually to range [0, 255/256], for compatibility to the version
43 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
44 | return data / np.float32(256)
45 |
46 | def load_mnist_labels(filename):
47 | if not os.path.exists(filename):
48 | download(filename)
49 | # Read the labels in Yann LeCun's binary format.
50 | with gzip.open(filename, 'rb') as f:
51 | data = np.frombuffer(f.read(), np.uint8, offset=8)
52 | # The labels are vectors of integers now, that's exactly what we want.
53 | return data
54 |
55 | # We can now download and read the training and test set images and labels.
56 | X_train = load_mnist_images('train-images-idx3-ubyte.gz')
57 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
58 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
59 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
60 |
61 | # We reserve the last 10000 training examples for validation.
62 | X_train, X_val = X_train[:-10000], X_train[-10000:]
63 | y_train, y_val = y_train[:-10000], y_train[-10000:]
64 |
65 | # We just return all the arrays in order, as expected in main().
66 | # (It doesn't matter how we do this as long as we can read them again.)
67 | return X_train, y_train, X_val, y_val, X_test, y_test
68 |
69 | class TestModel(modeling.lasagne_model.Classifier):
70 | def build_input_var(self):
71 | return T.tensor4('inputs')
72 |
73 | def build_target_var(self):
74 | return T.ivector('targets')
75 |
76 | def build_updates(self):
77 | return lasagne.updates.nesterov_momentum(
78 | self.train_loss, self.params, learning_rate=0.01, momentum=0.9)
79 |
80 | def build_model(self, input_var):
81 | l_in = lasagne.layers.InputLayer(
82 | shape=(None, 1, 28, 28), input_var=input_var)
83 |
84 | l_in_drop = lasagne.layers.DropoutLayer(l_in, p=0.2)
85 |
86 | # Add a fully-connected layer of 800 units, using the linear rectifier, and
87 | # initializing weights with Glorot's scheme (which is the default anyway).
88 | l_hid1 = lasagne.layers.DenseLayer(
89 | l_in_drop, num_units=800,
90 | nonlinearity=lasagne.nonlinearities.rectify,
91 | W=lasagne.init.GlorotUniform())
92 |
93 | # Finally, we'll add the fully-connected output layer, of 10 softmax units:
94 | l_out = lasagne.layers.DenseLayer(
95 | l_hid1, num_units=10,
96 | nonlinearity=lasagne.nonlinearities.softmax)
97 |
98 | # Each layer is linked to its incoming layer(s), so we only need to pass
99 | # the output layer to give access to a network in Lasagne:
100 | return l_out
101 |
102 | class TestLasagneClassifier(unittest.TestCase):
103 | def test_mnist(self):
104 | args = {}
105 | config = modeling.utils.ModelConfig(**args)
106 | model = TestModel(config)
107 | X_train, y_train, X_val, y_val, X_test, y_test = load_mnist()
108 | n_epochs = 5
109 | batch_size = 256
110 | for epoch in six.moves.range(n_epochs):
111 | for j in six.moves.range(0, len(X_train), batch_size):
112 | model.fit(X_train[j:j+batch_size], y_train[j:j+batch_size])
113 | val_loss, val_acc = model.evaluate(X_val, y_val)
114 | self.assertTrue(val_acc > 0.9)
115 |
116 | def test_save_load(self):
117 | weights_file = '/tmp/model.npz'
118 |
119 | args = {}
120 | config = modeling.utils.ModelConfig(**args)
121 | rng1 = np.random.RandomState(17)
122 | lasagne.random.set_rng(rng1)
123 | model1 = TestModel(config)
124 | model1.save_weights(weights_file)
125 |
126 | rng2 = np.random.RandomState(23)
127 | lasagne.random.set_rng(rng2)
128 | model2 = TestModel(config)
129 | model2.load_weights(weights_file)
130 |
131 | weights1 = lasagne.layers.get_all_param_values(model1.model)
132 | weights2 = lasagne.layers.get_all_param_values(model2.model)
133 |
134 | for i in six.moves.range(len(weights1)):
135 | w1 = weights1[i]
136 | w2 = weights2[i]
137 | self.assertTrue(np.allclose(w1, w2))
138 |
--------------------------------------------------------------------------------
/tests/testlayers.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | import theano
4 | import theano.tensor as T
5 | #import theano.tensor.nnet
6 |
7 | import keras.models
8 | import keras.layers.core
9 |
10 | from modeling.layers import HierarchicalSoftmax
11 | import modeling.utils
12 | import modeling.builders
13 |
14 | class TestHierarchicalSoftmax(unittest.TestCase):
15 | def setUp(self):
16 | self.batch_size = 1
17 | self.input_dim = 4
18 | self.n_hsm_classes = 5
19 | self.n_outputs_per_class = 3
20 | self.output_size = self.n_hsm_classes * self.n_outputs_per_class
21 |
22 | def test_hierarchical_softmax_integrated(self):
23 | net = keras.models.Sequential()
24 | net.add(keras.layers.core.Dense(100, input_dim=self.input_dim, activation='relu'))
25 | net.add(HierarchicalSoftmax(
26 | self.output_size, self.n_hsm_classes,
27 | #self.n_hsm_classes, self.n_outputs_per_class,
28 | batch_size=self.batch_size))
29 | net.compile(loss='categorical_crossentropy', optimizer='Adam')
30 | x = np.random.normal(size=(self.batch_size, self.input_dim))
31 | target = net.predict_proba(x, verbose=0)
32 | n_classes = self.n_hsm_classes * self.n_outputs_per_class
33 | self.assertEqual((self.batch_size, n_classes), target.shape)
34 |
35 | def test_hierarchical_softmax_isolated(self):
36 | layer = HierarchicalSoftmax(self.output_size, self.n_hsm_classes,
37 | #self.n_outputs_per_class,
38 | batch_size=self.batch_size,
39 | input_dim=self.input_dim)
40 | layer.build()
41 |
42 | xt = T.matrix('x')
43 | f = theano.function([xt], layer._get_output(xt))
44 | x = np.random.normal(size=(self.batch_size, self.input_dim)).astype(np.float32)
45 |
46 | output = f(x)
47 | self.assertTrue(output.shape == (self.batch_size, self.output_size))
48 | self.assertTrue(np.allclose(1.0, output.sum()))
49 |
50 | #@unittest.skip('')
51 | def test_theano_h_softmax(self):
52 | """
53 | Tests the output dimensions of the h_softmax when a target is provided or
54 | not.
55 |
56 | This test was adapted from Theano's h_softmax tests.
57 | """
58 |
59 | #############
60 | # Initialize shared variables
61 | #############
62 |
63 | floatX = theano.config.floatX
64 | shared = theano.shared
65 |
66 | # Class softmax.
67 | W1 = np.asarray(np.random.normal(
68 | size=(self.input_dim, self.n_hsm_classes)), dtype=floatX)
69 | W1 = shared(W1)
70 | b1 = np.asarray(np.zeros((self.n_hsm_classes,)), dtype=floatX)
71 | b1 = shared(b1)
72 |
73 | # Class member softmax.
74 | W2 = np.asarray(np.random.normal(
75 | size=(self.n_hsm_classes, self.input_dim, self.n_outputs_per_class)),
76 | dtype=floatX)
77 | W2 = shared(W2)
78 | b2 = np.asarray(
79 | np.zeros((self.n_hsm_classes, self.n_outputs_per_class)), dtype=floatX)
80 | b2 = shared(b2)
81 |
82 | #############
83 | # Build graph
84 | #############
85 | x = T.matrix('x')
86 | y = T.ivector('y')
87 |
88 | # This only computes the output corresponding to the target
89 | y_hat_tg = theano.tensor.nnet.h_softmax(x,
90 | self.batch_size, self.output_size, self.n_hsm_classes, self.n_outputs_per_class,
91 | W1, b1, W2, b2, y)
92 |
93 | # This computes all the outputs
94 | y_hat_all = theano.tensor.nnet.h_softmax(x,
95 | self.batch_size, self.output_size, self.n_hsm_classes, self.n_outputs_per_class,
96 | W1, b1, W2, b2)
97 |
98 | #############
99 | # Compile functions
100 | #############
101 | fun_output_tg = theano.function([x, y], y_hat_tg)
102 | fun_output = theano.function([x], y_hat_all)
103 |
104 | #############
105 | # Test
106 | #############
107 | x_mat = np.random.normal(size=(self.batch_size, self.input_dim)).astype(floatX)
108 | y_mat = np.random.randint(0, self.output_size, self.batch_size).astype('int32')
109 |
110 | self.assertTrue(fun_output_tg(x_mat, y_mat).shape == (self.batch_size,))
111 | self.assertTrue(fun_output(x_mat).shape == (self.batch_size, self.output_size))
112 |
113 | if __name__ == '__main__':
114 | unittest.main()
115 |
--------------------------------------------------------------------------------
/tests/testnonconvnet.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import random
3 | import numpy as np
4 | import theano
5 | import theano.tensor as T
6 |
7 | from keras import models
8 | from keras.layers import embeddings
9 | from keras.layers import core
10 |
11 | from modeling.nonconvnet import ZeroFillDiagonals, \
12 | SplitOutputByFilter, \
13 | SlidingWindowL2MaxPooling
14 |
15 | class TestNonConvNet(unittest.TestCase):
16 | def setUp(self):
17 | self.n_vocab = 100
18 | self.n_word_dims = 5
19 | self.filter_width = 4
20 | self.n_filters = 3
21 | self.max_seq_len = 9
22 | self.batch_size = 3
23 |
24 | def setSeeds(self):
25 | np.random.seed(1)
26 |
27 | def testNonConvNet(self):
28 | self.setSeeds()
29 |
30 | x = np.random.randint(self.n_vocab, size=(self.batch_size,
31 | self.max_seq_len))
32 |
33 | model = models.Sequential()
34 |
35 | # input: (batch_size, max_seq_len)
36 | # output: (batch_size, max_seq_len, n_word_dims)
37 | model.add(embeddings.Embedding(self.n_vocab, self.n_word_dims))
38 | model.compile(loss='mse', optimizer='sgd')
39 | expected_shape_l1 = (self.batch_size, self.max_seq_len,
40 | self.n_word_dims)
41 | output_l1 = model.predict(x)
42 | self.assertEqual(expected_shape_l1, output_l1.shape)
43 |
44 | # input: (batch_size, max_seq_len, n_word_dims)
45 | # output: (batch_size, max_seq_len, n_filters * filter_width)
46 | model.add(core.TimeDistributedDense(
47 | self.n_word_dims, self.n_filters * self.filter_width))
48 | model.compile(loss='mse', optimizer='sgd')
49 | expected_shape_l2 = (self.batch_size, self.max_seq_len,
50 | self.n_filters * self.filter_width)
51 | output_l2 = model.predict(x)
52 | self.assertEqual(expected_shape_l2, output_l2.shape)
53 |
54 | # input: (batch_size, max_seq_len, n_filters * filter_width)
55 | # output: (batch_size, n_filters, max_seq_len, filter_width)
56 | model.add(SplitOutputByFilter(self.n_filters, self.filter_width))
57 | model.compile(loss='mse', optimizer='sgd')
58 | expected_shape_l3 = (self.batch_size, self.n_filters,
59 | self.max_seq_len, self.filter_width)
60 | output_l3 = model.predict(x)
61 | self.assertEqual(expected_shape_l3, output_l3.shape)
62 |
63 | # input: (batch_size, n_filters, max_seq_len, filter_width)
64 | # output: (batch_size, n_filters, filter_width, filter_width)
65 | model.add(SlidingWindowL2MaxPooling(
66 | self.batch_size, self.n_filters,
67 | self.filter_width, self.max_seq_len))
68 | model.compile(loss='mse', optimizer='sgd')
69 | expected_shape_l4 = (self.batch_size, self.n_filters,
70 | self.filter_width, self.filter_width)
71 | output_l4 = model.predict(x)
72 | self.assertEqual(expected_shape_l4, output_l4.shape)
73 |
74 | # input: (batch_size, n_filters, filter_width, filter_width)
75 | # output: (batch_size, n_filters, filter_width, filter_width)
76 | model.add(ZeroFillDiagonals(
77 | self.batch_size, self.n_filters, self.filter_width))
78 | model.compile(loss='mse', optimizer='sgd')
79 | expected_shape_l5 = (self.batch_size, self.n_filters,
80 | self.filter_width, self.filter_width)
81 | output_l5 = model.predict(x)
82 | self.assertEqual(expected_shape_l5, output_l5.shape)
83 |
84 | def testSplitOutputByFilter(self):
85 | self.setSeeds()
86 |
87 | input_shape = (self.batch_size, self.max_seq_len,
88 | self.n_filters * self.filter_width)
89 | output_shape = (self.batch_size, self.n_filters,
90 | self.max_seq_len, self.filter_width)
91 |
92 | x = np.arange(np.prod(input_shape))
93 | x = x.reshape(input_shape).astype(np.int32)
94 | y = np.zeros_like(x)
95 | y = np.reshape(y, output_shape)
96 |
97 | for i in range(self.n_filters):
98 | s = x[:, :, i*self.filter_width:(i+1)*self.filter_width]
99 | y[:, i, :, :] = s
100 |
101 | xt = T.itensor3('xt')
102 | layer = SplitOutputByFilter(self.n_filters, self.filter_width)
103 | yt = layer._get_output(xt)
104 |
105 | f = theano.function(inputs=[xt], outputs=yt)
106 | y_theano = f(x)
107 |
108 | self.assertEqual(y.shape, y_theano.shape)
109 | self.assertTrue(np.all(y == y_theano))
110 |
111 | def testSlidingWindowL2MaxPooling(self):
112 | self.assertTrue(
113 | self.max_seq_len - self.filter_width > self.n_filters)
114 |
115 | self.setSeeds()
116 |
117 | input_shape = (self.batch_size, self.n_filters,
118 | self.max_seq_len, self.filter_width)
119 | output_shape = (self.batch_size, self.n_filters,
120 | self.filter_width, self.filter_width)
121 |
122 | x = np.zeros(shape=input_shape)
123 | expected = np.zeros(shape=output_shape)
124 |
125 | max_input_shape = (self.batch_size, self.filter_width, self.filter_width)
126 |
127 | # For the i-th filter, make i the offset at which the maximum
128 | # L2 norm occurs.
129 | for i in np.arange(self.n_filters):
130 | start = i
131 | end = i+self.filter_width
132 | values = i + np.arange(np.prod(max_input_shape))
133 | values = values.reshape(max_input_shape)
134 | x[:, i, start:end, :] = values
135 | expected[:, i, :, :] = values
136 |
137 | it = T.iscalar()
138 | x3d = T.dtensor3('x3d')
139 | x4d = T.dtensor4('x4d')
140 |
141 | layer = SlidingWindowL2MaxPooling(
142 | self.batch_size, self.n_filters, self.filter_width,
143 | self.max_seq_len)
144 |
145 | '''
146 | Use the first sample and first filter to test `filter_dimension`.
147 | '''
148 | yt_filter_dim = layer.filter_dimension(it, x3d)
149 | f_filter_dim = theano.function(inputs=[it, x3d], outputs=yt_filter_dim)
150 | y_filter_dim_out = f_filter_dim(0, x[0])
151 | self.assertEqual((self.filter_width, self.filter_width),
152 | y_filter_dim_out.shape)
153 | self.assertTrue(np.all(expected[0, 0, :, :] == y_filter_dim_out))
154 |
155 | '''
156 | Use the first sample to test `sample_dimension`.
157 | '''
158 | yt_sample_dim = layer.sample_dimension(it, x4d)
159 | f_sample_dim = theano.function(inputs=[it, x4d], outputs=yt_sample_dim)
160 | y_sample_dim_out = f_sample_dim(0, x)
161 | self.assertEqual((self.n_filters, self.filter_width, self.filter_width),
162 | y_sample_dim_out.shape)
163 | self.assertTrue(np.all(expected[0, :, :, :] == y_sample_dim_out))
164 |
165 | '''
166 | Use all of `x` to test `_get_output`.
167 | '''
168 | yt_output = layer._get_output(x4d)
169 | f_output = theano.function(inputs=[x4d], outputs=yt_output)
170 | yt_out = f_output(x)
171 | self.assertEqual(
172 | (self.batch_size, self.n_filters, self.filter_width,
173 | self.filter_width), yt_out.shape)
174 | self.assertTrue(np.all(expected == yt_out))
175 |
176 | def testZeroFillDiagonals(self):
177 | input_shape = (self.batch_size, self.n_filters,
178 | self.filter_width, self.filter_width)
179 | mask = np.ones(input_shape)
180 | diag_indices = np.arange(self.filter_width)
181 | for i in np.arange(self.batch_size):
182 | for j in np.arange(self.n_filters):
183 | mask[i, j, diag_indices, diag_indices] = 0
184 |
185 | x = np.arange(np.prod(input_shape)).reshape(input_shape)
186 | expected = x * mask
187 |
188 | x4d = T.dtensor4('x4d')
189 | layer = ZeroFillDiagonals(
190 | self.batch_size, self.n_filters, self.filter_width)
191 | yt_output = layer._get_output(x4d)
192 | f_output = theano.function(inputs=[x4d], outputs=yt_output)
193 |
194 | yt_out = f_output(x)
195 | self.assertEqual(expected.shape, yt_out.shape)
196 | self.assertTrue(np.all(expected == yt_out))
197 |
--------------------------------------------------------------------------------
/train_chainer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 |
6 | import sys
7 | import six
8 | import argparse
9 | import progressbar
10 | import copy
11 | import cPickle
12 |
13 | import numpy as np
14 | import pandas as pd
15 |
16 | import chainer
17 | from chainer import cuda
18 | from modeling.chainer_model import Classifier
19 | from modeling.utils import (
20 | load_model_data, load_model_json, build_model_id, build_model_path,
21 | setup_model_dir, setup_logging, ModelConfig)
22 | import modeling.parser
23 |
24 | def main(args):
25 | if args.gpu >= 0:
26 | cuda.check_cuda_available()
27 | xp = cuda.cupy if args.gpu >= 0 else np
28 |
29 | model_id = build_model_id(args)
30 | model_path = build_model_path(args, model_id)
31 | setup_model_dir(args, model_path)
32 | sys.stdout, sys.stderr = setup_logging(args)
33 |
34 | x_train, y_train = load_model_data(args.train_file,
35 | args.data_name, args.target_name,
36 | n=args.n_train)
37 | x_validation, y_validation = load_model_data(
38 | args.validation_file,
39 | args.data_name, args.target_name,
40 | n=args.n_validation)
41 |
42 | rng = np.random.RandomState(args.seed)
43 |
44 | N = len(x_train)
45 | N_validation = len(x_validation)
46 |
47 | n_classes = max(np.unique(y_train)) + 1
48 | json_cfg = load_model_json(args, x_train, n_classes)
49 |
50 | print('args.model_dir', args.model_dir)
51 | sys.path.append(args.model_dir)
52 | from model import Model
53 | model_cfg = ModelConfig(**json_cfg)
54 | model = Model(model_cfg)
55 | setattr(model, 'stop_training', False)
56 |
57 | if args.gpu >= 0:
58 | cuda.get_device(args.gpu).use()
59 | model.to_gpu()
60 |
61 | best_accuracy = 0.
62 | best_epoch = 0
63 |
64 | def keep_training(epoch, best_epoch):
65 | if model_cfg.n_epochs is not None and epoch > model_cfg.n_epochs:
66 | return False
67 | if epoch > 1 and epoch - best_epoch > model_cfg.patience:
68 | return False
69 | return True
70 |
71 | epoch = 1
72 |
73 | while True:
74 | if not keep_training(epoch, best_epoch):
75 | break
76 |
77 | if args.shuffle:
78 | perm = np.random.permutation(N)
79 | else:
80 | perm = np.arange(N)
81 |
82 | sum_accuracy = 0
83 | sum_loss = 0
84 |
85 | pbar = progressbar.ProgressBar(term_width=40,
86 | widgets=[' ', progressbar.Percentage(),
87 | ' ', progressbar.ETA()],
88 | maxval=N).start()
89 |
90 | for j, i in enumerate(six.moves.range(0, N, model_cfg.batch_size)):
91 | pbar.update(j+1)
92 | x_batch = xp.asarray(x_train[perm[i:i + model_cfg.batch_size]].flatten())
93 | y_batch = xp.asarray(y_train[perm[i:i + model_cfg.batch_size]])
94 | pred, loss, acc = model.fit(x_batch, y_batch)
95 | sum_loss += float(loss.data) * len(y_batch)
96 | sum_accuracy += float(acc.data) * len(y_batch)
97 |
98 | pbar.finish()
99 | print('train epoch={}, mean loss={}, accuracy={}'.format(
100 | epoch, sum_loss / N, sum_accuracy / N))
101 |
102 | # Validation set evaluation
103 | sum_accuracy = 0
104 | sum_loss = 0
105 |
106 | pbar = progressbar.ProgressBar(term_width=40,
107 | widgets=[' ', progressbar.Percentage(),
108 | ' ', progressbar.ETA()],
109 | maxval=N_validation).start()
110 |
111 | for i in six.moves.range(0, N_validation, model_cfg.batch_size):
112 | pbar.update(i+1)
113 | x_batch = xp.asarray(x_validation[i:i + model_cfg.batch_size].flatten())
114 | y_batch = xp.asarray(y_validation[i:i + model_cfg.batch_size])
115 | pred, loss, acc = model.predict(x_batch, target=y_batch)
116 | sum_loss += float(loss.data) * len(y_batch)
117 | sum_accuracy += float(acc.data) * len(y_batch)
118 |
119 | pbar.finish()
120 | validation_accuracy = sum_accuracy / N_validation
121 | validation_loss = sum_loss / N_validation
122 |
123 | if validation_accuracy > best_accuracy:
124 | best_accuracy = validation_accuracy
125 | best_epoch = epoch
126 | if model_path is not None:
127 | if args.gpu >= 0:
128 | model.to_cpu()
129 | store = {
130 | 'args': args,
131 | 'model': model,
132 | }
133 | cPickle.dump(store, open(model_path + '.store', 'wb'))
134 | if args.gpu >= 0:
135 | model.to_gpu()
136 |
137 | print('validation epoch={}, mean loss={}, accuracy={} best=[accuracy={} epoch={}]'.format(
138 | epoch, validation_loss, validation_accuracy,
139 | best_accuracy,
140 | best_epoch))
141 |
142 | epoch += 1
143 |
144 | if __name__ == '__main__':
145 | parser = modeling.parser.build_chainer()
146 | sys.exit(main(parser.parse_args()))
147 |
--------------------------------------------------------------------------------
/train_keras_simple.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 |
6 | import os, sys, shutil
7 | import logging
8 | import json
9 | import uuid
10 | import json
11 | import itertools
12 |
13 | import numpy as np
14 |
15 | import theano
16 | import h5py
17 | import six
18 | from sklearn.metrics import accuracy_score
19 |
20 | from keras.utils import np_utils
21 | from keras.optimizers import SGD
22 | import keras.callbacks
23 | from keras.callbacks import ModelCheckpoint, EarlyStopping
24 | import keras.models
25 |
26 | sys.path.append('.')
27 |
28 | from modeling.callbacks import (ClassificationReport,
29 | ConfusionMatrix, PredictionCallback,
30 | DelegatingMetricCallback,
31 | SingleStepLearningRateSchedule)
32 | from modeling.utils import (count_parameters, callable_print,
33 | setup_logging, setup_model_dir, save_model_info,
34 | load_model_data, load_model_json, load_target_data,
35 | build_model_id, build_model_path,
36 | ModelConfig)
37 | import modeling.preprocess
38 | import modeling.parser
39 |
40 | def main(args):
41 | model_id = build_model_id(args)
42 | model_path = build_model_path(args, model_id)
43 | setup_model_dir(args, model_path)
44 |
45 | rng = np.random.RandomState(args.seed)
46 |
47 | json_cfg = load_model_json(args, x_train=None, n_classes=None)
48 | model_cfg = ModelConfig(**json_cfg)
49 | if args.verbose:
50 | print("model_cfg " + str(model_cfg))
51 |
52 | sys.path.append(args.model_dir)
53 | import model
54 | from model import build_model, fit_model, load_train, load_validation
55 |
56 | train_data = load_train(args, model_cfg)
57 | validation_data = load_validation(args, model_cfg)
58 |
59 | if args.verbose:
60 | print("loading model")
61 | model = build_model(model_cfg, train_data, validation_data)
62 | fit_model(model, train_data, validation_data, args)
63 |
64 | if __name__ == '__main__':
65 | parser = modeling.parser.build_keras()
66 | sys.exit(main(parser.parse_args()))
67 |
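For reference, a sketch of the model.py contract this script expects
(build_model, fit_model, load_train, load_validation). The real model modules
live under models/keras/; the HDF5 layout, layer sizes, and training settings
below are assumptions for illustration, and the sketch uses the modern Keras API
rather than the pinned keras==0.3.1:

    import h5py
    import numpy as np
    from tensorflow import keras
    from tensorflow.keras import layers

    def _load(path, args):
        # Assumed layout: one HDF5 dataset for inputs (args.data_name) and one
        # for targets (args.target_name), as in the other training scripts.
        with h5py.File(path, 'r') as f:
            return np.array(f[args.data_name]), np.array(f[args.target_name])

    def load_train(args, model_cfg):
        return _load(args.train_file, args)

    def load_validation(args, model_cfg):
        return _load(args.validation_file, args)

    def build_model(model_cfg, train_data, validation_data):
        # model_cfg wraps model.json; a real model would take its sizes from it.
        x, y = train_data
        n_vocab = int(x.max()) + 1
        n_classes = int(y.max()) + 1
        model = keras.Sequential([
            layers.Embedding(n_vocab, 50),
            layers.GlobalAveragePooling1D(),
            layers.Dense(100, activation='relu'),
            layers.Dense(n_classes, activation='softmax'),
        ])
        model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd',
                      metrics=['accuracy'])
        return model

    def fit_model(model, train_data, validation_data, args):
        model.fit(train_data[0], train_data[1],
                  validation_data=validation_data,
                  batch_size=128, epochs=10)
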
--------------------------------------------------------------------------------
/train_lasagne.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import absolute_import
4 | from __future__ import print_function
5 |
6 | import sys
7 | import six
8 | import argparse
9 | import progressbar
10 | import copy
11 | import cPickle
12 | import itertools
13 |
14 | import numpy as np
15 | import pandas as pd
16 |
17 | from modeling.lasagne_model import Classifier
18 | from modeling.utils import (
19 | load_model_data, load_model_json, build_model_id, build_model_path,
20 | setup_model_dir, setup_logging, ModelConfig)
21 | import modeling.parser
22 |
23 | def keep_training(epoch, best_epoch, model_cfg):
24 | if model_cfg.n_epochs is not None and epoch > model_cfg.n_epochs:
25 | return False
26 | if epoch > 1 and epoch - best_epoch > model_cfg.patience:
27 | return False
28 | return True
29 |
30 | def train_one_epoch(model, x_train, y_train, args, model_cfg, progress=False):
31 | n = len(x_train)
32 |
33 | if args.shuffle:
34 | perm = np.random.permutation(n)
35 | else:
36 | perm = np.arange(n)
37 |
38 | if progress:
39 | pbar = progressbar.ProgressBar(term_width=40,
40 | widgets=[' ', progressbar.Percentage(),
41 | ' ', progressbar.ETA()],
42 | maxval=n).start()
43 | else:
44 | pbar = None
45 |
46 | train_loss = 0
47 |
48 | for j, i in enumerate(six.moves.range(0, n, model_cfg.batch_size)):
49 | if progress:
50 | pbar.update(j+1)
51 | x = x_train[perm[i:i + model_cfg.batch_size]]
52 | y = y_train[perm[i:i + model_cfg.batch_size]]
53 | if len(x) != model_cfg.batch_size:
54 | # TODO: the final partial batch is skipped here; other frameworks pad it or allow a smaller last batch.
55 | continue
56 | train_loss += model.fit(x, y)
57 |
58 | if progress:
59 | pbar.finish()
60 |
61 | return train_loss/float(n)
62 |
63 | def validate(model, x_valid, y_valid, args, model_cfg, progress=False):
64 | n = len(x_valid)
65 |
66 | if progress:
67 | pbar = progressbar.ProgressBar(term_width=40,
68 | widgets=[' ', progressbar.Percentage(),
69 | ' ', progressbar.ETA()],
70 | maxval=n).start()
71 | else:
72 | pbar = None
73 |
74 | val_accuracy = 0.
75 | val_loss = 0.
76 |
77 | for i in six.moves.range(0, n, model_cfg.batch_size):
78 | if progress:
79 | pbar.update(i+1)
80 | x = x_valid[i:i + model_cfg.batch_size]
81 | y = y_valid[i:i + model_cfg.batch_size]
82 | loss, acc = model.evaluate(x, y)
83 | val_loss += loss
84 | val_accuracy += acc
85 |
86 | if progress:
87 | pbar.finish()
88 |
89 | return val_loss/float(n), val_accuracy/float(n)
90 |
91 | def main(args):
92 | model_id = build_model_id(args)
93 | model_path = build_model_path(args, model_id)
94 | setup_model_dir(args, model_path)
95 | sys.stdout, sys.stderr = setup_logging(args)
96 |
97 | rng = np.random.RandomState(args.seed)
98 |
99 | x_train, y_train = load_model_data(args.train_file,
100 | args.data_name, args.target_name,
101 | n=args.n_train)
102 |
103 | x_valid, y_valid = load_model_data(
104 | args.validation_file,
105 | args.data_name, args.target_name,
106 | n=args.n_validation)
107 |
108 | train_files = args.extra_train_file + [args.train_file]
109 | train_files_iter = itertools.cycle(train_files)
110 |
111 | n_classes = max(np.unique(y_train)) + 1
112 | json_cfg = load_model_json(args, x_train, n_classes)
113 |
114 | sys.path.append(args.model_dir)
115 | from model import Model
116 | model_cfg = ModelConfig(**json_cfg)
117 | model = Model(model_cfg)
118 | setattr(model, 'stop_training', False)
119 |
120 | best_accuracy = 0.
121 | best_epoch = 0
122 |
123 | epoch = 1
124 | iteration = 0
125 |
126 | while True:
127 | if not keep_training(epoch, best_epoch, model_cfg):
128 | break
129 |
130 | train_loss = train_one_epoch(model, x_train, y_train,
131 | args, model_cfg, progress=args.progress)
132 |
133 | val_loss, val_accuracy = validate(model, x_valid, y_valid,
134 | args, model_cfg, progress=args.progress)
135 |
136 | if val_accuracy > best_accuracy:
137 | best_accuracy = val_accuracy
138 | best_epoch = epoch
139 | if model_path is not None:
140 | model.save_weights(model_path + '.npz')
141 | cPickle.dump(model, open(model_path + '.pkl', 'wb'))
142 |
143 | print('epoch={epoch:05d}, iteration={iteration:05d}, loss={loss:.04f}, val_loss={val_loss:.04f}, val_acc={val_acc:.04f} best=[accuracy={best_accuracy:.04f} epoch={best_epoch:05d}]'.format(
144 | epoch=epoch, iteration=iteration,
145 | loss=train_loss, val_loss=val_loss, val_acc=val_accuracy,
146 | best_accuracy=best_accuracy, best_epoch=best_epoch))
147 |
148 | iteration += 1
149 | if iteration % len(train_files) == 0:
150 | epoch += 1
151 |
152 | x_train, y_train = load_model_data(
153 | next(train_files_iter),
154 | args.data_name, args.target_name,
155 | n=args.n_train)
156 |
157 | if __name__ == '__main__':
158 | parser = modeling.parser.build_lasagne()
159 | sys.exit(main(parser.parse_args()))
160 |
--------------------------------------------------------------------------------