├── cnn_text_trainer
│   ├── __init__.py
│   ├── rw
│   │   ├── __init__.py
│   │   ├── wordvecs.py
│   │   └── datasets.py
│   ├── config
│   │   ├── __init__.py
│   │   └── config.py
│   └── core
│       ├── __init__.py
│       ├── multichannel
│       │   ├── __init__.py
│       │   └── model.py
│       ├── unichannel
│       │   ├── __init__.py
│       │   └── model.py
│       └── nn_classes.py
├── test
│   ├── __init__.py
│   ├── testConfig.json
│   └── test_cnn_text_trainer.py
├── sample
│   ├── configs
│   │   ├── sampleMCConfig.json
│   │   ├── sampleStaticConfig.json
│   │   ├── sampleNonStaticConfig.json
│   │   ├── mc
│   │   │   ├── config-mc5.json
│   │   │   ├── config-mc1.json
│   │   │   ├── config-mc2.json
│   │   │   ├── config-mc3.json
│   │   │   └── config-mc4.json
│   │   ├── static
│   │   │   ├── config-static1.json
│   │   │   ├── config-static2.json
│   │   │   ├── config-static3.json
│   │   │   ├── config-static4.json
│   │   │   └── config-static5.json
│   │   └── nonstatic
│   │       ├── config-nonstatic5.json
│   │       ├── config-nonstatic1.json
│   │       ├── config-nonstatic2.json
│   │       ├── config-nonstatic3.json
│   │       └── config-nonstatic4.json
│   └── datasets
│       └── sst_small_sample.csv
├── requirements.txt
├── downloadWordVecs.sh
├── .gitignore
├── train.py
├── make
│   └── gdown.pl
├── gpu_to_cpu.py
├── README.md
├── server.py
├── test.py
└── LICENSE

/cnn_text_trainer/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/cnn_text_trainer/rw/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/cnn_text_trainer/config/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/cnn_text_trainer/core/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/cnn_text_trainer/core/multichannel/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/cnn_text_trainer/core/unichannel/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'devashish.shankar'
--------------------------------------------------------------------------------
/test/testConfig.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":5,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/sampleMCConfig.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
"mlp_hidden_units":[], 8 | "dropout_rate":0.5, 9 | "shuffle_batch":true, 10 | "n_epochs":5, 11 | "batch_size":50, 12 | "lr_decay":0.95, 13 | "conv_non_linear":"relu", 14 | "mode":"multichannel" 15 | } -------------------------------------------------------------------------------- /sample/configs/sampleStaticConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "word2vec":"GoogleNews-vectors-negative300.bin", 3 | "dim":300, 4 | "max_l":56, 5 | "filter_h":5, 6 | "filter_hs":[3,4,5], 7 | "mlp_hidden_units":[], 8 | "dropout_rate":0.5, 9 | "shuffle_batch":true, 10 | "n_epochs":5, 11 | "batch_size":50, 12 | "lr_decay":0.95, 13 | "conv_non_linear":"relu", 14 | "mode":"static" 15 | } -------------------------------------------------------------------------------- /sample/configs/sampleNonStaticConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "word2vec":"GoogleNews-vectors-negative300.bin", 3 | "dim":300, 4 | "max_l":56, 5 | "filter_h":5, 6 | "filter_hs":[3,4,5], 7 | "mlp_hidden_units":[], 8 | "dropout_rate":0.5, 9 | "shuffle_batch":true, 10 | "n_epochs":5, 11 | "batch_size":50, 12 | "lr_decay":0.95, 13 | "conv_non_linear":"relu", 14 | "mode":"nonstatic" 15 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10.1 2 | Jinja2==2.7.3 3 | MarkupSafe==0.23 4 | Theano==0.7.0 5 | Werkzeug==0.10.4 6 | argparse==1.2.1 7 | distribute==0.6.24 8 | gunicorn==19.3.0 9 | itsdangerous==0.24 10 | nltk==3.0.3 11 | numpy==1.9.2 12 | pandas==0.16.2 13 | python-dateutil==2.4.2 14 | pytz==2015.4 15 | scikit-learn==0.16.1 16 | scipy==0.15.1 17 | six==1.9.0 18 | wsgiref==0.1.2 19 | -------------------------------------------------------------------------------- /sample/configs/mc/config-mc5.json: -------------------------------------------------------------------------------- 1 | { 2 | "word2vec":"GoogleNews-vectors-negative300.bin", 3 | "dim":300, 4 | "conv_features":100, 5 | "max_l":56, 6 | "filter_h":5, 7 | "filter_hs":[3,4,5], 8 | "mlp_hidden_units":[], 9 | "dropout_rate":0.5, 10 | "shuffle_batch":true, 11 | "n_epochs":50, 12 | "batch_size":50, 13 | "lr_decay":0.95, 14 | "conv_non_linear":"relu", 15 | "mode":"multichannel" 16 | } -------------------------------------------------------------------------------- /sample/configs/mc/config-mc1.json: -------------------------------------------------------------------------------- 1 | { 2 | "word2vec":"GoogleNews-vectors-negative300.bin", 3 | "dim":300, 4 | "conv_features":200, 5 | "max_l":56, 6 | "filter_h":5, 7 | "filter_hs":[3,4,5], 8 | "mlp_hidden_units":[100], 9 | "dropout_rate":0.5, 10 | "shuffle_batch":true, 11 | "n_epochs":50, 12 | "batch_size":50, 13 | "lr_decay":0.95, 14 | "conv_non_linear":"relu", 15 | "mode":"multichannel" 16 | } -------------------------------------------------------------------------------- /sample/configs/mc/config-mc2.json: -------------------------------------------------------------------------------- 1 | { 2 | "word2vec":"GoogleNews-vectors-negative300.bin", 3 | "dim":300, 4 | "conv_features":200, 5 | "max_l":56, 6 | "filter_h":5, 7 | "filter_hs":[3,4,5], 8 | "mlp_hidden_units":[50], 9 | "dropout_rate":0.5, 10 | "shuffle_batch":true, 11 | "n_epochs":50, 12 | "batch_size":50, 13 | "lr_decay":0.95, 14 | "conv_non_linear":"relu", 15 | "mode":"multichannel" 16 | } 
--------------------------------------------------------------------------------
/sample/configs/mc/config-mc3.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[100],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"multichannel"
}
--------------------------------------------------------------------------------
/sample/configs/mc/config-mc4.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[50],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"multichannel"
}
--------------------------------------------------------------------------------
/sample/configs/static/config-static1.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":200,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[100],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/static/config-static2.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":200,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[50],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/static/config-static3.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[100],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/static/config-static4.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[50],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/static/config-static5.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":100,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"static"
}
--------------------------------------------------------------------------------
/sample/configs/nonstatic/config-nonstatic5.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":100,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"nonstatic"
}
--------------------------------------------------------------------------------
/sample/configs/nonstatic/config-nonstatic1.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":200,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[100],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"nonstatic"
}
--------------------------------------------------------------------------------
/sample/configs/nonstatic/config-nonstatic2.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":200,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[50],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"nonstatic"
}
--------------------------------------------------------------------------------
/sample/configs/nonstatic/config-nonstatic3.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[100],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"nonstatic"
}
--------------------------------------------------------------------------------
/sample/configs/nonstatic/config-nonstatic4.json:
--------------------------------------------------------------------------------
{
    "word2vec":"GoogleNews-vectors-negative300.bin",
    "dim":300,
    "conv_features":300,
    "max_l":56,
    "filter_h":5,
    "filter_hs":[3,4,5],
    "mlp_hidden_units":[50],
    "dropout_rate":0.5,
    "shuffle_batch":true,
    "n_epochs":50,
    "batch_size":50,
    "lr_decay":0.95,
    "conv_non_linear":"relu",
    "mode":"nonstatic"
}
--------------------------------------------------------------------------------
/downloadWordVecs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
if [ ! -f GoogleNews-vectors-negative300.bin ]; then
    echo "Downloading Google word2vec vectors"
    perl ./make/gdown.pl "https://docs.google.com/uc?export=download&confirm=Kqnw&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM" GoogleNews-vectors-negative300.bin.gz
    echo "done downloading word2vec. Uncompressing it"
    gunzip GoogleNews-vectors-negative300.bin.gz
    echo "done uncompressing word2vec"
fi

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Intellij
.idea/
.idea/*
*.iml
*.iws

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import sys
from cnn_text_trainer.rw import datasets
from cnn_text_trainer.config import config
from cnn_text_trainer.core.multichannel.model import MultiChannelTrainer
from cnn_text_trainer.core.unichannel.model import TextCNNModelTrainer
from cnn_text_trainer.rw import wordvecs

__author__ = 'devashish.shankar'

if __name__=="__main__":
    if len(sys.argv)<5:
        print "Usage: train.py"
        print "\t<config file>"
        print "\t<train data file>"
        print "\t<model output file>"
        print "\t<preprocess (true/false)>"
        exit(0)

    # parse command line arguments
    config_file=sys.argv[1]
    train_data_file=sys.argv[2]
    model_output_file=sys.argv[3]
    preprocess=sys.argv[4].lower()

    training_config = config.get_training_config_from_json(config_file)
    sentences, vocab, labels = datasets.build_data(train_data_file,preprocess)
    print "Dataset loaded"
    word_vecs = wordvecs.load_wordvecs(training_config.word2vec,vocab)
    print "Loaded word vecs from file"

    if training_config.mode=="multichannel":
        nntrainer = MultiChannelTrainer(training_config,word_vecs,sentences,labels)
    else:
        nntrainer = TextCNNModelTrainer(training_config,word_vecs,sentences,labels)

    nntrainer.train(model_output_file)
--------------------------------------------------------------------------------
/make/gdown.pl:
--------------------------------------------------------------------------------
#!/usr/local/bin/perl
#
# Google Drive direct download of big files
# ./gdown.pl 'gdrive file url' ['desired file name']
#
# v1.0 by circulosmeos 04-2014.
# http://circulosmeos.wordpress.com/2014/04/12/google-drive-direct-download-of-big-files
# Distributed under GPL 3 (http://www.gnu.org/licenses/gpl-3.0.html)
#
use strict;

my $TEMP='/tmp';
my $COMMAND;
my $confirm;
my $check;
sub execute_command();

my $URL=shift;
die "\n./gdown.pl 'gdrive file url' [desired file name]\n\n" if $URL eq '';
my $FILENAME=shift;
$FILENAME='gdown' if $FILENAME eq '';

execute_command();

while (-s $FILENAME < 100000) { # only if the file isn't the download yet
    open fFILENAME, '<', $FILENAME;
    $check=0;
    foreach (<fFILENAME>) {
        if (/href="(\/uc\?export=download[^"]+)/) {
            $URL='https://docs.google.com'.$1;
            $URL=~s/&amp;/&/g;
            $confirm='';
            $check=1;
            last;
        }
        if (/confirm=([^;&]+)/) {
            $confirm=$1;
            $check=1;
            last;
        }
        if (/"downloadUrl":"([^"]+)/) {
            $URL=$1;
            $URL=~s/\\u003d/=/g;
            $URL=~s/\\u0026/&/g;
            $confirm='';
            $check=1;
            last;
        }
    }
    close fFILENAME;
    die "Couldn't download the file :-(\n" if ($check==0);
    $URL=~s/confirm=([^;&]+)/confirm=$confirm/ if $confirm ne '';

    execute_command();
}

sub execute_command() {
    $COMMAND="wget --load-cookie $TEMP/cookie.txt --save-cookie $TEMP/cookie.txt \"$URL\"";
    $COMMAND.=" -O \"$FILENAME\"" if $FILENAME ne '';
    `$COMMAND`;
    return 1;
}
--------------------------------------------------------------------------------
/test/test_cnn_text_trainer.py:
--------------------------------------------------------------------------------
import cPickle
import os
from cnn_text_trainer.config.config import get_training_config_from_json
from cnn_text_trainer.core.unichannel.model import TextCNNModelTrainer
from cnn_text_trainer.rw import wordvecs
from cnn_text_trainer.rw.datasets import build_data

__author__ = 'devashish.shankar'

def test_config_reader():
    #TODO improve this test case, probably check if values are actually getting correctly parsed from config
    config = get_training_config_from_json("testConfig.json")
    assert config.mode == "static"
    print config

def test_dataset_reader():
    sentences,vocabs,labels = build_data("../sample/datasets/sst_small_sample.csv")
    assert len(sentences) == 300
    assert len(labels) == 2
    assert "neg" in labels and "pos" in labels

def trainer_helper(configFile,dataSetFile,tempModel):
    print "Training model on ",configFile,dataSetFile
    config = get_training_config_from_json(configFile)
    sentences, vocab, labels = build_data(dataSetFile,True)
    word_vecs = wordvecs.load_wordvecs(config.word2vec,vocab)
    trainer = TextCNNModelTrainer(config,word_vecs,sentences,labels)
    trainer.train(tempModel)
    print "Successfully trained model on ",configFile,dataSetFile," and model is at ",tempModel
    print "Will proceed to test the model on the same data. If everything is correct, you should see the same accuracy"
    model = cPickle.load(open(tempModel,"rb"))
    op = model.classify(sentences)
    os.remove(tempModel)

def test_all_trainers():
    trainer_helper("../sample/configs/sampleMCConfig.json","../sample/datasets/sst_small_sample.csv","tempModel.p")
    trainer_helper("../sample/configs/sampleNonStaticConfig.json","../sample/datasets/sst_small_sample.csv","tempModel.p")
    trainer_helper("../sample/configs/sampleStaticConfig.json","../sample/datasets/sst_small_sample.csv","tempModel.p")
    #TODO validate embeddings change in MC in test case
    #TODO validate if preprocess flag is working
--------------------------------------------------------------------------------
/cnn_text_trainer/rw/wordvecs.py:
--------------------------------------------------------------------------------
import cPickle
import os
import numpy as np

def load_wordvecs_from_binfile(word_vec_file,vocab=None):
    """
    Load word vectors from bin file
    :param word_vec_file: file path
    :param vocab: vocabulary. If not None, only words from this vocab will be loaded
    :return: dictionary of word to word_vector
    """
    with open(word_vec_file, "rb") as f:
        word_vecs = {}
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            # read the word one character at a time, up to the separating space
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if vocab is None or word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs


def load_wordvecs(word_vec_file,vocab=None):
    i = 0
    cwd = os.getcwd()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    while not os.path.isfile(word_vec_file):  #TODO this is a hack. Find better way
        word_vec_file='../'+word_vec_file
        i+=1
        if i==4:
            raise Exception("File "+word_vec_file+" not found. Searched "+str(i)+" levels above the module directory: till "+os.path.abspath(word_vec_file))

    word_vec_file = os.path.abspath(word_vec_file)

    os.chdir(cwd)
    if word_vec_file.endswith('.bin'):
        return load_wordvecs_from_binfile(word_vec_file,vocab)
    else:
        # a pickled model tuple, where model[2] is the word->index map and model[3] the embedding matrix
        model=cPickle.load(open(word_vec_file,"rb"))
        word_idx_map, W = model[2], model[3]
        word_vecs = {}
        for word in word_idx_map:
            word_vecs[word]=W[word_idx_map[word]]
        return word_vecs
--------------------------------------------------------------------------------
/cnn_text_trainer/rw/datasets.py:
--------------------------------------------------------------------------------
from collections import defaultdict
import csv
import re


def build_data(fname,preprocess=True):
    """
    Reads a CSV file with headers 'labels' and 'text' (containing the label string and text respectively)
    and outputs sentences, vocab and labels
    :param fname: file name to read
    :param preprocess: should data be preprocessed
    :return: sentences is a list of dictionaries (a format which NNTrainer accepts) [{'text': , 'y':