├── .gitignore ├── ModelTest.py ├── __init__.py ├── debug.py ├── environ-pku1.sh ├── environ.sh ├── experiments.py ├── models.py ├── real ├── __init__.py ├── baseline │ ├── gen_vocab.sh │ ├── lm_main.sh │ ├── main.sh │ ├── main_kn.sh │ └── sri_env.sh ├── exp_nce0_norm.py ├── exp_nce2_norm.py ├── exp_nce2_zregression.py ├── exp_nce4_norm.py ├── layers.py ├── main_lblv1-pku1.sh ├── main_lblv1.py ├── main_lblv1.sh ├── main_lblv2.py ├── main_lblv2.sh ├── main_nce2.py ├── main_nce2.sh ├── main_nce4.py ├── main_nce4.sh ├── main_nce4_lab_proxy_pku2.sh ├── main_nce4_pku1.sh ├── main_nce4_pku1_proxy_pku2.sh ├── main_nce4_pku2.sh ├── main_nce4_pku3.sh ├── main_nce4_pku3_proxy_pku2.sh ├── main_nce7.py ├── main_nce7.sh ├── main_nce7_pku1.sh ├── main_nce7_pku2.sh ├── main_nce7_pku3.sh ├── main_nce8.py ├── main_nce8.sh ├── main_nce8_nodecay_lab.sh ├── main_nce8_pku1.sh ├── main_nce8_pku2.sh ├── main_nce8_pku3.sh ├── models.py ├── run_batch.sh ├── run_nce0.py ├── run_nce0_default.py ├── run_nce0_neg100_default.py ├── run_nce0_neg50_lr0.005.py ├── run_nce0_neg50_lr0.01.py ├── run_nce0_neg50_lr0.01_g0.001.py ├── run_nce1_neg50_default.py ├── run_nce2.py ├── run_nce2_neg50_lr0.01_g0.001.py ├── run_nce3.py ├── run_nce4.py ├── run_nce5.py ├── run_nce6.py ├── run_tree_huffman_lr0.01_g0.001.py ├── utils │ ├── __init__.py │ ├── check_maps.py │ ├── preprocess.py │ └── utils.py └── workspace │ ├── export_sri_data.py │ ├── extract_learning_curv_data.py │ ├── gen_train_data.py │ └── show_time_loss.m ├── stat ├── get_stat.py ├── read_stats.m └── show_stats.m ├── test ├── __init__.py ├── snippet.py └── test_io.py └── utils ├── __init__.py ├── fake_data.py ├── preprocess.py ├── test.py └── tree_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | brown-cluster 2 | srilm-1.7.1 3 | data 4 | stat/*.fig 5 | stat/*.jpg 6 | .idea 7 | .ipynb_checkpoints 8 | # Created by .ignore support plugin (hsz.mobi) 9 | ### Matlab template 10 | ##--------------------------------------------------- 11 | ## Remove autosaves generated by the Matlab editor 12 | ## We have git for backups! 13 | ##--------------------------------------------------- 14 | 15 | # Windows default autosave extension 16 | *.asv 17 | 18 | # OSX / *nix default autosave extension 19 | *.m~ 20 | 21 | # Compiled MEX binaries (all platforms) 22 | *.mex* 23 | 24 | # Simulink Code Generation 25 | slprj/ 26 | 27 | 28 | ### Python template 29 | # Byte-compiled / optimized / DLL files 30 | __pycache__/ 31 | *.py[cod] 32 | 33 | # C extensions 34 | *.so 35 | 36 | # Distribution / packaging 37 | .Python 38 | env/ 39 | build/ 40 | develop-eggs/ 41 | dist/ 42 | downloads/ 43 | eggs/ 44 | .eggs/ 45 | lib/ 46 | lib64/ 47 | parts/ 48 | sdist/ 49 | var/ 50 | *.egg-info/ 51 | .installed.cfg 52 | *.egg 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | 87 | ### Vim template 88 | [._]*.s[a-w][a-z] 89 | [._]s[a-w][a-z] 90 | *.un~ 91 | Session.vim 92 | .netrwhist 93 | *~ 94 | 95 | 96 | -------------------------------------------------------------------------------- /ModelTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import SimpleLangModel, NCELangModel, NCELangModelV1, TreeLangModel 6 | from keras.layers.core import Dropout, Dense 7 | import os 8 | import logging 9 | import optparse 10 | import cPickle as pickle 11 | import theano 12 | 13 | floatX = theano.config.floatX 14 | 15 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 16 | parser.add_option("-q", "--quiet", 17 | action="store_false", dest="verbose", default=True, 18 | help="don't print progress bar to stdout") 19 | parser.add_option("-s", "--simple", 20 | action="store_true", dest="train_simple", default=False, 21 | help="Train Simple language model") 22 | parser.add_option("-n", "--nce", 23 | action="store_true", dest="train_nce", default=False, 24 | help="Train NCE based language model") 25 | parser.add_option("-c", "--nce1", 26 | action="store_true", dest="train_nce1", default=False, 27 | help="Train NCE based language model V1") 28 | parser.add_option("-b", "--batch-size", type='int', dest="batch_size", default=256, 29 | help="Batch size") 30 | parser.add_option("-t", "--test", 31 | action="store_true", dest="test", default=False, 32 | help="train on small data set") 33 | parser.add_option("-r", "--tree", 34 | action="store_true", dest="tree", default=False, 35 | help="Train hierarchical softmax language model") 36 | parser.add_option("-m", "--tree-type", type='str', 37 | dest="tree_type", default='huffman', 38 | help="Specify the type of the tree") 39 | parser.add_option("-d", "--debug", 40 | action="store_true", dest="debug", default=False, 41 | help="show debug information") 42 | parser.add_option("-g", "--unigram", 43 | action="store_true", dest="unigram", default=False, 44 | help="Whether use unigram distribution for noise samples") 45 | parser.add_option("-z", "--optimizer", type='str', 46 | dest="optimizer", default='adam', 47 | help="Specify optimizer") 48 | parser.add_option('-a', "--attention", dest="attention", type='str', default='none', 49 | help='Specify attention model') 50 | parser.add_option('-l', '--attention-length', dest='att_len', type='int', default=10, 51 | help='Specify attention bias length') 52 | 53 | options, args = parser.parse_args() 54 | # ==================================================================================== 55 | # if TESTLM environment variable is defined, run the program on a small data set. 
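# A typical invocation (hypothetical example; flags as defined by the option parser above) is:
#     TESTLM=1 python ModelTest.py --nce -b 128 --unigram
# which trains the NCE model on the small data set under data/fake/test.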
56 | if os.environ.get('TESTLM') is not None or options.test: 57 | data_path = os.path.abspath('data/fake/test') 58 | else: 59 | data_path = os.path.abspath('data/fake') 60 | #data_path = os.path.abspath('data/fake/test') 61 | 62 | if options.debug: 63 | logging.basicConfig(level=logging.DEBUG) 64 | else: 65 | logging.basicConfig(level=logging.INFO) 66 | 67 | if options.unigram: 68 | import cPickle as pickle 69 | with file(os.path.join(data_path, 'meta.pkl'), 'rb') as mf: 70 | meta = pickle.load(mf) 71 | negprob_table = meta['rel_freq'] 72 | else: 73 | negprob_table = None 74 | 75 | if options.train_simple: 76 | logging.info('Train simple language model') 77 | model = SimpleLangModel(vocab_size=15, embed_dims=128, context_dims=128, optimizer=options.optimizer) 78 | model.compile() 79 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 80 | 81 | if options.train_nce: 82 | logging.info('Train NCE based language model') 83 | model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128, negprob_table=negprob_table, 84 | optimizer=options.optimizer) 85 | model.compile() 86 | logging.debug('compile success') 87 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 88 | 89 | if options.train_nce1: 90 | logging.info('Train NCE based language model (1)') 91 | model = NCELangModelV1(vocab_size=15, nb_negative=6, embed_dims=128, negprob_table=negprob_table, 92 | optimizer=options.optimizer) 93 | model.compile() 94 | logging.debug('compile success') 95 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 96 | 97 | if options.tree: 98 | logging.info('Train hierarchical softmax language model') 99 | 100 | if options.tree_type == 'huffman': 101 | logging.info('train with Huffman Tree') 102 | tree_file = 'data/fake/tree-info-huffman.pkl' 103 | else: 104 | tree_file = 'data/fake/tree-info.pkl' 105 | logging.info('Train with Brown tree') 106 | 107 | with file('data/fake/tree-info.pkl', 'rb') as f: 108 | tree_info = pickle.load(f) 109 | word2cls = tree_info['idx2cls'] 110 | word2bitstr = tree_info['idx2bitstr'].astype(floatX) 111 | 112 | model = TreeLangModel(vocab_size=15, embed_dim=128, cntx_dim=128, word2class=word2cls, word2bitstr=word2bitstr) 113 | model.compile(optimizer=options.optimizer) 114 | logging.debug('compile success') 115 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 116 | 117 | if options.attention == 'simple': 118 | from models import SimpAttLangModel 119 | model = SimpAttLangModel(vocab_size=15, embed_dims=128, context_dim=128, attention_len=options.att_len) 120 | model.compile() 121 | logging.debug('compile success') 122 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 123 | 124 | if options.attention == 'parallel': 125 | from models import ParallelAttLangModel 126 | model = ParallelAttLangModel(vocab_size=15, embed_dims=128, context_dim=128, attention_len=options.att_len) 127 | model.compile() 128 | logging.debug('compile success') 129 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 130 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 
3 | __author__ = 'Yunchuan Chen' 4 | 5 | -------------------------------------------------------------------------------- /debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import TreeLangModel 6 | import cPickle as pickle 7 | import theano 8 | import numpy as np 9 | 10 | floatX = theano.config.floatX 11 | train_file = 'data/fake/001.bz2' 12 | 13 | with file('data/fake/tree-info.pkl', 'rb') as f: 14 | tree_info = pickle.load(f) 15 | word2cls = tree_info['idx2cls'] 16 | word2bitstr = tree_info['idx2bitstr'] 17 | 18 | model = TreeLangModel(vocab_size=15, embed_dim=128, cntx_dim=128, word2class=word2cls, word2bitstr=word2bitstr) 19 | 20 | 21 | X = np.loadtxt(train_file, dtype='int32') 22 | data = X[:256] 23 | del X 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /environ-pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # source ~/bin/ch.gcc-4.8.4.sh 3 | export THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32,nvcc.fastmath=True,scan.allow_gc=True,allow_gc=True 4 | #export PYTHONPATH=/home/cyc/Documents/workspace:$PYTHONPATH 5 | export PYTHONPATH=$HOME/.chen/workspace:$PYTHONPATH 6 | export OMP_NUM_THREADS=4 7 | export SRILM="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 8 | export PATH=$PATH:$SRILM/bin/i686-m64 9 | export MANPATH=$MANPATH:$SRILM/man 10 | export LC_NUMERIC=C 11 | 12 | -------------------------------------------------------------------------------- /environ.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source ~/bin/ch.gcc-4.8.4.sh 3 | export THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32,nvcc.fastmath=True,scan.allow_gc=True,allow_gc=True 4 | export PYTHONPATH=/home/cyc/Documents/workspace:$PYTHONPATH 5 | # export PYTHONPATH=$HOME/.chen/workspace:$PYTHONPATH 6 | export OMP_NUM_THREADS=4 7 | export SRILM="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 8 | export PATH=$PATH:$SRILM/bin/i686-m64 9 | export MANPATH=$MANPATH:$SRILM/man 10 | export LC_NUMERIC=C 11 | 12 | -------------------------------------------------------------------------------- /real/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' -------------------------------------------------------------------------------- /real/baseline/gen_vocab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | for (( nb_vocab=10000; nb_vocab<52000; nb_vocab+=2000 )); do 3 | let file_name=${nb_vocab}/1000 4 | for (( i=0; i<$nb_vocab; ++i )); do 5 | echo $i 6 | done > ../../data/corpus/sri/${file_name}k.vocab 7 | done 8 | 9 | -------------------------------------------------------------------------------- /real/baseline/lm_main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR="../../data/corpus/sri" 4 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V100k.txt" 5 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V100k.txt" 6 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-order4-gt1-3.count" 7 | OOV="900000" 8 | 9 | ngram-count -text $TEXTFILE -order 4 -write-binary $COUNT_FILE \ 10 | -gt1 ${DATA_DIR}/gt1.params \ 11 | -gt2 
${DATA_DIR}/gt2.params \ 12 | -gt3 ${DATA_DIR}/gt3.params 13 | 14 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 15 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt1-3.lm" 16 | echo "Begin: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 17 | 18 | ngram-count -read $COUNT_FILE -vocab ${nb_vocab}k.vocab -unk -map-unk $OOV \ 19 | -order 4 -write-binary-lm -lm $LM \ 20 | -gt1 ${DATA_DIR}/gt1.params \ 21 | -gt2 ${DATA_DIR}/gt2.params \ 22 | -gt3 ${DATA_DIR}/gt3.params 23 | ngram -unk -map-unk $OOV -lm ${LM} -ppl ${TEST_DATA} 24 | 25 | echo "END: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 26 | done 27 | 28 | # for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 29 | # TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 30 | # TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 31 | # for (( kn = 1; kn <=4; kn += 1 )); do 32 | # COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.count" 33 | # LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.lm" 34 | # echo "Begin: Test order 4, kn=${kn}, vocab: ${nb_vocab}" 35 | # ngram-count -text $TEXTFILE -kndiscount $kn -order 4 -write-binary $COUNT_FILE 36 | # ngram-count -read $COUNT_FILE -kn-counts-modified -write-binary-lm -lm $LM 37 | # ngram -lm ${LM} -ppl ${TEST_DATA} 38 | # echo "END: Test order 4, kn=${kn}" 39 | # done 40 | # done 41 | -------------------------------------------------------------------------------- /real/baseline/main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR="../../data/corpus/sri" 4 | 5 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 6 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 7 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 8 | for (( kn = 1; kn <=4; kn += 1 )); do 9 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.count" 10 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.lm" 11 | echo "Begin: Test order 4, kn=${kn}, vocab: ${nb_vocab}" 12 | ngram-count -text $TEXTFILE -kndiscount $kn -order 4 -write-binary $COUNT_FILE 13 | ngram-count -read $COUNT_FILE -kn-counts-modified -write-binary-lm -lm $LM 14 | ngram -lm ${LM} -ppl ${TEST_DATA} 15 | echo "END: Test order 4, kn=${kn}" 16 | done 17 | done 18 | 19 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 20 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 21 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 22 | for (( gt = 1; gt <=4; gt += 1 )); do 23 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.count" 24 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.lm" 25 | PAR="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.gt" 26 | echo "Begin: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 27 | ngram-count -text $TEXTFILE -order 4 -gt${gt} ${PAR} -write-binary $COUNT_FILE 28 | ngram-count -read $COUNT_FILE -write-binary-lm -lm ${LM} -gt${gt} ${PAR} 29 | ngram -lm ${LM} -ppl ${TEST_DATA} 30 | echo "END: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 31 | done 32 | done -------------------------------------------------------------------------------- /real/baseline/main_kn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: prog logfile 3 | DATA_DIR="../../data/corpus/sri" 4 | TEXTFILE="$DATA_DIR/wiki-trn-R100m.txt" 5 | TESTFILE="$DATA_DIR/wiki-val-R5m.txt" 6 | 7 | KnParams="-kn1 ${DATA_DIR}/wiki-kn1.param -kn2 ${DATA_DIR}/wiki-kn2.param -kn3 
${DATA_DIR}/wiki-kn3.param -kn4 ${DATA_DIR}/wiki-kn4.param" 8 | CommParams="-unk -map-unk 10000000" 9 | ngram-count -order 4 -text $TEXTFILE $KnParams 10 | 11 | for (( nb_vocab=10; nb_vocab<=50; nb_vocab+=2 )); do 12 | NGRAMS=$DATA_DIR/wiki-V${nb_vocab}k.4grams 13 | VOCAB=$DATA_DIR/${nb_vocab}k.vocab 14 | 15 | ngram-count -order 4 -text $TEXTFILE $CommParams -vocab $VOCAB -write-binary $NGRAMS 16 | for order in 2 3 4; do 17 | LM=$DATA_DIR/wiki-V${nb_vocab}k-order${order}.lm 18 | ngram-count -order $order -read $NGRAMS $CommParams \ 19 | -kndiscount${order} -write-binary-lm -lm $LM -vocab $VOCAB $KnParams 20 | 21 | echo "PPL Results for V=$nb_vocab and order=$order: " | tee -a $1 22 | ngram -lm $LM -ppl $TESTFILE $CommParams | tee -a $1 23 | done 24 | done -------------------------------------------------------------------------------- /real/baseline/sri_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SRILM="$( cd "../../$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 3 | export PATH=$PATH:$SRILM/bin/i686-m64 4 | export MANPATH=$MANPATH:$SRILM/man 5 | export LC_NUMERIC=C -------------------------------------------------------------------------------- /real/exp_nce0_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | # noinspection PyUnresolvedReferences 9 | from models import LangModel, Graph, optimizers, categorical_crossentropy, objective_fnc, slice_X, \ 10 | np, theano, TableSampler, Split, containers, T, LookupProb, logger, Embedding, math, make_batches, \ 11 | PartialSoftmax, Dense, LogInfo, MAX_SETN_LEN, grouped_sentences, time, chunk_sentences, LangLSTMLayer 12 | from layers import ActivationLayer 13 | 14 | 15 | class NCELangModelV0(Graph, LangModel): 16 | def __init__(self, vocab_size, nb_negative, embed_dims=128, context_dims=128, 17 | negprob_table=None, optimizer='adam'): 18 | super(NCELangModelV0, self).__init__(weighted_inputs=False) 19 | self.vocab_size = vocab_size 20 | self.embed_dim = embed_dims 21 | self.optimizer = optimizers.get(optimizer) 22 | self.nb_negative = nb_negative 23 | self.loss = categorical_crossentropy 24 | self.loss_fnc = objective_fnc(self.loss) 25 | 26 | if negprob_table is None: 27 | negprob_table_ = np.ones(shape=(vocab_size,), dtype=theano.config.floatX)/vocab_size 28 | negprob_table = theano.shared(negprob_table_) 29 | self.neg_prob_table = negprob_table_ 30 | else: 31 | self.neg_prob_table = negprob_table.astype(theano.config.floatX) 32 | negprob_table = theano.shared(negprob_table.astype(theano.config.floatX)) 33 | 34 | self.sampler = TableSampler(self.neg_prob_table) 35 | 36 | self.add_input(name='idxes', ndim=3, dtype='int32') 37 | self.add_node(Split(split_at=1, split_axis=0), name=('pos_sents', ''), inputs='idxes') 38 | 39 | seq = containers.Sequential() 40 | seq.add(self.nodes['pos_sents']) 41 | seq.add(Embedding(vocab_size, embed_dims)) 42 | seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims)) 43 | # seq.add(Dropout(0.5)) 44 | 45 | self.add_node(seq, name='seq') 46 | self.add_node(PartialSoftmax(input_dim=context_dims, output_dim=vocab_size), 47 | name='part_prob', inputs=('idxes', 'seq')) 48 | self.add_node(LookupProb(negprob_table), name='lookup_prob', inputs='idxes') 49 | 50 | test_node = 
Dense(input_dim=context_dims, output_dim=vocab_size, activation='exponential') 51 | test_node.params = [] 52 | test_node.W = self.nodes['part_prob'].W 53 | test_node.b = self.nodes['part_prob'].b 54 | self.add_node(test_node, name='true_unrm_prob', inputs='seq') 55 | self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob') 56 | 57 | self.add_output('pos_prob', node='part_prob') 58 | self.add_output('neg_prob', node='lookup_prob') 59 | self.add_output('pred_prob', node='true_prob') 60 | self.add_output('unrm_prob', node='true_unrm_prob') 61 | 62 | # noinspection PyMethodOverriding 63 | def compile(self): 64 | pos_prob_layer = self.outputs['pos_prob'] 65 | neg_prob_layer = self.outputs['neg_prob'] 66 | pre_prob_layer = self.outputs['pred_prob'] 67 | unrm_pro_layer = self.outputs['unrm_prob'] 68 | 69 | pos_prob_trn = pos_prob_layer.get_output(train=True) 70 | neg_prob_trn = neg_prob_layer.get_output(train=True) * self.nb_negative 71 | pos_prob_tst = pos_prob_layer.get_output(train=False) 72 | neg_prob_tst = neg_prob_layer.get_output(train=False) * self.nb_negative 73 | pre_prob_tst = pre_prob_layer.get_output(train=False) 74 | unrm_pro_tst = unrm_pro_layer.get_output(train=False) 75 | 76 | partition = T.sum(unrm_pro_tst, axis=-1) 77 | sum_unrm = T.sum(partition) 78 | squre_urm = T.sum(partition * partition) 79 | 80 | eps = 1.0e-37 81 | #TODO: mask not supported here 82 | nb_words = pos_prob_trn[0].size.astype(theano.config.floatX) 83 | sum_pos_neg_trn = pos_prob_trn + neg_prob_trn 84 | sum_pos_neg_tst = pos_prob_tst + neg_prob_tst 85 | y_train = T.sum(T.log(eps + pos_prob_trn[0] / sum_pos_neg_trn[0])) / nb_words 86 | y_train += T.sum(T.log(eps + neg_prob_trn[1:] / sum_pos_neg_trn[1:])) / nb_words 87 | y_test = T.sum(T.log(eps + pos_prob_tst[0] / sum_pos_neg_tst[0])) / nb_words 88 | y_test += T.sum(T.log(eps + neg_prob_tst[1:] / sum_pos_neg_tst[1:])) / nb_words 89 | 90 | true_labels = self.inputs['idxes'].get_output()[0] 91 | encode_len, nb_words = self.encode_length(true_labels, pre_prob_tst) 92 | 93 | train_loss = -y_train 94 | test_loss = -y_test 95 | for r in self.regularizers: 96 | train_loss = r(train_loss) 97 | updates = self.optimizer.get_updates(self.params, self.constraints, train_loss) 98 | updates += self.updates 99 | 100 | self._train = theano.function([self.inputs['idxes'].get_output(True)], outputs=train_loss, 101 | updates=updates) 102 | self._test = theano.function([self.inputs['idxes'].get_output(False)], 103 | outputs=[test_loss, encode_len, nb_words, sum_unrm, squre_urm]) 104 | 105 | self._train.out_labels = ('loss', ) 106 | self._test.out_labels = ('loss', 'encode_len', 'nb_words') 107 | self.all_metrics = ['loss', 'val_loss', 'val_ppl'] 108 | 109 | def __summarize_outputs(outs, batch_sizes): 110 | """ 111 | :param outs: outputs of the _test* function. It is a list, and each element a list of 112 | values of the outputs of the _test* function on corresponding batch. 113 | :type outs: list 114 | :param batch_sizes: batch sizes. A list with the same length with outs. Each element 115 | is a size of corresponding batch. 116 | :type batch_sizes: list 117 | Aggregate outputs of batches as if the test function evaluates 118 | the metric values on the union of the batches. 
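For example (hypothetical numbers), batch losses [2.0, 4.0] over batches of
size [100, 300] summarize to (2.0*100 + 4.0*300)/400 = 3.5, whereas
encode_len, nb_words and the partition sums are simply added up.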
119 | Note this function must be redefined for each specific problem 120 | """ 121 | out = np.array(outs, dtype=theano.config.floatX) 122 | loss, encode_len, nb_words, sum_unrm, squre_urm = out 123 | batch_size = np.array(batch_sizes, dtype=theano.config.floatX) 124 | 125 | smry_loss = np.sum(loss * batch_size)/batch_size.sum() 126 | smry_encode_len = encode_len.sum() 127 | smry_nb_words = nb_words.sum() 128 | smry_unrm = sum_unrm.sum() 129 | smry_sq_unrm = squre_urm.sum() 130 | return [smry_loss, smry_encode_len, smry_nb_words, smry_unrm, smry_sq_unrm] 131 | 132 | self._test.summarize_outputs = __summarize_outputs 133 | 134 | def negative_sample(self, X, order=0): 135 | if order == 0: 136 | ret = np.empty(shape=(self.nb_negative+1,) + X.shape, dtype=X.dtype) 137 | ret[0] = X 138 | ret[1:] = self.sampler.sample(shape=ret[1:].shape) 139 | else: 140 | raise NotImplementedError('Only support order=0 now') 141 | return ret 142 | 143 | def _loop_train(self, data, batch_size): 144 | nb = data.shape[1] 145 | nb_words = data[0].size 146 | loss = 0.0 147 | for start in xrange(0, nb, batch_size): 148 | end = start + batch_size 149 | ins = data[:, start:end] 150 | loss_ = self._train(ins) 151 | loss += loss_ * ins[0].size 152 | 153 | loss /= nb_words 154 | return loss 155 | 156 | def train(self, data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=None, 157 | batch_size=256, train_nb_words=100000000, val_nb_words=100000, train_val_nb=100000, 158 | validation_interval=1800, log_file=None): 159 | opt_info = self.optimizer.get_config() 160 | opt_info = ', '.join(["{}: {}".format(n, v) for n, v in opt_info.items()]) 161 | 162 | logger.info('training with file: %s' % data_file) 163 | logger.info('training with batch size %d' % batch_size) 164 | logger.info('training with %d words; validate with %d words during training; ' 165 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 166 | logger.info('validate every %f seconds' % float(validation_interval)) 167 | logger.info('optimizer: %s' % opt_info) 168 | 169 | log_file = LogInfo(log_file) 170 | log_file.info('training with file: %s' % data_file) 171 | log_file.info('training with batch size %d' % batch_size) 172 | log_file.info('training with %d words; validate with %d words during training; ' 173 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 174 | log_file.info('validate every %f seconds' % float(validation_interval)) 175 | log_file.info('optimizer: %s' % opt_info) 176 | 177 | sentences = [None for _ in range(MAX_SETN_LEN)] # TODO: sentences longer than 64 are ignored. 178 | 179 | max_vocab = self.vocab_size - 1 180 | nb_trained = 0. 
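# running counters: nb_trained accumulates sentences and nb_words_trained accumulates
# words; both feed the speed and ETA estimates reported in the progress log below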
181 | nb_words_trained = 0.0 182 | sent_gen = grouped_sentences(data_file) 183 | val_sents = self.get_val_data(sent_gen, val_nb_words) 184 | train_val_sents = self.get_val_data(sent_gen, train_val_nb) 185 | 186 | self.validation(train_val_sents, batch_size, log_file) 187 | start_ = time() 188 | next_val_time = start_ + validation_interval 189 | for sents in sent_gen: 190 | mask = (sents > max_vocab) 191 | sents[mask] = max_vocab 192 | chunk = chunk_sentences(sentences, sents, batch_size) 193 | if chunk is None: 194 | continue 195 | 196 | # loss, ce, nb_wrd = self._train(chunk, chunk) 197 | x = self.negative_sample(chunk) 198 | loss = self._loop_train(x, batch_size) 199 | nb_trained += chunk.shape[0] 200 | nb_words_trained += chunk.size 201 | end_ = time() 202 | elapsed = float(end_ - start_) 203 | speed1 = nb_trained/elapsed 204 | speed2 = nb_words_trained/elapsed 205 | eta = (train_nb_words - nb_words_trained) / speed2 206 | eta_h = int(math.floor(eta/3600)) 207 | eta_m = int(math.ceil((eta - eta_h * 3600)/60.)) 208 | logger.info('%s:Train - ETA: %02d:%02d - loss: %5.1f - speed: %.1f sent/s %.1f words/s' % 209 | (self.__class__.__name__, eta_h, eta_m, loss, speed1, speed2)) 210 | log_file.info('%s:Train - time: %f - loss: %.6f' % (self.__class__.__name__, end_, loss)) 211 | 212 | if end_ > next_val_time: 213 | # noinspection PyUnresolvedReferences 214 | self.validation(train_val_sents, batch_size, log_file) 215 | next_val_time = time() + validation_interval 216 | 217 | if nb_words_trained >= train_nb_words: 218 | logger.info('Training finished. Evaluating ...') 219 | log_file.info('Training finished. Evaluating ...') 220 | self.validation(val_sents, batch_size, log_file) 221 | if save_path is not None: 222 | self.save_params(save_path) 223 | break 224 | 225 | log_file.close() 226 | 227 | def validation(self, val_sents, batch_size, log_file=None): 228 | """ 229 | :param val_sents: validation sentences. 230 | :type val_sents: a list, each element a ndarray 231 | :return: tuple 232 | """ 233 | code_len = 0. 234 | nb_words = 0. 
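# accumulators over the validation batches: loss is re-weighted by word count,
# code_len/nb_words gives the perplexity exponent, and unrm/sq_unrm track the
# mean and spread of the un-normalized softmax partition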
235 | loss = 0.0 236 | unrm = 0.0 237 | sq_unrm = 0.0 238 | 239 | for sents in val_sents: 240 | x = [self.negative_sample(sents)] 241 | loss_, code_len_, nb_words_, unrm_, sq_unrm_ = self._test_loop(self._test, x, batch_size) 242 | nb_words += nb_words_ 243 | code_len += code_len_ 244 | loss += loss_ * nb_words_ 245 | unrm += unrm_ 246 | sq_unrm += sq_unrm_ 247 | 248 | loss /= nb_words 249 | ppl = math.exp(code_len/nb_words) 250 | mean_unrm = unrm / nb_words 251 | mean_sq_unrm = sq_unrm / nb_words 252 | std_unrm = mean_sq_unrm - mean_unrm * mean_unrm 253 | logger.info('%s:Val val_loss: %.2f - val_ppl: %.2f - partition: mean: %.2f std: %.2f' % 254 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm)) 255 | log_file.info('%s:Val val_loss: %.6f - val_ppl: %.6f - partition: mean: %.6f std: %.6f' % 256 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm)) 257 | 258 | return loss, ppl, mean_unrm, std_unrm 259 | 260 | @staticmethod 261 | def _test_loop(f, ins, batch_size=128, verbose=0): 262 | nb_sample = ins[0].shape[1] 263 | outs = [[] for _ in range(f.n_returned_outputs)] 264 | batch_info = [] 265 | batches = make_batches(nb_sample, batch_size) 266 | for batch_index, (batch_start, batch_end) in enumerate(batches): 267 | ins_batch = slice_X(ins, start_=batch_start, end_=batch_end, axis=1) 268 | batch_outs = f(*ins_batch) 269 | for idx, v in enumerate(batch_outs): 270 | outs[idx].append(v) 271 | batch_info.append(batch_end - batch_start) 272 | 273 | outs = f.summarize_outputs(outs, batch_info) 274 | return outs 275 | 276 | 277 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 278 | NB_RUN_WORDS = 100000000 279 | NB_VOCAB = 10000 280 | NB_RUN_VAL = 100000 281 | NB_EVALUATE = 5000000 282 | BATCH_SIZE = 256 283 | 284 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 285 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 286 | help="learning rate") 287 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 288 | help="amount of training data (number of words)") 289 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 290 | help="vocabulary size") 291 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 292 | help="running validation words") 293 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 294 | help="running validation words") 295 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 296 | help="decaying rate") 297 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 298 | help="decaying rate") 299 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 300 | help="decay lr or not") 301 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 302 | help="amount of training data (number of words)") 303 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 304 | help="amount of training data (number of words)") 305 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 306 | help="amount of training data (number of words)") 307 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 308 | help="amount of training data (number of words)") 309 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 310 | help="decaying rate") 311 | 
parser.add_option("-s", "--save", type="str", dest="save", default='', 312 | help="amount of training data (number of words)") 313 | options, args = parser.parse_args() 314 | 315 | nb_run_words = options.running_words 316 | nb_vocab = options.vocab_size 317 | nb_run_val = options.val_run 318 | nb_evaluate = options.nb_evaluation 319 | 320 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 321 | 322 | if options.decay: 323 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 324 | else: 325 | opt = adam(lr=options.lr) 326 | 327 | if options.log_file == '': 328 | log_file = None 329 | else: 330 | log_file = options.log_file 331 | 332 | if options.save == '': 333 | save_path = None 334 | else: 335 | save_path = options.save 336 | 337 | model = NCELangModelV0(vocab_size=nb_vocab, nb_negative=options.negative, 338 | embed_dims=options.embed_size, context_dims=options.context_size, 339 | negprob_table=unigram_table, optimizer=opt) 340 | model.compile() 341 | model.train(data_file=DATA_PATH, 342 | save_path=save_path, 343 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 344 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 345 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/exp_nce2_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from keras.optimizers import AdamAnneal, adam 7 | import optparse 8 | from layers import ActivationLayer 9 | # noinspection PyUnresolvedReferences 10 | from models import Graph, LangModel, LogInfo, optimizers, categorical_crossentropy, \ 11 | objective_fnc, np, theano, T, TableSampler, logger, grouped_sentences, MAX_SETN_LEN, \ 12 | chunk_sentences, time, math, make_batches, slice_X, containers, Embedding, PartialSoftmax,\ 13 | Split, LangLSTMLayer, LookupProb, Dense 14 | 15 | 16 | class NCELangModelV2(Graph, LangModel): 17 | def __init__(self, vocab_size, nb_negative, embed_dims=128, context_dims=128, 18 | negprob_table=None, optimizer='adam'): 19 | super(NCELangModelV2, self).__init__(weighted_inputs=False) 20 | self.vocab_size = vocab_size 21 | self.embed_dim = embed_dims 22 | self.optimizer = optimizers.get(optimizer) 23 | self.nb_negative = nb_negative 24 | self.loss = categorical_crossentropy 25 | self.loss_fnc = objective_fnc(self.loss) 26 | 27 | if negprob_table is None: 28 | negprob_table_ = np.ones(shape=(vocab_size,), dtype=theano.config.floatX)/vocab_size 29 | negprob_table = theano.shared(negprob_table_) 30 | self.neg_prob_table = negprob_table_ 31 | else: 32 | self.neg_prob_table = negprob_table.astype(theano.config.floatX) 33 | negprob_table = theano.shared(negprob_table.astype(theano.config.floatX)) 34 | 35 | self.sampler = TableSampler(self.neg_prob_table) 36 | 37 | self.add_input(name='idxes', ndim=3, dtype='int32') 38 | self.add_node(Split(split_at=1, split_axis=0), name=('pos_sents', ''), inputs='idxes') 39 | 40 | seq = containers.Sequential() 41 | seq.add(self.nodes['pos_sents']) 42 | seq.add(Embedding(vocab_size, embed_dims)) 43 | seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims)) 44 | # seq.add(Dropout(0.5)) 45 | 46 | self.add_node(seq, name='seq') 47 | self.add_node(PartialSoftmax(input_dim=context_dims, output_dim=vocab_size), 48 | name='part_prob', inputs=('idxes', 'seq')) 49 | self.add_node(Dense(input_dim=context_dims, 
output_dim=1, activation='exponential'), 50 | name='normalizer', inputs='seq') 51 | self.add_node(LookupProb(negprob_table), name='lookup_prob', inputs='idxes') 52 | 53 | test_node = Dense(input_dim=context_dims, output_dim=vocab_size, activation='exponential') 54 | test_node.params = [] 55 | test_node.W = self.nodes['part_prob'].W 56 | test_node.b = self.nodes['part_prob'].b 57 | self.add_node(test_node, name='true_unrm_prob', inputs='seq') 58 | # self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob') 59 | 60 | self.add_output('pos_prob', node='part_prob') 61 | self.add_output('neg_prob', node='lookup_prob') 62 | # self.add_output('pred_prob', node='true_prob') 63 | self.add_output('normalizer', node='normalizer') 64 | self.add_output('unrm_prob', node='true_unrm_prob') 65 | 66 | # noinspection PyMethodOverriding 67 | def compile(self): 68 | pos_prob_layer = self.outputs['pos_prob'] 69 | neg_prob_layer = self.outputs['neg_prob'] 70 | # pre_prob_layer = self.outputs['pred_prob'] 71 | normlzer_layer = self.outputs['normalizer'] 72 | unrm_pro_layer = self.outputs['unrm_prob'] 73 | 74 | 75 | pos_prob_trn = pos_prob_layer.get_output(train=True) 76 | neg_prob_trn = neg_prob_layer.get_output(train=True) * self.nb_negative 77 | pos_prob_tst = pos_prob_layer.get_output(train=False) 78 | neg_prob_tst = neg_prob_layer.get_output(train=False) * self.nb_negative 79 | # pre_prob_tst = pre_prob_layer.get_output(train=False) 80 | unrm_pro_tst = unrm_pro_layer.get_output(train=False) 81 | 82 | nrm_const = normlzer_layer.get_output(train=True) 83 | nrm_const = T.reshape(nrm_const, (nrm_const.shape[0], nrm_const.shape[1])) 84 | nrm_const = nrm_const.dimshuffle('x', 0, 1) 85 | pos_prob_trn *= nrm_const 86 | 87 | nrm_const_tst_ = normlzer_layer.get_output(train=False) 88 | nrm_const_tst = T.reshape(nrm_const_tst_, (nrm_const_tst_.shape[0], nrm_const_tst_.shape[1])) 89 | nrm_const_tst = nrm_const_tst.dimshuffle('x', 0, 1) 90 | pos_prob_tst *= nrm_const_tst 91 | 92 | true_nrm = T.sum(unrm_pro_tst, axis=-1, keepdims=True) 93 | pre_prob_tst = unrm_pro_tst / true_nrm 94 | 95 | unrm_pro_tst *= T.addbroadcast(nrm_const_tst_, 2) 96 | partition = T.sum(unrm_pro_tst, axis=-1) 97 | sum_unrm = T.sum(partition) 98 | squre_urm = T.sum(partition * partition) 99 | 100 | eps = 1.0e-37 101 | z = 1./(nrm_const_tst.ravel() + eps) 102 | z_pred = T.sum(z) 103 | z_true = T.sum(true_nrm.ravel()) 104 | z_err = T.sum(T.abs_(z - true_nrm.ravel())) 105 | z_sq = T.sum(true_nrm * true_nrm) 106 | 107 | #TODO: mask not supported here 108 | nb_words = pos_prob_trn[0].size.astype(theano.config.floatX) 109 | sum_pos_neg_trn = pos_prob_trn + neg_prob_trn 110 | sum_pos_neg_tst = pos_prob_tst + neg_prob_tst 111 | y_train = T.sum(T.log(eps + pos_prob_trn[0] / sum_pos_neg_trn[0])) / nb_words 112 | y_train += T.sum(T.log(eps + neg_prob_trn[1:] / sum_pos_neg_trn[1:])) / nb_words 113 | y_test = T.sum(T.log(eps + pos_prob_tst[0] / sum_pos_neg_tst[0])) / nb_words 114 | y_test += T.sum(T.log(eps + neg_prob_tst[1:] / sum_pos_neg_tst[1:])) / nb_words 115 | 116 | true_labels = self.inputs['idxes'].get_output()[0] 117 | encode_len, nb_words = self.encode_length(true_labels, pre_prob_tst) 118 | 119 | train_loss = -y_train 120 | test_loss = -y_test 121 | for r in self.regularizers: 122 | train_loss = r(train_loss) 123 | updates = self.optimizer.get_updates(self.params, self.constraints, train_loss) 124 | updates += self.updates 125 | 126 | self._train = theano.function([self.inputs['idxes'].get_output(True)], 
outputs=train_loss, 127 | updates=updates) 128 | self._test = theano.function([self.inputs['idxes'].get_output(False)], 129 | outputs=[test_loss, encode_len, nb_words, sum_unrm, 130 | squre_urm, z_pred, z_true, z_err, z_sq]) 131 | 132 | self._train.out_labels = ('loss', ) 133 | self._test.out_labels = ('loss', 'encode_len', 'nb_words', 'unrm', 'square_unrm', 134 | 'z_pred', 'z_true', 'z_err', 'z_sq') 135 | self.all_metrics = ['loss', 'val_loss', 'val_ppl'] 136 | 137 | def __summarize_outputs(outs, batch_sizes): 138 | """ 139 | :param outs: outputs of the _test* function. It is a list, and each element a list of 140 | values of the outputs of the _test* function on corresponding batch. 141 | :type outs: list 142 | :param batch_sizes: batch sizes. A list with the same length with outs. Each element 143 | is a size of corresponding batch. 144 | :type batch_sizes: list 145 | Aggregate outputs of batches as if the test function evaluates 146 | the metric values on the union of the batches. 147 | Note this function must be redefined for each specific problem 148 | """ 149 | out = np.array(outs, dtype=theano.config.floatX) 150 | loss, encode_len, nb_words, unrm, sq_urm, z_pred, z_true, z_err, z_sq = out 151 | batch_size = np.array(batch_sizes, dtype=theano.config.floatX) 152 | 153 | smry_loss = np.sum(loss * batch_size)/batch_size.sum() 154 | smry_encode_len = encode_len.sum() 155 | smry_nb_words = nb_words.sum() 156 | smry_sum_urm = unrm.sum() 157 | smry_sq_urm = sq_urm.sum() 158 | smry_z_pred = z_pred.sum() 159 | smry_z_true = z_true.sum() 160 | smry_z_err = z_err.sum() 161 | smry_z_sq = z_sq.sum() 162 | return [smry_loss, smry_encode_len, smry_nb_words, smry_sum_urm, 163 | smry_sq_urm, smry_z_pred, smry_z_true, smry_z_err, smry_z_sq] 164 | 165 | self._test.summarize_outputs = __summarize_outputs 166 | 167 | def negative_sample(self, X, order=0): 168 | if order == 0: 169 | ret = np.empty(shape=(self.nb_negative+1,) + X.shape, dtype=X.dtype) 170 | ret[0] = X 171 | ret[1:] = self.sampler.sample(shape=ret[1:].shape) 172 | else: 173 | raise NotImplementedError('Only support order=0 now') 174 | return ret 175 | 176 | def _loop_train(self, data, batch_size): 177 | nb = data.shape[1] 178 | nb_words = data[0].size 179 | loss = 0.0 180 | for start in xrange(0, nb, batch_size): 181 | end = start + batch_size 182 | ins = data[:, start:end] 183 | loss_ = self._train(ins) 184 | loss += loss_ * ins[0].size 185 | 186 | loss /= nb_words 187 | return loss 188 | 189 | def train(self, data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=None, 190 | batch_size=256, train_nb_words=100000000, val_nb_words=100000, train_val_nb=100000, 191 | validation_interval=1800, log_file=None): 192 | opt_info = self.optimizer.get_config() 193 | opt_info = ', '.join(["{}: {}".format(n, v) for n, v in opt_info.items()]) 194 | 195 | logger.info('training with file: %s' % data_file) 196 | logger.info('training with batch size %d' % batch_size) 197 | logger.info('training with %d words; validate with %d words during training; ' 198 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 199 | logger.info('validate every %f seconds' % float(validation_interval)) 200 | logger.info('optimizer: %s' % opt_info) 201 | 202 | log_file = LogInfo(log_file) 203 | log_file.info('training with file: %s' % data_file) 204 | log_file.info('training with batch size %d' % batch_size) 205 | log_file.info('training with %d words; validate with %d words during training; ' 206 | 'evaluate with %d words after 
training' % (train_nb_words, train_val_nb, val_nb_words)) 207 | log_file.info('validate every %f seconds' % float(validation_interval)) 208 | log_file.info('optimizer: %s' % opt_info) 209 | 210 | sentences = [None for _ in range(MAX_SETN_LEN)] # TODO: sentences longer than 64 are ignored. 211 | 212 | max_vocab = self.vocab_size - 1 213 | nb_trained = 0. 214 | nb_words_trained = 0.0 215 | sent_gen = grouped_sentences(data_file) 216 | val_sents = self.get_val_data(sent_gen, val_nb_words) 217 | train_val_sents = self.get_val_data(sent_gen, train_val_nb) 218 | 219 | self.validation(train_val_sents, batch_size, log_file) 220 | start_ = time() 221 | next_val_time = start_ + validation_interval 222 | for sents in sent_gen: 223 | mask = (sents > max_vocab) 224 | sents[mask] = max_vocab 225 | chunk = chunk_sentences(sentences, sents, batch_size) 226 | if chunk is None: 227 | continue 228 | 229 | # loss, ce, nb_wrd = self._train(chunk, chunk) 230 | x = self.negative_sample(chunk) 231 | loss = self._loop_train(x, batch_size) 232 | nb_trained += chunk.shape[0] 233 | nb_words_trained += chunk.size 234 | end_ = time() 235 | elapsed = float(end_ - start_) 236 | speed1 = nb_trained/elapsed 237 | speed2 = nb_words_trained/elapsed 238 | eta = (train_nb_words - nb_words_trained) / speed2 239 | eta_h = int(math.floor(eta/3600)) 240 | eta_m = int(math.ceil((eta - eta_h * 3600)/60.)) 241 | logger.info('%s:Train - ETA: %02d:%02d - loss: %5.1f - speed: %.1f sent/s %.1f words/s' % 242 | (self.__class__.__name__, eta_h, eta_m, loss, speed1, speed2)) 243 | log_file.info('%s:Train - time: %f - loss: %.6f' % (self.__class__.__name__, end_, loss)) 244 | 245 | if end_ > next_val_time: 246 | # noinspection PyUnresolvedReferences 247 | self.validation(train_val_sents, batch_size, log_file) 248 | next_val_time = time() + validation_interval 249 | 250 | if nb_words_trained >= train_nb_words: 251 | logger.info('Training finished. Evaluating ...') 252 | log_file.info('Training finished. Evaluating ...') 253 | self.validation(val_sents, batch_size, log_file) 254 | if save_path is not None: 255 | self.save_params(save_path) 256 | break 257 | log_file.close() 258 | 259 | def validation(self, val_sents, batch_size, log_file=None): 260 | """ 261 | :param val_sents: validation sentences. 262 | :type val_sents: a list, each element a ndarray 263 | :return: tuple 264 | """ 265 | code_len = 0. 266 | nb_words = 0. 
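# accumulators over the validation batches: besides loss, perplexity and the
# partition statistics, this variant also compares the learned normalizer
# (z_pred) against the true partition (z_true) and records their absolute error (z_err)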
267 | loss = 0.0 268 | unrm = 0.0 269 | sq_unrm = 0.0 270 | z_pred = 0.0 271 | z_true = 0.0 272 | z_err = 0.0 273 | z_sq = 0.0 274 | 275 | for sents in val_sents: 276 | x = [self.negative_sample(sents)] 277 | loss_, code_len_, nb_words_, unrm_, sq_unrm_, z_pred_, z_true_, z_err_, z_sq_ = \ 278 | self._test_loop(self._test, x, batch_size) 279 | nb_words += nb_words_ 280 | code_len += code_len_ 281 | loss += loss_ * nb_words_ 282 | unrm += unrm_ 283 | sq_unrm += sq_unrm_ 284 | z_pred += z_pred_ 285 | z_true += z_true_ 286 | z_err += z_err_ 287 | z_sq += z_sq_ 288 | 289 | loss /= nb_words 290 | ppl = math.exp(code_len/nb_words) 291 | mean_unrm = unrm / nb_words 292 | mean_sq_unrm = sq_unrm / nb_words 293 | std_unrm = mean_sq_unrm - mean_unrm * mean_unrm 294 | z_pred /= nb_words 295 | z_true /= nb_words 296 | z_err /= nb_words 297 | mean_sq_z = z_sq / nb_words 298 | std_z = mean_sq_z - z_true * z_true 299 | logger.info('%s:Val val_loss: %.2f - val_ppl: %.2f - partition: mean: %.2f std: %.2f - ' 300 | 'z: pred: %.2f true: %.2f err: %.2f std: %.2f' % 301 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm, z_pred, z_true, z_err, std_z)) 302 | log_file.info('%s:Val val_loss: %.6f - val_ppl: %.6f - partition: mean: %.6f std: %.6f - ' 303 | 'z: pred: %.6f true: %.6f err: %.6f std: %.6f' % 304 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm, z_pred, z_true, z_err, std_z)) 305 | 306 | return loss, ppl, mean_unrm, std_unrm, z_pred, z_true, z_err 307 | 308 | @staticmethod 309 | def _test_loop(f, ins, batch_size=128, verbose=0): 310 | nb_sample = ins[0].shape[1] 311 | outs = [[] for _ in range(f.n_returned_outputs)] 312 | batch_info = [] 313 | batches = make_batches(nb_sample, batch_size) 314 | for batch_index, (batch_start, batch_end) in enumerate(batches): 315 | ins_batch = slice_X(ins, start_=batch_start, end_=batch_end, axis=1) 316 | batch_outs = f(*ins_batch) 317 | for idx, v in enumerate(batch_outs): 318 | outs[idx].append(v) 319 | batch_info.append(batch_end - batch_start) 320 | 321 | outs = f.summarize_outputs(outs, batch_info) 322 | return outs 323 | 324 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 325 | NB_RUN_WORDS = 100000000 326 | NB_VOCAB = 10000 327 | NB_RUN_VAL = 100000 328 | NB_EVALUATE = 5000000 329 | BATCH_SIZE = 256 330 | 331 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 332 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 333 | help="learning rate") 334 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 335 | help="amount of training data (number of words)") 336 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 337 | help="vocabulary size") 338 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 339 | help="running validation words") 340 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 341 | help="running validation words") 342 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 343 | help="decaying rate") 344 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 345 | help="decaying rate") 346 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 347 | help="decay lr or not") 348 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 349 | help="amount of training data (number of words)") 350 | parser.add_option("-C", 
"--context-size", type="int", dest="context_size", default=128, 351 | help="amount of training data (number of words)") 352 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 353 | help="amount of training data (number of words)") 354 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 355 | help="amount of training data (number of words)") 356 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 357 | help="decaying rate") 358 | parser.add_option("-s", "--save", type="str", dest="save", default='', 359 | help="amount of training data (number of words)") 360 | options, args = parser.parse_args() 361 | 362 | nb_run_words = options.running_words 363 | nb_vocab = options.vocab_size 364 | nb_run_val = options.val_run 365 | nb_evaluate = options.nb_evaluation 366 | 367 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 368 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 369 | nb_vocab) 370 | if options.decay: 371 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 372 | else: 373 | opt = adam(lr=options.lr) 374 | 375 | if options.log_file == '': 376 | log_file = None 377 | else: 378 | log_file = options.log_file 379 | 380 | if options.save == '': 381 | save_path = None 382 | else: 383 | save_path = options.save 384 | 385 | model = NCELangModelV2(vocab_size=nb_vocab, nb_negative=options.negative, 386 | embed_dims=options.embed_size, context_dims=options.context_size, 387 | negprob_table=unigram_table, optimizer=opt) 388 | model.compile() 389 | model.train(data_file=DATA_PATH, 390 | save_path=save_path, 391 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 392 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 393 | validation_interval=options.interval, log_file=log_file) 394 | -------------------------------------------------------------------------------- /real/main_lblv1-pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv1.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=5 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.02' 19 | lr_min='0.002' 20 | gamma='0.03' 21 | for ((nb_vocab=10000; nb_vocab<32000; nb_vocab+=2000)); do 22 | log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 23 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} --lr-min=${lr_min} \ 25 | -d --gamma=${gamma} \ 26 | --log-file $log_file \ 27 | -D $data_file -V $nb_vocab " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 40 31 | done 32 | 33 | 34 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 35 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 36 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 37 | # --lr=${lr} --lr-min=${lr_min} \ 38 | # -d --gamma=${gamma} \ 39 | # --log-file $log_file \ 40 | # -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" 
& 43 | # sleep 40 44 | #done 45 | 46 | # test different lr: 47 | #lr_min='0.002' 48 | #gamma='0.003' 49 | #nb_neg=50 50 | #nb_vocab=30000 51 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 52 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 53 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 54 | # --lr=${lr} --lr-min=${lr_min} \ 55 | # -d --gamma=${gamma} \ 56 | # --log-file $log_file \ 57 | # -D $data_file -V $nb_vocab " 58 | # command_line=`echo "$command_line_" | tr -s " "` 59 | # ${command_prefix} nohup sh -c "$command_line" & 60 | # sleep 40 61 | #done 62 | 63 | # 64 | #lr='0.01' 65 | #for gamma in 0.001 0.002 0.004; do 66 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 67 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 68 | # --lr=${lr} --lr-min=${lr_min} \ 69 | # -d --gamma=${gamma} \ 70 | # --log-file $log_file \ 71 | # -D $data_file -V $nb_vocab " 72 | # command_line=`echo "$command_line_" | tr -s " "` 73 | # ${command_prefix} nohup sh -c "$command_line" & 74 | # sleep 40 75 | #done 76 | -------------------------------------------------------------------------------- /real/main_lblv1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import optparse 5 | from keras.optimizers import adam, AdamAnneal 6 | from models import LBLangModelV1 7 | # noinspection PyUnresolvedReferences 8 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 512 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 24 | help="running validation words") 25 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 26 | help="running validation words") 27 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 28 | help="decaying rate") 29 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 30 | help="decaying rate") 31 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 32 | help="decay lr or not") 33 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=5, 34 | help="amount of training data (number of words)") 35 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=200, 36 | help="amount of training data (number of words)") 37 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 38 | help="amount of training data (number of words)") 39 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=900., 40 | help="decaying rate") 41 | parser.add_option("-s", "--save", type="str", dest="save", default='', 42 | help="amount of training 
data (number of words)") 43 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 44 | help="Number of vocabulary") 45 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 46 | help="binarized corpus file") 47 | options, args = parser.parse_args() 48 | 49 | nb_run_words = options.running_words 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | if options.decay: 54 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 55 | else: 56 | opt = adam(lr=options.lr) 57 | 58 | if options.log_file == '': 59 | log_file = None 60 | else: 61 | log_file = options.log_file 62 | 63 | if options.save == '': 64 | save_path = None 65 | else: 66 | save_path = options.save 67 | 68 | model = LBLangModelV1(vocab_size=options.nb_vocab, 69 | context_size=options.context_size, 70 | embed_dims=options.embed_size) 71 | model.compile(opt) 72 | model.train(data_file=options.corpus, 73 | save_path=save_path, 74 | batch_size=BATCH_SIZE, 75 | train_nb_words=nb_run_words, 76 | val_nb_words=nb_evaluate, 77 | train_val_nb=nb_run_val, 78 | validation_interval=options.interval, 79 | log_file=log_file) -------------------------------------------------------------------------------- /real/main_lblv1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv1.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=5 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.02' 19 | lr_min='0.002' 20 | gamma='0.03' 21 | for ((nb_vocab=10000; nb_vocab<32000; nb_vocab+=2000)); do 22 | log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 23 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} --lr-min=${lr_min} \ 25 | -d --gamma=${gamma} \ 26 | --log-file $log_file \ 27 | -D $data_file -V $nb_vocab " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 40 31 | done 32 | 33 | 34 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 35 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 36 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 37 | # --lr=${lr} --lr-min=${lr_min} \ 38 | # -d --gamma=${gamma} \ 39 | # --log-file $log_file \ 40 | # -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" & 43 | # sleep 40 44 | #done 45 | 46 | # test different lr: 47 | #lr_min='0.002' 48 | #gamma='0.003' 49 | #nb_neg=50 50 | #nb_vocab=30000 51 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 52 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 53 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 54 | # --lr=${lr} --lr-min=${lr_min} \ 55 | # -d --gamma=${gamma} \ 56 | # --log-file $log_file \ 57 | # -D $data_file -V $nb_vocab " 58 | # command_line=`echo "$command_line_" | tr -s " "` 59 | # ${command_prefix} nohup sh -c "$command_line" & 60 | # sleep 40 61 | 
#done 62 | 63 | # 64 | #lr='0.01' 65 | #for gamma in 0.001 0.002 0.004; do 66 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 67 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 68 | # --lr=${lr} --lr-min=${lr_min} \ 69 | # -d --gamma=${gamma} \ 70 | # --log-file $log_file \ 71 | # -D $data_file -V $nb_vocab " 72 | # command_line=`echo "$command_line_" | tr -s " "` 73 | # ${command_prefix} nohup sh -c "$command_line" & 74 | # sleep 40 75 | #done 76 | -------------------------------------------------------------------------------- /real/main_lblv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import LBLangModelV2, logger 9 | import cPickle as pickle 10 | import numpy as np 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 512 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=5, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=200, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=900., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of 
vocabulary") 55 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 56 | help="binarized corpus file") 57 | parser.add_option("-w", "--nb-workers", type="int", dest="nb_workers", default=3, 58 | help="number of data workers") 59 | options, args = parser.parse_args() 60 | 61 | nb_run_words = options.running_words 62 | nb_run_val = options.val_run 63 | nb_evaluate = options.nb_evaluation 64 | embedding_file = options.embedding_file 65 | 66 | with file(options.coding_file, 'rb') as f: 67 | sparse_coding = pickle.load(f) 68 | # print sparse_coding.dtype 69 | 70 | nb_vocab = options.nb_vocab 71 | sparse_coding = sparse_coding[nb_vocab//1000] 72 | nb_vocab, nb_base = sparse_coding.shape 73 | nb_base -= 1 74 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 75 | 76 | if embedding_file != '': 77 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 78 | wp = pickle.load(f) 79 | freq = wp['idx2wc'] 80 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 81 | embed = compose_dense_repr(nb_base, nb_vocab, freq, embedding_file) 82 | embed = np.vstack([embed, np.zeros((options.context_size, options.embed_size))]) 83 | ini_embeds = [embed] 84 | else: 85 | ini_embeds = None 86 | 87 | if options.decay: 88 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 89 | else: 90 | opt = adam(lr=options.lr) 91 | 92 | if options.log_file == '': 93 | log_file = None 94 | else: 95 | log_file = options.log_file 96 | 97 | if options.save == '': 98 | save_path = None 99 | else: 100 | save_path = options.save 101 | 102 | model = LBLangModelV2(sparse_coding=sparse_coding, 103 | context_size=options.context_size, 104 | nb_negative=options.negative, 105 | embed_dims=options.embed_size, 106 | init_embeddings=ini_embeds, 107 | negprob_table=unigram_table, 108 | optimizer=opt) 109 | model.compile() 110 | model.train(data_file=options.corpus, 111 | save_path=save_path, 112 | batch_size=BATCH_SIZE, 113 | train_nb_words=nb_run_words, 114 | val_nb_words=nb_evaluate, 115 | train_val_nb=nb_run_val, 116 | validation_interval=options.interval, 117 | log_file=log_file, 118 | nb_data_workers=options.nb_workers) -------------------------------------------------------------------------------- /real/main_lblv2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv2.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=5 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.004' 21 | lr_min='0.002' 22 | gamma='0.03' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | 
command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | sleep 40 34 | done 35 | 36 | 37 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 38 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 39 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 40 | # --lr=${lr} --lr-min=${lr_min} \ 41 | # -d --gamma=${gamma} -N ${nb_neg} \ 42 | # -S $coding_file -e $embed_file --log-file $log_file \ 43 | # -D $data_file -V $nb_vocab " 44 | # command_line=`echo "$command_line_" | tr -s " "` 45 | # ${command_prefix} nohup sh -c "$command_line" & 46 | # sleep 40 47 | #done 48 | 49 | # test different lr: 50 | #lr_min='0.002' 51 | #gamma='0.003' 52 | #nb_neg=50 53 | #nb_vocab=30000 54 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 55 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 56 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 57 | # --lr=${lr} --lr-min=${lr_min} \ 58 | # -d --gamma=${gamma} -N ${nb_neg} \ 59 | # -S $coding_file -e $embed_file --log-file $log_file \ 60 | # -D $data_file -V $nb_vocab " 61 | # command_line=`echo "$command_line_" | tr -s " "` 62 | # ${command_prefix} nohup sh -c "$command_line" & 63 | # sleep 40 64 | #done 65 | 66 | # 67 | #lr='0.01' 68 | #for gamma in 0.001 0.002 0.004; do 69 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 70 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 71 | # --lr=${lr} --lr-min=${lr_min} \ 72 | # -d --gamma=${gamma} -N ${nb_neg} \ 73 | # -S $coding_file -e $embed_file --log-file $log_file \ 74 | # -D $data_file -V $nb_vocab " 75 | # command_line=`echo "$command_line_" | tr -s " "` 76 | # ${command_prefix} nohup sh -c "$command_line" & 77 | # sleep 40 78 | #done 79 | -------------------------------------------------------------------------------- /real/main_nce2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal, adam 8 | import optparse 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | 
help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 47 | help="binarized corpus file") 48 | options, args = parser.parse_args() 49 | 50 | nb_run_words = options.running_words 51 | nb_vocab = options.vocab_size 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 56 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 57 | nb_vocab) 58 | if options.decay: 59 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 60 | else: 61 | opt = adam(lr=options.lr) 62 | 63 | if options.log_file == '': 64 | log_file = None 65 | else: 66 | log_file = options.log_file 67 | 68 | if options.save == '': 69 | save_path = None 70 | else: 71 | save_path = options.save 72 | 73 | model = NCELangModelV2(vocab_size=nb_vocab, 74 | nb_negative=options.negative, 75 | embed_dims=options.embed_size, 76 | context_dims=options.context_size, 77 | negprob_table=unigram_table, 78 | optimizer=opt) 79 | model.compile() 80 | model.train(data_file=options.corpus, 81 | save_path=save_path, 82 | batch_size=BATCH_SIZE, 83 | train_nb_words=nb_run_words, 84 | val_nb_words=nb_evaluate, 85 | train_val_nb=nb_run_val, 86 | validation_interval=options.interval, 87 | log_file=log_file) 88 | 89 | -------------------------------------------------------------------------------- /real/main_nce2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../../lm:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce2.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=200 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.01' 19 | lr_min='0.002' 20 | gamma='0.003' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 26 | --log-file $log_file -D $data_file -V $nb_vocab " 27 | command_line=`echo "$command_line_" | tr -s " "` 28 | ${command_prefix} nohup sh -c "$command_line" & 29 | done 30 | 31 | ## test different lr: 32 | #lr_min='0.002' 33 | #gamma='0.003' 34 
| #nb_neg=50 35 | #nb_vocab=30000 36 | #for lr in 0.04 0.03 0.02 0.01 0.008 0.006; do 37 | # log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 38 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 39 | # --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 40 | # --log-file $log_file -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" & 43 | #done 44 | ## 45 | #lr='0.01' 46 | #for gamma in 0.001 0.002 0.004; do 47 | # log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 48 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 49 | # --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 50 | # --log-file $log_file -D $data_file -V $nb_vocab " 51 | # command_line=`echo "$command_line_" | tr -s " "` 52 | # ${command_prefix} nohup sh -c "$command_line" & 53 | #done 54 | -------------------------------------------------------------------------------- /real/main_nce4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV4, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of 
words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV4(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.01' 21 | lr_min='0.002' 22 | gamma='0.003' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E 
${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | done 34 | 35 | ## test different lr: 36 | #lr_min='0.002' 37 | #gamma='0.003' 38 | #nb_neg=50 39 | #nb_vocab=30000 40 | #for lr in 0.04 0.03 0.02 0.01 0.008 0.006; do 41 | # log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 42 | # $command_prefix $python_command -C ${context_size} -E ${embed_size} \ 43 | # --lr=${lr} --lr-min=${lr_min} \ 44 | # -d --gamma=${gamma} -N ${nb_neg} \ 45 | # -S $coding_file -e $embed_file --log-file $log_file \ 46 | # -D $data_file -V $nb_vocab " 47 | #done 48 | # 49 | #lr='0.01' 50 | #for gamma in 0.001 0.002 0.004; do 51 | # log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-N${nb_neg}.log" 52 | # $command_prefix $python_command -C ${context_size} -E ${embed_size} \ 53 | # --lr=${lr} --lr-min=${lr_min} \ 54 | # -d --gamma=${gamma} -N ${nb_neg} \ 55 | # -S $coding_file -e $embed_file --log-file $log_file \ 56 | # -D $data_file -V $nb_vocab \ 57 | # $command_postfix 58 | #done 59 | -------------------------------------------------------------------------------- /real/main_nce4_lab_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=32000; nb_vocab<36000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce4_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=16000; nb_vocab<30000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | 
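# Assemble the training command: -S passes the sparse-coding pickle, -e the pretrained
# embeddings used to initialize the model, and the fixed lr of 0.002 is used as-is
# (no -d flag, so no learning-rate annealing).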
command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku1_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=40000; nb_vocab<44000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce4_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=30000; nb_vocab<44000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | 
data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=44000; nb_vocab<52000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku3_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=36000; nb_vocab<40000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce7.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV7, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", 
dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV7(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce7.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" 
]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.006' 21 | lr_min='0.002' 22 | gamma='0.003' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | sleep 120 34 | done 35 | -------------------------------------------------------------------------------- /real/main_nce7_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=16000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce7_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=30000; nb_vocab<44000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N 
${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce7_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=44000; nb_vocab<52000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV8, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 
40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV8(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | 
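# The *-sample.bz2 file used below is presumably a subsample of the full
# wiki-sg-norm-lc-drop-bin.bz2 corpus that main_nce8.py falls back to by default.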
data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_nodecay_lab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<16000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr}\ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} sh -c "$command_line" & 31 | sleep 120 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=38000; nb_vocab<46000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 
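# Dry-run mode: prefix each launch with echo so the assembled commands are printed, not run.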
3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=16000; nb_vocab<38000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=46000; nb_vocab<52000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/run_batch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source ../environ.sh 3 | 4 | models_dir="../data/models/lang" 5 | log_dir="../logs" 6 | 7 | python run_nce0.py --lr 0.04 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.04.pkl \ 8 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.04.log 9 | python run_nce0.py --lr 0.02 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.02.pkl \ 10 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.02.log 11 | python run_nce0.py --lr 0.01 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.01.pkl \ 12 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.01.log 13 | python run_nce0.py --lr 0.005 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.005.pkl \ 14 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.005.log 15 | 16 | python run_nce0.py --lr 0.04 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.04-d.pkl \ 17 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.04-d.log -d --lr-min 0.005 
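# Continue the annealed sweep at lr 0.02 and 0.01 (same --lr-min floor of 0.005).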
18 | python run_nce0.py --lr 0.02 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.02-d.pkl \ 19 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.02-d.log -d --lr-min 0.005 20 | python run_nce0.py --lr 0.01 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.01-d.pkl \ 21 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.01-d.log -d --lr-min 0.005 -------------------------------------------------------------------------------- /real/run_nce0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModel 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | options, args = parser.parse_args() 47 | 48 | nb_run_words = options.running_words 49 | nb_vocab = options.vocab_size 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | # unigram_table = get_unigram_probtable(nb_words=nb_vocab) 54 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 55 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 56 | nb_vocab) 57 | 58 | if options.decay: 59 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 60 | else: 61 | opt = adam(lr=options.lr) 62 | 63 | if options.log_file == '': 64 | log_file = None 65 | else: 66 | log_file = options.log_file 67 | 
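# With -d, the custom AdamAnneal optimizer decays the learning rate from --lr toward --lr-min
# at rate --gamma; otherwise a fixed-rate adam is used. The NCELangModel constructed below is
# trained with noise-contrastive estimation, drawing nb_negative noise samples per target
# from the unigram table built above (presumably the empirical unigram distribution over the
# nb_vocab most frequent words), which serves as the noise distribution.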
68 | if options.save == '': 69 | save_path = None 70 | else: 71 | save_path = options.save 72 | 73 | model = NCELangModel(vocab_size=nb_vocab, nb_negative=options.negative, 74 | embed_dims=options.embed_size, context_dims=options.context_size, 75 | negprob_table=unigram_table, optimizer=opt) 76 | model.compile() 77 | model.train(data_file=DATA_PATH, 78 | save_path=save_path, 79 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 80 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 81 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce0_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 19 | 20 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 21 | negprob_table=unigram_table, optimizer='adam') 22 | model.compile() 23 | model.train(data_file=DATA_PATH, 24 | save_path=SAVE_PATH, 25 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 26 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg100_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = '../data/models/lang/nce0-neg100-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 19 | 20 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=100, embed_dims=128, context_dims=128, 21 | negprob_table=unigram_table, optimizer='adam') 22 | model.compile() 23 | # model.train(data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 24 | # save_path='../data/models/lang/nce-neg50-e128-c128.pkl', 25 | # batch_size=256, train_nb_words=NB_RUN_WORDS//100, 26 | # val_nb_words=NB_EVALUATE//10, train_val_nb=NB_RUN_VAL//5, validation_interval=40) 27 | model.train(data_file=DATA_PATH, save_path=SAVE_PATH, 28 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 29 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.005.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import adam 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = 
'../data/models/lang/nce0-neg50-e128-c128-lr0.005.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = adam(lr=0.005) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.01.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import adam 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = adam(lr=0.01) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import AdamAnneal 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01-gamma0.001.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce1_neg50_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV1 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = 
'../data/models/lang/nce1-neg50-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | # NB_RUN_WORDS = 5000000 19 | # NB_VOCAB = 10000 20 | # NB_RUN_VAL = 100000 21 | # NB_EVALUATE = 500000 22 | 23 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 24 | 25 | model = NCELangModelV1(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 26 | negprob_table=unigram_table, optimizer='adam') 27 | model.compile() 28 | # model.train(data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 29 | # save_path='../data/models/lang/nce-neg50-e128-c128.pkl', 30 | # batch_size=256, train_nb_words=NB_RUN_WORDS//100, 31 | # val_nb_words=NB_EVALUATE//10, train_val_nb=NB_RUN_VAL//5, validation_interval=40) 32 | model.train(data_file=DATA_PATH, save_path=SAVE_PATH, 33 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 34 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal, adam 8 | import optparse 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | options, args = 
parser.parse_args() 47 | 48 | nb_run_words = options.running_words 49 | nb_vocab = options.vocab_size 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | # unigram_table = get_unigram_probtable(nb_words=nb_vocab) 54 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 55 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 56 | nb_vocab) 57 | if options.decay: 58 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 59 | else: 60 | opt = adam(lr=options.lr) 61 | 62 | if options.log_file == '': 63 | log_file = None 64 | else: 65 | log_file = options.log_file 66 | 67 | if options.save == '': 68 | save_path = None 69 | else: 70 | save_path = options.save 71 | 72 | model = NCELangModelV2(vocab_size=nb_vocab, nb_negative=options.negative, 73 | embed_dims=options.embed_size, context_dims=options.context_size, 74 | negprob_table=unigram_table, optimizer=opt) 75 | model.compile() 76 | model.train(data_file=DATA_PATH, 77 | save_path=save_path, 78 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 79 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 80 | validation_interval=options.interval, log_file=log_file) 81 | 82 | -------------------------------------------------------------------------------- /real/run_nce2_neg50_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce2-neg50-e128-c128-lr0.01-gamma0.001.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 22 | model = NCELangModelV2(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV3 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", 
"--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 44 | help="amount of training data (number of words)") 45 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 46 | help="decaying rate") 47 | parser.add_option("-s", "--save", type="str", dest="save", default='', 48 | help="amount of training data (number of words)") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | # print ini_embeds.dtype 67 | # print ini_embeds.shape 68 | # import sys 69 | # sys.exit(0) 70 | else: 71 | ini_embeds = None 72 | 73 | if options.decay: 74 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 75 | else: 76 | opt = adam(lr=options.lr) 77 | 78 | if options.log_file == '': 79 | log_file = None 80 | else: 81 | log_file = options.log_file 82 | 83 | if options.save == '': 84 | save_path = None 85 | else: 86 | save_path = options.save 87 | 88 | model = NCELangModelV3(sparse_coding=sparse_coding, nb_negative=options.negative, 89 | embed_dims=options.embed_size, context_dims=options.context_size, 90 | init_embeddings=[ini_embeds], negprob_table=unigram_table, optimizer=opt) 91 | model.compile() 92 | model.train(data_file=DATA_PATH, 93 | save_path=save_path, 94 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 95 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 96 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV4 9 | import cPickle as 
pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 44 | help="amount of training data (number of words)") 45 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 46 | help="decaying rate") 47 | parser.add_option("-s", "--save", type="str", dest="save", default='', 48 | help="amount of training data (number of words)") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | # print ini_embeds.dtype 67 | # print ini_embeds.shape 68 | # import sys 69 | # sys.exit(0) 70 | else: 71 | ini_embeds = None 72 | 73 | if options.decay: 74 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 75 | else: 76 | opt = adam(lr=options.lr) 77 | 78 | if options.log_file == '': 79 | log_file = None 80 | else: 81 | log_file = options.log_file 82 | 83 | if options.save == '': 84 | save_path = None 85 | else: 86 | save_path = options.save 87 | 88 | model = NCELangModelV4(sparse_coding=sparse_coding, nb_negative=options.negative, 89 | embed_dims=options.embed_size, context_dims=options.context_size, 90 | init_embeddings=[ini_embeds], negprob_table=unigram_table, optimizer=opt) 91 | model.compile() 92 | 
model.train(data_file=DATA_PATH, 93 | save_path=save_path, 94 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 95 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 96 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV5 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 42 | help="amount of training data (number of words)") 43 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 44 | help="decaying rate") 45 | parser.add_option("-s", "--save", type="str", dest="save", default='', 46 | help="amount of training data (number of words)") 47 | options, args = parser.parse_args() 48 | 49 | nb_run_words = options.running_words 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | 54 | with file(options.coding_file, 'rb') as f: 55 | sparse_coding = pickle.load(f) 56 | # print sparse_coding.dtype 57 | 58 | nb_vocab = sparse_coding.shape[0] 59 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 60 | 61 | if options.embedding_file != '': 62 | with file(options.embedding_file, 'rb') as f: 63 | ini_embeds = pickle.load(f) 64 | # print ini_embeds.dtype 65 | # print ini_embeds.shape 66 | # import sys 67 | # sys.exit(0) 68 | else: 69 | ini_embeds = None 70 | 71 | if options.decay: 72 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, 
gamma=options.gamma) 73 | else: 74 | opt = adam(lr=options.lr) 75 | 76 | if options.log_file == '': 77 | log_file = None 78 | else: 79 | log_file = options.log_file 80 | 81 | if options.save == '': 82 | save_path = None 83 | else: 84 | save_path = options.save 85 | 86 | model = NCELangModelV5(sparse_coding=sparse_coding, nb_negative=options.negative, 87 | embed_dims=options.embed_size, init_embeddings=[ini_embeds], 88 | negprob_table=unigram_table, optimizer=opt) 89 | model.compile() 90 | model.train(data_file=DATA_PATH, 91 | save_path=save_path, 92 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 93 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 94 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV6 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 42 | help="amount of training data (number of words)") 43 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 44 | help="decaying rate") 45 | parser.add_option("-s", "--save", type="str", dest="save", default='', 46 | help="amount of training data (number of words)") 47 | parser.add_option("-p", "--init", type="str", dest="init", default='first', 48 | help="init scheme") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with 
file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | 67 | if options.init == 'first': 68 | init_e = [ini_embeds] 69 | else: 70 | init_e = [ini_embeds] * 4 71 | # print ini_embeds.dtype 72 | # print ini_embeds.shape 73 | # import sys 74 | # sys.exit(0) 75 | else: 76 | init_e = None 77 | 78 | if options.decay: 79 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 80 | else: 81 | opt = adam(lr=options.lr) 82 | 83 | if options.log_file == '': 84 | log_file = None 85 | else: 86 | log_file = options.log_file 87 | 88 | if options.save == '': 89 | save_path = None 90 | else: 91 | save_path = options.save 92 | 93 | model = NCELangModelV6(sparse_coding=sparse_coding, nb_negative=options.negative, 94 | embed_dims=options.embed_size, init_embeddings=init_e, 95 | negprob_table=unigram_table, optimizer=opt) 96 | model.compile() 97 | model.train(data_file=DATA_PATH, 98 | save_path=save_path, 99 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 100 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 101 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_tree_huffman_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import TreeLangModel 6 | from keras.optimizers import AdamAnneal 7 | import cPickle as pickle 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | 14 | # NB_RUN_WORDS = 1000000 15 | # NB_VOCAB = 10000 16 | # NB_RUN_VAL = 10000 17 | # NB_EVALUATE = 50000 18 | SAVE_PATH = '../data/models/lang/huffman-e128-c128-lr0.01-gamma0.001.pkl' 19 | 20 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 21 | BATCH_SIZE = 256 22 | VAL_INTER = 1200 23 | 24 | with file('../data/wiki-huffman-tree-info-Vsize10000.pkl', 'rb') as f: 25 | tree_info = pickle.load(f) 26 | 27 | wrd2cls = tree_info['idx2cls'] 28 | wrd2bitstr = tree_info['idx2bitstr'] 29 | 30 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 31 | model = TreeLangModel(vocab_size=NB_VOCAB, embed_dim=128, cntx_dim=128, 32 | word2class=wrd2cls, word2bitstr=wrd2bitstr, optimizer=opt) 33 | model.compile() 34 | model.train(data_file=DATA_PATH, 35 | save_path=SAVE_PATH, 36 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 37 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from .utils import floatX, categorical_crossentropy, objective_fnc, chunk_sentences,\ 6 | slice_X, get_unigram_probtable, TableSampler, load_huffman_tree, save_tree, create_tree,\ 7 | LangModelLogger, LangHistory, epsilon 8 | from .preprocess import data4sri -------------------------------------------------------------------------------- /real/utils/check_maps.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | 6 | def check_maps(map1, map2): 7 | for w1, w2 in zip(map1['idx2word'], map2['idx2word']): 8 | if w1 != w2: 9 | raise Exception('idx2word: map not equal') 10 | 11 | for i, m in enumerate([map1, map2]): 12 | for idx, w in enumerate(m['idx2word']): 13 | if idx != m['word2idx'][w]: 14 | raise Exception('map%d not consistent' % i) 15 | 16 | 17 | if __name__ == '__main__': 18 | import cPickle as pickle 19 | wp_file = '../../data/wiki-wordmap-trunc300k.wp' 20 | embeds_file = '/home/cyc/Data/models/embeddings/rw2vec_embeddings-size200.pkl' 21 | 22 | with file(wp_file, 'rb') as f: 23 | wp = pickle.load(f) 24 | 25 | with file(embeds_file, 'rb') as f: 26 | em = pickle.load(f) 27 | 28 | check_maps(wp, em) 29 | 30 | -------------------------------------------------------------------------------- /real/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bz2 import BZ2File 5 | import unittest 6 | import os 7 | import numpy as np 8 | import cPickle as pickle 9 | import logging 10 | import re 11 | from utils import chunk_sentences 12 | 13 | __author__ = 'Yunchuan Chen' 14 | MAX_SETN_LEN = 65 15 | DATA_ROOT = '../../data/' 16 | 17 | 18 | class ReadFileTest(unittest.TestCase): 19 | def test_prprcs_wrt(self): 20 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2'): 21 | return 22 | with BZ2File(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2') as f: 23 | f.readline() 24 | line = f.readline() 25 | self.failUnless('it was shortlisted for the booker prize and won several other awards .'.strip() == line.strip(), 26 | 'read line: %s not as expected.\n' % line) 27 | 28 | def test_ixport(self): 29 | wpx, flag = export_wordmap() 30 | wpi = import_wordmap() 31 | 32 | self.failUnless(flag is True, 'Failure flag received from export map') 33 | if wpx is not None: 34 | self.failUnless('word2idx' in wpx, 'word2idx key lost for the wordmap.') 35 | self.failUnless('idx2word' in wpx, 'idx2word key lost for the wordmap.') 36 | self.failUnless('idx2wc' in wpx, 'idx2wc key lost for the wordmap.') 37 | 38 | self.failUnless('word2idx' in wpi, 'word2idx key lost for the wordmap.') 39 | self.failUnless('idx2word' in wpi, 'idx2word key lost for the wordmap.') 40 | self.failUnless('idx2wc' in wpi, 'idx2wc key lost for the wordmap.') 41 | 42 | 43 | def smart_open(fname, mode='rb', buffering=5*2**20): 44 | _, ext = os.path.splitext(fname) 45 | if ext == '.bz2': 46 | from bz2 import BZ2File 47 | return BZ2File(fname, mode, buffering) 48 | # if ext == '.gz': 49 | # from gzip import GzipFile 50 | # return GzipFile(fname, mode, buffering) 51 | return open(fname, mode, buffering) 52 | 53 | 54 | def export_wordmap(dist_file=DATA_ROOT+'wiki-wordmap.wp', 55 | corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc.txt', rebuild=False): 56 | """ 57 | :param dist_file: file name to store the wordmap 58 | :param corpus_file: corpus source to build wordmap against 59 | :param rebuild: whether rebuild wordmap if it already exists. 60 | :return: exported model and a flag. 
61 | """ 62 | if os.path.exists(dist_file) and not rebuild: 63 | return None, True 64 | word2cnt = dict() 65 | with smart_open(corpus_file, buffering=5*2**20) as f: 66 | for sent in f: 67 | words = sent.split() 68 | for w in words: 69 | try: 70 | word2cnt[w] += 1 71 | except KeyError: 72 | word2cnt[w] = 1 73 | kv = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True) 74 | idx2word = [w for w, _ in kv] 75 | idx2wc = [c for _, c in kv] 76 | word2idx = dict((w, idx) for idx, (w, _) in enumerate(kv)) 77 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 78 | with file(dist_file, 'wb') as f: 79 | pickle.dump(model, f, -1) 80 | return model, True 81 | 82 | 83 | def import_wordmap(fname=DATA_ROOT+'wiki-wordmap.wp'): 84 | """ 85 | :param fname: a string indicate where the wordmap stores. 86 | :return: wordmap 87 | """ 88 | with file(fname, 'rb') as f: 89 | wp = pickle.load(f) 90 | return wp 91 | 92 | 93 | def preprocess_corpus(corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc.txt', 94 | dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2'): 95 | """ 96 | :param corpus_file: original corpus file name 97 | :type corpus_file: str 98 | :param dist_file: location to store the preprocessed corpus. 99 | :type dist_file: str 100 | :return: None 101 | Drop all sentences with length not in [3, 64]. 102 | """ 103 | corpus_file = file(corpus_file) 104 | dist_file = smart_open(dist_file, mode='w') 105 | 106 | assert corpus_file is not None and dist_file is not None 107 | for line in corpus_file: 108 | words = line.split() 109 | if not (3 <= len(words) <= 64): 110 | continue 111 | dist_file.write(line) 112 | 113 | corpus_file.close() 114 | dist_file.close() 115 | 116 | 117 | def binarize_corpus(group_size=20000, corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2', 118 | dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', 119 | max_len=64, wordmap=DATA_ROOT+'wiki-wordmap.wp'): 120 | """ 121 | :param group_size: group size. We repeatedly read group size of sentences and 122 | convert and store them into binary format as a batch. 123 | :type group_size: int 124 | :param corpus_file: the corpus to be converted 125 | :type corpus_file: str 126 | :param dist_file: the file to store the converted corpus 127 | :param max_len: maximum length of sentence. Sentences exceeds this length will be dropped. 128 | :param wordmap: wordmap. 129 | :return: None 130 | """ 131 | def _index_sentence(sent): 132 | """ 133 | :param sent: a sentence as a string 134 | :type sent: str 135 | :return: a list of word index 136 | Represents a sentence using word indexes. 
137 | """ 138 | words = sent.split() 139 | return [word2idx[w] for w in words] 140 | 141 | def _commit_result(): 142 | for idx_sent in result[3:]: 143 | if len(idx_sent) > 0: 144 | sents = np.array(idx_sent, dtype=np.int32) 145 | shape = np.array(sents.shape, dtype=np.int32) 146 | dist_file.write(shape.tobytes()) 147 | dist_file.write(sents.tobytes()) 148 | 149 | for j in range(len(result)): 150 | result[j] = [] 151 | 152 | dist_file = smart_open(dist_file, 'wb') 153 | assert dist_file is not None 154 | if isinstance(wordmap, str): 155 | wp = import_wordmap(fname=wordmap) 156 | elif isinstance(wordmap, dict): 157 | wp = wordmap 158 | else: 159 | logging.error('can not recognize wordmap type') 160 | raise TypeError('wordamp must be dict or str') 161 | word2idx = wp['word2idx'] 162 | result = [[] for _ in range(max_len + 1)] 163 | with smart_open(corpus_file) as f: 164 | for i, sent in enumerate(f, start=1): 165 | idxs = _index_sentence(sent) 166 | try: 167 | result[len(idxs)].append(idxs) 168 | if i % group_size == 0: 169 | _commit_result() 170 | except IndexError: 171 | continue 172 | _commit_result() 173 | 174 | dist_file.close() 175 | 176 | 177 | def grouped_sentences(binary_corpus=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2'): 178 | with smart_open(binary_corpus) as f: 179 | while True: 180 | shape_data = f.read(2*4) 181 | if shape_data == "": 182 | return 183 | shape = np.frombuffer(shape_data, dtype=np.uint32) 184 | siz = shape[0] * shape[1] * 4 185 | sents = np.frombuffer(f.read(siz), dtype=np.uint32) 186 | # noinspection PyTypeChecker 187 | sents_ = np.reshape(sents, shape) 188 | yield sents_.copy().astype('int32') 189 | 190 | 191 | def show_grouped_sentences(group_sents, wordmap=DATA_ROOT+'wiki-wordmap.wp'): 192 | """ 193 | :param group_sents: a matrix represents a set of sentences' indexes 194 | :type group_sents: numpy.ndarray 195 | :param wordmap: word_ to index_ map and vise versa 196 | :return: list, a list of string representation of the sentences. 
197 | """ 198 | if isinstance(wordmap, str): 199 | # import logging 200 | logger = logging.getLogger('Preprocess') 201 | logger.warn('It would be inefficient if repeatedly call this function with wordmap name') 202 | wordmap = import_wordmap(fname=wordmap) 203 | idx2word = wordmap['idx2word'] 204 | elif isinstance(wordmap, dict): 205 | idx2word = wordmap['idx2word'] 206 | else: 207 | raise TypeError('wordmap must be a string representing the map location or ' 208 | 'a dictionary containing the map') 209 | ret = [None] * group_sents.shape[0] 210 | for i in range(len(ret)): 211 | ret[i] = [idx2word[j] for j in group_sents[i]] 212 | 213 | return ret 214 | 215 | 216 | def get_fake_data_meta(fname=DATA_ROOT+'fake', trn_regex=re.compile(r'\d{3}.bz2')): 217 | data_path = os.path.abspath(fname) 218 | meta_file = os.path.join(data_path, 'meta.pkl') 219 | if not os.path.isfile(meta_file): 220 | train_files_ = [os.path.join(data_path, f) for f in os.listdir(data_path) if trn_regex.match(f)] 221 | train_files = [f for f in train_files_ if os.path.isfile(f)] 222 | nb_total = 0 223 | nb_bin = np.zeros((15,), dtype='int32') 224 | 225 | for f in train_files: 226 | X = np.loadtxt(f, dtype='int32') 227 | nb_bin += np.bincount(X.ravel(), minlength=15) 228 | nb_total += np.prod(X.shape) 229 | 230 | rel_freq = nb_bin.astype('float32')/nb_total 231 | ret = {'freq': nb_bin, 'rel_freq': rel_freq, 'nb_total': nb_total} 232 | with file(meta_file, 'wb') as mf: 233 | pickle.dump(ret, mf) 234 | else: 235 | with file(meta_file, 'rb') as mf: 236 | ret = pickle.load(mf) 237 | 238 | return ret 239 | 240 | 241 | def truncate_wordmap(wp, max_size=300000, dist=DATA_ROOT+'wiki-wordmap-trunc300k.wp'): 242 | idx2word = wp['idx2word'][:max_size] 243 | idx2wc = wp['idx2wc'][:max_size] 244 | 245 | word2idx = dict((w, idx) for idx, w in enumerate(idx2word)) 246 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 247 | with file(dist, 'wb') as f: 248 | pickle.dump(model, f, -1) 249 | return model 250 | 251 | 252 | def get_val_data(data_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', val_nb_words=100000, max_vocab=10000): 253 | """ 254 | :param data_file: 255 | :type data_file: basestring | str | unicode | __generator 256 | :param val_nb_words: 257 | :param max_vocab: 258 | :return: 259 | """ 260 | if isinstance(data_file, basestring): 261 | sent_gen = grouped_sentences(data_file) 262 | else: 263 | sent_gen = data_file 264 | 265 | val_sents = [None for _ in range(MAX_SETN_LEN)] 266 | val_nb = 0 267 | for sents in sent_gen: 268 | val_nb += sents.size 269 | chunk_sentences(val_sents, sents, 1000000, no_return=True) 270 | if val_nb >= val_nb_words: 271 | break 272 | val_sents_ = [None for _ in range(MAX_SETN_LEN)] 273 | for idx in range(MAX_SETN_LEN): 274 | if val_sents[idx]: 275 | val_sents_[idx] = np.vstack(val_sents[idx]['sents']) 276 | 277 | val_sents = [sents for sents in val_sents_ if sents is not None] 278 | for sents in val_sents: 279 | mask = (sents > max_vocab) 280 | sents[mask] = max_vocab 281 | 282 | return val_sents 283 | 284 | 285 | def data4sri(src_corpus=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 286 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=10000): 287 | def bin2txt(sents, dist_file): 288 | for i in xrange(sents.shape[0]): 289 | words = [str(idx) for idx in sents[i]] 290 | sent = ' '.join(words) 291 | dist_file.writelines([sent, '\n']) 292 | 293 | sent_gen = grouped_sentences(src_corpus) 294 | val_sents = 
get_val_data(sent_gen, val_nb_words, max_vocab) 295 | get_val_data(sent_gen, train_val_nb) 296 | 297 | if train_nb_words >= 1000000: 298 | trn_name = 'wiki-trn-R%dm-V%dk.txt' % (train_nb_words // 1000000, max_vocab//1000) 299 | elif train_nb_words >= 1000: 300 | trn_name = 'wiki-trn-R%dk-V%dk.txt' % (train_nb_words // 1000, max_vocab//1000) 301 | else: 302 | trn_name = 'wiki-trn-R%d-V%dk.txt' % (train_nb_words, max_vocab//1000) 303 | 304 | if val_nb_words >= 1000000: 305 | val_name = 'wiki-val-R%dm-V%dk.txt' % (val_nb_words // 1000000, max_vocab//1000) 306 | elif val_nb_words >= 1000: 307 | val_name = 'wiki-val-R%dk-V%dk.txt' % (val_nb_words // 1000, max_vocab//1000) 308 | else: 309 | val_name = 'wiki-val-R%d-V%dk.txt' % (val_nb_words, max_vocab//1000) 310 | 311 | val_file = file(os.path.join(save_path, val_name), 'w') 312 | for sents in val_sents: 313 | bin2txt(sents, val_file) 314 | 315 | trn_file = file(os.path.join(save_path, trn_name), 'w') 316 | nb_exported = 0 317 | for sents in sent_gen: 318 | mask = (sents > max_vocab) 319 | sents[mask] = max_vocab 320 | bin2txt(sents, trn_file) 321 | nb_exported += sents.size 322 | if nb_exported >= train_nb_words: 323 | break 324 | 325 | val_file.close() 326 | trn_file.close() 327 | 328 | if __name__ == '__main__': 329 | logging.basicConfig(level=logging.INFO) 330 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0.bz2'): 331 | export_wordmap() 332 | preprocess_corpus(dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0.bz2') 333 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0-bin.bz2'): 334 | binarize_corpus(dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0-bin.bz2') 335 | 336 | # unittest.main() -------------------------------------------------------------------------------- /real/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | import math 6 | import os 7 | import cPickle as pickle 8 | from scipy.stats import rv_discrete 9 | from keras.callbacks import History, BaseLogger 10 | from keras.utils.generic_utils import Progbar 11 | import theano 12 | import theano.tensor as T 13 | import numpy as np 14 | import Queue 15 | import re 16 | 17 | floatX = theano.config.floatX 18 | epsilon = 1.0e-9 19 | # if floatX == 'float64': 20 | # epsilon = 1.0e-9 21 | # else: 22 | # epsilon = 1.0e-7 23 | 24 | 25 | def categorical_crossentropy2d(y_true, y_pred): 26 | """ 27 | :param y_true: true index labels with shape (ns, nt) 28 | :param y_pred: predicted probabilities with shape (ns, nt, V) 29 | :return: cce 30 | """ 31 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 32 | # scale preds so that the class probas of each sample sum to 1 33 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 34 | 35 | ns = y_true.shape[0] 36 | nt = y_true.shape[1] 37 | sample_idx = T.reshape(T.arange(ns), (ns, 1)) 38 | time_idx = T.reshape(T.arange(nt), (1, nt)) 39 | probs_ = y_pred[sample_idx, time_idx, y_true] 40 | return -T.log(probs_) 41 | 42 | 43 | def categorical_crossentropy1d(y_true, y_pred): 44 | """ 45 | :param y_true: true index labels with shape (n, ) 46 | :param y_pred: predicted probabilities with shape (n, V) 47 | :return: cce 48 | """ 49 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 50 | # scale preds so that the class probas of each sample sum to 1 51 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 52 | 53 | n = y_true.shape[0] 54 | sample_idx = T.arange(n) 55 | probs_ = y_pred[sample_idx, y_true] 
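    # advanced indexing: probs_[i] = y_pred[i, y_true[i]], i.e. the predicted probability of the
    # true word for sample i; its negative log below is the per-word cross-entropy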
56 | return -T.log(probs_) 57 | 58 | 59 | def categorical_crossentropy(y_true, y_pred): 60 | if y_true.ndim == 1: 61 | return categorical_crossentropy1d(y_true, y_pred) 62 | elif y_true.ndim == 2: 63 | return categorical_crossentropy2d(y_true, y_pred) 64 | else: 65 | raise NotImplementedError('not implemented for 3D or larger dimensions') 66 | 67 | 68 | def objective_fnc(fn): 69 | def symvar(y_true, y_pred, mask=None): 70 | obj_output = fn(y_true, y_pred) 71 | if mask is None: 72 | # return obj_output.mean(dtype=theano.config.floatX) 73 | return T.sum(obj_output) / obj_output.shape[0].astype(floatX) 74 | else: 75 | # obj_output = obj_output[mask.nonzero()] 76 | # return obj_output.mean(dtype=theano.config.floatX) 77 | obj_output = T.sum(obj_output * mask) 78 | return obj_output / mask.shape[0].astype(floatX) 79 | return symvar 80 | 81 | 82 | def chunk_sentences(old_sentences, new_sentences, chunk_size, no_return=False, min_nb_ch=5): 83 | """ 84 | :param old_sentences: [{nb_sents: x, sents: [...]}, ...] 85 | :param new_sentences: 86 | :param chunk_size: 87 | :param no_return: 88 | :return: 89 | """ 90 | sent_len = new_sentences.shape[1] 91 | 92 | if old_sentences[sent_len]: 93 | nb_sents = old_sentences[sent_len]['nb_sents'] + new_sentences.shape[0] 94 | old_sentences[sent_len]['nb_sents'] = nb_sents 95 | old_sentences[sent_len]['sents'].append(new_sentences) 96 | 97 | else: 98 | nb_sents = new_sentences.shape[0] 99 | old_sentences[sent_len] = {'nb_sents': nb_sents, 100 | 'sents': [new_sentences]} 101 | 102 | if nb_sents >= chunk_size*min_nb_ch and not no_return: 103 | nb_chunks = nb_sents // chunk_size 104 | nb_ret = nb_chunks * chunk_size 105 | tmp = np.vstack(old_sentences[sent_len]['sents']) 106 | old_sentences[sent_len]['sents'] = [tmp[nb_ret:]] 107 | old_sentences[sent_len]['nb_sents'] = old_sentences[sent_len]['sents'][0].shape[0] 108 | return tmp[:nb_ret] 109 | else: 110 | return None 111 | 112 | 113 | def slice_X(X, start_, end_=None, axis=1): 114 | if end_ is None: 115 | return [x.take(start_, axis=axis) for x in X] 116 | else: 117 | ret = [] 118 | for y in X: 119 | s = [slice(None) for _ in range(y.ndim)] 120 | s[axis] = slice(start_, end_) 121 | s = tuple(s) 122 | ret.append(y[s]) 123 | return ret 124 | 125 | 126 | def get_unigram_probtable(nb_words, wordmap='../data/wiki-wordmap-trunc300k.wp', 127 | save_path='../data/wiki-unigram-prob-size10000.pkl'): 128 | if os.path.exists(save_path): 129 | with file(save_path, 'rb') as f: 130 | freq = pickle.load(f) 131 | return freq 132 | 133 | with file(wordmap, 'rb') as f: 134 | wp = pickle.load(f) 135 | 136 | idx2wc = wp['idx2wc'] 137 | idx2wc[nb_words-1] = sum(idx2wc[nb_words-1:]) 138 | nb_total = sum(idx2wc[:nb_words]) 139 | 140 | freq = np.array(idx2wc[:nb_words], dtype=floatX)/nb_total 141 | freq_reduce = freq[nb_words-1] * 2.0/3.0 142 | freq[nb_words-1] -= freq_reduce 143 | pivot = nb_words // 2 144 | nb = nb_words - pivot 145 | gain = freq_reduce / nb 146 | freq[pivot:nb_words] += gain 147 | freq = freq / freq.sum() 148 | with file(save_path, 'wb') as f: 149 | pickle.dump(freq, f, -1) 150 | 151 | return freq 152 | 153 | 154 | def prefix_generator(s, start=0, end=None): 155 | if end is None: 156 | end = len(s) + 1 157 | for idx in range(start, end): 158 | yield s[:idx] 159 | 160 | 161 | def pad_bitstr(bitstr): 162 | """ 163 | :param bitstr: 164 | :type bitstr: list 165 | :return: padded list of bits 166 | """ 167 | max_bit_len = 0 168 | for bits in bitstr: 169 | if len(bits) > max_bit_len: 170 | max_bit_len = len(bits) 171 | for 
bits in bitstr: 172 | bits.extend([0] * (max_bit_len-len(bits))) 173 | 174 | return bitstr 175 | 176 | 177 | def pad_virtual_class(clses, pad_value): 178 | max_cls_len = 0 179 | for nodes in clses: 180 | if len(nodes) > max_cls_len: 181 | max_cls_len = len(nodes) 182 | for nodes in clses: 183 | nodes.extend([pad_value] * (max_cls_len-len(nodes))) 184 | 185 | return clses 186 | 187 | 188 | class HuffmanNode(object): 189 | def __init__(self, left=None, right=None, root=None): 190 | self.left = left 191 | self.right = right 192 | self.root = root # Why? Not needed for anything. 193 | 194 | def children(self): 195 | return self.left, self.right 196 | 197 | def preorder(self, path=None, left_code=0, right_code=1, collector=None): 198 | if collector is None: 199 | collector = [] 200 | if path is None: 201 | path = [] 202 | if self.left is not None: 203 | if isinstance(self.left[1], HuffmanNode): 204 | self.left[1].preorder(path+[left_code], left_code, right_code, collector) 205 | else: 206 | # print(self.left[1], path+[left_code]) 207 | collector.append((self.left[1], self.left[0], path+[left_code])) 208 | if self.right is not None: 209 | if isinstance(self.right[1], HuffmanNode): 210 | self.right[1].preorder(path+[right_code], left_code, right_code, collector) 211 | else: 212 | # print(self.right[1], path+[right_code]) 213 | collector.append((self.right[1], self.right[0], path+[right_code])) 214 | 215 | return collector 216 | 217 | 218 | def create_tree(frequencies): 219 | p = Queue.PriorityQueue() 220 | for value in frequencies: # 1. Create a leaf node for each symbol 221 | p.put(value) # and add it to the priority queue 222 | while p.qsize() > 1: # 2. While there is more than one node 223 | l, r = p.get(), p.get() # 2a. remove two highest nodes 224 | node = HuffmanNode(l, r) # 2b. create internal node with children 225 | p.put((l[0]+r[0], node)) # 2c. add new node to queue 226 | return p.get() # 3. 
tree is complete - return root node 227 | 228 | 229 | def load_huffman_tree(prob_table): 230 | rel_freq = prob_table 231 | freq = zip(rel_freq, range(len(rel_freq))) 232 | tree = create_tree(freq)[1] 233 | x = tree.preorder(left_code=-1, right_code=1) 234 | y = sorted(x, key=lambda z: z[1], reverse=True) 235 | # bitstr = [] 236 | # for _, _, bitstr_ in y: 237 | # bitstr.append(bitstr_[:-1]) 238 | 239 | z = [(wrdidx, bits, list(prefix_generator(bits, end=len(bits)))) for wrdidx, _, bits in y] 240 | clses = set() 241 | for _, _, ele in z: 242 | for i in ele: 243 | clses.add(''.join('%+d' % j for j in i)) 244 | idx2clses = sorted(clses, key=lambda ele: len(ele)) 245 | cls2idx = dict(((cls, idx) for idx, cls in enumerate(idx2clses))) 246 | w = map(lambda x: (x[0], x[1], [cls2idx[''.join('%+d' % j for j in p)] for p in x[2]]), z) 247 | 248 | tmp1, tmp2 = [], [] 249 | for _, bits, cls_idx in w: 250 | tmp1.append(bits) 251 | tmp2.append(cls_idx) 252 | pad_bitstr(tmp1) 253 | pad_virtual_class(tmp2, pad_value=len(idx2clses)-1) 254 | assert len(freq) == len(w) 255 | idx2cls = [None] * len(freq) 256 | idx2bitstr = [None] * len(freq) 257 | for idx, bitstr_, cls_ in w: 258 | idx2cls[idx] = cls_ 259 | idx2bitstr[idx] = bitstr_ 260 | 261 | idx2cls = np.array(idx2cls, dtype='int32') 262 | idx2bitstr = np.array(idx2bitstr, dtype='int8') 263 | 264 | return idx2cls, idx2bitstr, idx2bitstr != 0 265 | 266 | 267 | def save_tree(fn, idx2cls, idx2bitstr, mask): 268 | with file(fn, 'wb') as f: 269 | pickle.dump({'idx2cls': idx2cls, 'idx2bitstr': idx2bitstr, 'mask': mask}, f) 270 | 271 | 272 | _VAL_LINE = re.compile(r'INFO:.*:Val val_loss: (\d*\.\d*) - val_ppl: (\d*\.\d)') 273 | _TRAIN_LINE = re.compile(r'INFO:NCELangModelV4:Train - time: (\d*\.\d*) - loss: (\d*\.\d*)') 274 | 275 | 276 | def convert_logs(log, val_line=_VAL_LINE, trn_line=_TRAIN_LINE): 277 | f = file(log, 'r') 278 | val_loss = [] 279 | val_ppl = [] 280 | t_trn = [] 281 | trn_loss = [] 282 | 283 | for line in f: 284 | val_mat = val_line.match(line) 285 | if val_mat is not None: 286 | loss = float(val_mat.group(1)) 287 | ppl = float(val_mat.group(2)) 288 | val_loss.append(loss) 289 | val_ppl.append(ppl) 290 | continue 291 | trn_mat = trn_line.match(line) 292 | if trn_mat is not None: 293 | t = float(trn_mat.group(1)) 294 | loss = float(trn_mat.group(2)) 295 | t_trn.append(t) 296 | trn_loss.append(loss) 297 | f.close() 298 | 299 | t_trn = np.array(t_trn) 300 | t_trn -= t_trn[0] 301 | trn_loss = np.array(trn_loss) 302 | val_loss = np.array(val_loss[:-1]) 303 | val_ppl = np.array(val_ppl[:-1]) 304 | 305 | return t_trn, trn_loss, val_loss, val_ppl 306 | 307 | 308 | class TableSampler(rv_discrete): 309 | def __init__(self, table): 310 | nk = np.arange(len(table)) 311 | super(TableSampler, self).__init__(b=len(table)-1, values=(nk, table)) 312 | 313 | def sample(self, shape, dtype='int32'): 314 | return self.rvs(size=shape).astype(dtype) 315 | 316 | 317 | class LangHistory(History): 318 | 319 | # def on_train_begin(self, logs=None): 320 | # # logs = {} if logs is None else logs 321 | # self.epoch = [] 322 | # self.history = {} 323 | # 324 | # def on_epoch_begin(self, epoch, logs=None): 325 | # self.seen = 0 326 | # self.totals = {} 327 | 328 | def on_batch_end(self, batch, logs=None): 329 | logs = {} if logs is None else logs 330 | batch_size = logs.get('size', 0) 331 | self.seen += batch_size 332 | 333 | for k, v in logs.items(): 334 | if k in ('encode_len', 'nb_words'):  # running token totals for perplexity: summed as-is 335 | try: 336 | self.totals[k] += v 337 | except KeyError: 338 | self.totals[k] = v
339 | continue 340 | 341 | try: 342 | self.totals[k] += v * batch_size 343 | except KeyError: 344 | self.totals[k] = v * batch_size 345 | 346 | def on_epoch_end(self, epoch, logs=None): 347 | if 'encode_len' in self.totals and 'nb_words' in self.totals:  # self.totals is a dict: test key membership 348 | ppl = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 349 | k = 'ppl' 350 | try: 351 | self.history[k].append(ppl) 352 | except KeyError: 353 | self.history[k] = [ppl] 354 | 355 | if 'val_encode_len' in self.totals and 'val_nb_words' in self.totals: 356 | val_ppl = math.exp(self.totals['val_encode_len']/float(self.totals['val_nb_words'])) 357 | k = 'val_ppl' 358 | try: 359 | self.history[k].append(val_ppl) 360 | except KeyError: 361 | self.history[k] = [val_ppl] 362 | 363 | k = 'loss' 364 | v = self.totals[k] 365 | try: 366 | self.history[k].append(v/float(self.seen)) 367 | except KeyError: 368 | self.history[k] = [v/float(self.seen)] 369 | 370 | 371 | class LangModelLogger(BaseLogger): 372 | def __init__(self): 373 | super(LangModelLogger, self).__init__() 374 | self.verbose = None 375 | self.nb_epoch = None 376 | self.seen = 0 377 | self.totals = {} 378 | self.progbar = None 379 | self.log_values = [] 380 | 381 | # def on_train_begin(self, logs=None): 382 | # logger.debug('Begin training...') 383 | # self.verbose = self.params['verbose'] 384 | # self.nb_epoch = self.params['nb_epoch'] 385 | # 386 | # def on_epoch_begin(self, epoch, logs=None): 387 | # # print('Epoch %d/%d' % (epoch + 1, self.nb_epoch)) 388 | # self.progbar = Progbar(target=self.params['nb_sample'], verbose=1) 389 | # self.seen = 0 390 | # self.totals = {} 391 | # 392 | # def on_batch_begin(self, batch, logs=None): 393 | # if self.seen < self.params['nb_sample']: 394 | # self.log_values = [] 395 | # self.params['metrics'] = ['loss', 'ppl', 'val_loss', 'val_ppl'] 396 | 397 | def on_batch_end(self, batch, logs=None): 398 | logs = {} if logs is None else logs 399 | batch_size = logs.get('size', 0) 400 | self.seen += batch_size 401 | 402 | for k, v in logs.items(): 403 | if k in ('encode_len', 'nb_words'):  # running token totals for perplexity: summed as-is 404 | try: 405 | self.totals[k] += v 406 | except KeyError: 407 | self.totals[k] = v 408 | continue 409 | 410 | try: 411 | self.totals[k] += v * batch_size 412 | except KeyError: 413 | self.totals[k] = v * batch_size 414 | 415 | if 'encode_len' in self.totals and 'nb_words' in self.totals and 'ppl' in self.params['metrics']: 416 | self.totals['ppl'] = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 417 | self.log_values.append(('ppl', self.totals['ppl'])) 418 | for k in self.params['metrics']: 419 | if k in logs: 420 | self.log_values.append((k, logs[k])) 421 | 422 | # skip progbar update for the last batch; will be handled by on_epoch_end 423 | if self.seen < self.params['nb_sample']: 424 | self.progbar.update(self.seen, self.log_values) 425 | 426 | def on_epoch_begin(self, epoch, logs=None): 427 | if self.verbose: 428 | self.progbar = Progbar(target=self.params['nb_sample'], 429 | verbose=self.verbose) 430 | self.seen = 0 431 | self.totals = {} 432 | 433 | def on_epoch_end(self, epoch, logs=None): 434 | logs = {} if logs is None else logs 435 | # logger.debug('log keys: %s' % str(logs.keys())) 436 | for k in self.params['metrics']: 437 | if k in self.totals: 438 | if k != 'ppl': 439 | self.log_values.append((k, self.totals[k] / self.seen)) 440 | else: 441 | self.totals['ppl'] = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 442 | self.log_values.append((k, self.totals['ppl'])) 443 | if k in logs:
444 | self.log_values.append((k, logs[k])) 445 | if 'val_encode_len' in logs and 'val_nb_words' in logs: 446 | val_ppl = math.exp(logs['val_encode_len']/float(logs['val_nb_words'])) 447 | self.log_values.append(('val_ppl', val_ppl)) 448 | 449 | self.progbar.update(self.seen, self.log_values) -------------------------------------------------------------------------------- /real/workspace/export_sri_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | # noinspection PyUnresolvedReferences 5 | from lm.real.utils import data4sri 6 | 7 | DATA_ROOT = '../../data/' 8 | # data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 9 | # train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=10000) 10 | 11 | # data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 12 | # train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=50000) 13 | 14 | for k in range(10000, 52000, 2000): 15 | data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 16 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=k) 17 | 18 | data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 19 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=100000000) -------------------------------------------------------------------------------- /real/workspace/extract_learning_curv_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | import sys 6 | import os 7 | import re 8 | from scipy.io import savemat 9 | import numpy as np 10 | 11 | log_dir = sys.argv[1] 12 | pat = re.compile(r"main-nce4-.*-V(\d+)-N\d+.log") 13 | file_pat = re.compile(sys.argv[2]) if len(sys.argv) >= 3 else pat 14 | # INFO:NCELangModelV4:Train - time: 1453042236.299597 - loss: 4.672819 15 | # INFO:NCELangModelV4:Val val_loss: 4.653410 - val_ppl: 351.053158 16 | trn_pat = re.compile(r'.*:Train - time: (\d+\.\d+) - loss: (\d+\.\d+)') 17 | val_pat = re.compile(r'.*:Val val_loss: (\d+\.\d+) - val_ppl: (\d+\.\d+)') 18 | log_files = os.listdir(log_dir) 19 | 20 | loss_data = {} 21 | val_data = {} 22 | for file_name in os.listdir(log_dir): 23 | m_k = pat.match(file_name) 24 | if m_k is None: 25 | continue 26 | k = m_k.group(1) 27 | loss_key = 'lossV'+k 28 | val_key = 'pplV' + k 29 | loss_data[loss_key] = [] 30 | val_data[val_key] = [] 31 | with file(log_dir+'/'+file_name, 'r') as f: 32 | for line in f: 33 | m = trn_pat.match(line) 34 | if m: 35 | loss_data[loss_key].append([float(m.group(1)), float(m.group(2))]) 36 | continue 37 | m = val_pat.match(line) 38 | if m: 39 | val_data[val_key].append([float(m.group(1)), float(m.group(2))]) 40 | 41 | data = {} 42 | for k in loss_data: 43 | data[k] = np.array(loss_data[k]) 44 | for k in val_data: 45 | data[k] = np.array(val_data[k]) 46 | 47 | savemat(log_dir+'/loss.mat', data) 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /real/workspace/gen_train_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 
| # noinspection PyUnresolvedReferences 5 | from lm.utils.preprocess import grouped_sentences, smart_open 6 | import numpy as np 7 | 8 | DATA_PATH = '../../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 9 | DATA_DIST = '../../data/corpus/wiki-sg-norm-lc-drop-bin-sample0.2B.bz2' 10 | 11 | 12 | def _commit_result(dist_file, sents): 13 | shape = np.array(sents.shape, dtype=np.int32) 14 | dist_file.write(shape.tobytes()) 15 | dist_file.write(sents.tobytes()) 16 | 17 | first_chunk_size = 50000000 18 | next_chunk_start = first_chunk_size * 2 19 | total_size = 200000000 20 | nb_words = 0 21 | 22 | dist_file = smart_open(DATA_DIST, 'wb') 23 | sents = grouped_sentences(DATA_PATH) 24 | for chunk in sents: 25 | if nb_words > first_chunk_size: 26 | break 27 | nb_words += chunk.size 28 | _commit_result(dist_file, chunk) 29 | 30 | nb_words_ = nb_words 31 | for chunk in sents: 32 | nb_words_ += chunk.size 33 | if nb_words_ > next_chunk_start: 34 | break 35 | 36 | for chunk in sents: 37 | if nb_words >= total_size: 38 | break 39 | nb_words += chunk.size 40 | _commit_result(dist_file, chunk) 41 | 42 | dist_file.close() 43 | 44 | -------------------------------------------------------------------------------- /real/workspace/show_time_loss.m: -------------------------------------------------------------------------------- 1 | % show_time_loss 2 | 3 | % load('logs/loss.mat'); 4 | 5 | nb = length(10000:2000:28000); 6 | loss_data = cell(nb, 1); 7 | % k = 1; 8 | % for i=10000:2000:28000 9 | % val_name = strcat('lossV', num2str(i)); 10 | % tmp = eval(val_name); 11 | % loss_data{k} = [tmp(:, 1) - tmp(1, 1), tmp(:, 2)]; 12 | % k = k + 1; 13 | % end 14 | % 15 | % colmap = hsv(nb); 16 | % figure; hold on; 17 | % for i = 1:nb 18 | % plot(loss_data{i}(3:20:end, 1), loss_data{i}(3:20:end, 2),... 
19 | % 'Color', colmap(i,:)); 20 | % end 21 | k = 1; 22 | ppl_data = cell(nb, 1); 23 | for i=10000:2000:28000 24 | val_name = strcat('pplV', num2str(i)); 25 | tmp = eval(val_name); 26 | ppl_data{k} = [tmp(:, 1) - tmp(1, 1), tmp(:, 2)]; 27 | k = k + 1; 28 | end 29 | 30 | hold off; 31 | for i = 1:nb 32 | figure; 33 | t = linspace(0, 12, size(ppl_data{i},1)); 34 | plotyy(t, ppl_data{i}(:, 1), t, ppl_data{i}(:, 2)); 35 | end -------------------------------------------------------------------------------- /stat/get_stat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import numpy as np 5 | from scipy.io import savemat 6 | 7 | 8 | def get_sample_sent(number=10, min_len=200, corpus=r'../data/wiki-sg-norm-lc.txt'): 9 | samples = [] 10 | with file(corpus) as f: 11 | for line in f: 12 | if len(line.split()) >= min_len: 13 | samples.append(line) 14 | if len(samples) == number: 15 | break 16 | return samples 17 | 18 | if __name__ == '__main__': 19 | datafile = r'../data/wiki-sg-norm-lc.txt' 20 | max_line = 5000000 21 | len_stat = np.zeros(max_line, dtype='int32') 22 | with file(datafile) as f: 23 | for idx, line in enumerate(f): 24 | if idx == max_line: 25 | break 26 | len_stat[idx] = len(line.split()) 27 | 28 | savemat('../data/wiki-stats.mat', {'len_stat': len_stat}, oned_as='column') 29 | -------------------------------------------------------------------------------- /stat/read_stats.m: -------------------------------------------------------------------------------- 1 | function stat = read_stats(filename) 2 | if nargin == 0 3 | filename = '../data/wiki-stats.mat'; 4 | end 5 | s = load(filename); 6 | stat = s.len_stat; -------------------------------------------------------------------------------- /stat/show_stats.m: -------------------------------------------------------------------------------- 1 | stat_all = read_stats; 2 | stat_lt600 = stat_all(stat_all < 600); 3 | stat_le96 = stat_lt600(stat_lt600 <= 96); 4 | sstat_eq1 = sum(stat_le96 == 1); 5 | sstat_eq2 = sum(stat_le96 == 2); 6 | sstat_eq3 = sum(stat_le96 == 3); 7 | sstat_gt96 = sum(stat_all > 96); 8 | sstat_gt80 = sum(stat_all > 80); 9 | sstat_gt64 = sum(stat_all > 64); 10 | stat_3t96 = stat_lt600(stat_lt600 <= 96 & stat_lt600 >=3); 11 | stat_3t64 = stat_le96(stat_le96 <=64 & stat_le96 >=3); 12 | sstat_3t8 = sum(stat_3t64 <= 8); 13 | sstat_9t12 = sum(stat_3t64 <= 12 & stat_3t64 >=9); 14 | 15 | fprintf('percentage of length 1: %.2f%%\n', 100*double(sstat_eq1)/length(stat_all)); 16 | fprintf('percentage of length 2: %.2f%%\n', 100*double(sstat_eq2)/length(stat_all)); 17 | fprintf('percentage of length 3: %.2f%%\n', 100*double(sstat_eq3)/length(stat_all)); 18 | fprintf('percentage of length above 96: %.2f%%\n', 100*double(sstat_gt96)/length(stat_all)); 19 | fprintf('percentage of length above 80: %.2f%%\n', 100*double(sstat_gt80)/length(stat_all)); 20 | fprintf('percentage of length above 64: %.2f%%\n', 100*double(sstat_gt64)/length(stat_all)); 21 | fprintf('percentage of length 3 to 96: %.2f%%\n', 100*double(length(stat_3t96))/length(stat_all)); 22 | fprintf('percentage of length 3 to 8: %.2f%%\n', 100*double(sstat_3t8)/length(stat_all)); 23 | fprintf('percentage of length 9 to 12: %.2f%%\n', 100*double(sstat_9t12)/length(stat_all)); 24 | fprintf('percentage of length 3 to 64: %.2f%%\n', 100*double(length(stat_3t64))/length(stat_all)); 25 | 26 | figure; 27 | hist(stat_all, 1:2500); 28 | title('histogram of
sentences'' lengths between 1 to max'); 29 | 30 | figure; 31 | hist(stat_lt600, 1:600); 32 | title('histogram of sentences'' lengths between 1 to 599'); 33 | 34 | figure; 35 | hist(stat_le96, 1:96); 36 | title('histogram of sentences'' length between 1 to 96'); 37 | 38 | figure; 39 | hist(stat_3t96, 3:96); 40 | title('histogram of sentences'' length between 3 to 96'); 41 | 42 | figure; 43 | hist(stat_3t64, 3:64); 44 | title('histogram of sentences'' length between 3 to 64'); 45 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | -------------------------------------------------------------------------------- /test/snippet.py: -------------------------------------------------------------------------------- 1 | from models import NCELangModel 2 | import os, re 3 | import logging 4 | import numpy as np 5 | 6 | logging.basicConfig(level=logging.DEBUG) 7 | 8 | trn_regex=re.compile(r'\d{3}.bz2') 9 | dir_ = 'data/fake/test' 10 | train_files = [os.path.join(dir_, f) for f in os.listdir(dir_) if trn_regex.match(f)] 11 | X = np.loadtxt(train_files[0], dtype='int32') 12 | 13 | model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128) 14 | ins, _ = model.prepare_input(X, 0, None) 15 | data = {model.input['idxes']: ins[0]} 16 | model.compile() 17 | -------------------------------------------------------------------------------- /test/test_io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import unittest 4 | __author__ = 'Yunchuan Chen' 5 | 6 | 7 | class ReadFileTest(unittest.TestCase): 8 | def test_readlines(self): 9 | iter_lines = [] 10 | read_lines = [] 11 | with file('../data/test_data') as f: 12 | for line in f: 13 | iter_lines.append(line) 14 | 15 | f.seek(0) 16 | while True: 17 | line = f.readline() 18 | if len(line) == 0: 19 | break 20 | read_lines.append(line) 21 | self.failUnless(len(iter_lines) == len(read_lines), 22 | 'Iterating over file is different from readlines\n' 23 | 'The result of iterating over lines: %s\n' 24 | 'The result of readlines: %s' % (str(iter_lines), str(read_lines))) 25 | 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | -------------------------------------------------------------------------------- /utils/fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | from preprocess import smart_open 5 | import sys 6 | import os 7 | 8 | 9 | def chunks(l, n): 10 | """Yield successive n-sized chunks from l.""" 11 | for i in xrange(0, len(l), n): 12 | yield l[i:i+n] 13 | 14 | 15 | def generate(dist_dir, corpus_file='../data/corpus/wiki-sg-norm-lc.tar.bz2', sent_len=64, 16 | max_size=100*2**20, file_size=2**20): 17 | def sentence_generator(): 18 | with smart_open(corpus_file) as f: 19 | for sent in f: 20 | words_ = sent.split() 21 | words = [w for w in words_ if not w.startswith('_')] 22 | chars_ = list(''.join(words)) 23 | chars = [c for c in 
chars_ if ord('a') <= ord(c) <= ord('z')] 24 | chunk_len = sent_len - 1 25 | if len(chars) < chunk_len: 26 | continue 27 | 28 | num_chars = [(ord(x)-ord('a'))//2 + 1 for x in chars] 29 | 30 | def prefix_line(prefix_char, line): 31 | tmp = [prefix_char] 32 | for c in line: 33 | tmp.append(str(c)) 34 | return ' '.join(tmp) + '\n' 35 | 36 | cnks = list(chunks(num_chars, chunk_len)) 37 | line = cnks[0] 38 | yield prefix_line('0', line) 39 | for line in cnks[:-1]: 40 | yield prefix_line('14', line) 41 | line = cnks[-1] 42 | if len(line) == chunk_len: 43 | yield prefix_line('14', line) 44 | 45 | def file_name_generator(max_nb_file=100000, spec='%03d.bz2'): 46 | for idx in xrange(max_nb_file): 47 | dist_file_ = spec % idx 48 | yield os.path.join(dist_dir, dist_file_) 49 | 50 | dfn_gen = file_name_generator() 51 | dist_file_name = dfn_gen.next() 52 | dist_file = smart_open(dist_file_name, mode='wb', buffering=2**10) 53 | sentences = sentence_generator() 54 | 55 | last_size = 0 56 | nb_line = 0 57 | while True: 58 | try: 59 | next_line = sentences.next() 60 | except StopIteration: 61 | break 62 | dist_file.write(next_line) 63 | nb_line += 1 64 | if nb_line % 100 == 0: 65 | if dist_file.tell() >= file_size: 66 | last_size += dist_file.tell() 67 | dist_file.close() 68 | 69 | if last_size >= max_size: 70 | break 71 | 72 | dist_file_name = dfn_gen.next() 73 | dist_file = smart_open(dist_file_name, mode='wb', buffering=2**10) 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bz2 import BZ2File 5 | from copy import copy 6 | import unittest 7 | import os 8 | import numpy as np 9 | import cPickle as pickle 10 | import logging 11 | import re 12 | 13 | __author__ = 'Yunchuan Chen' 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | 17 | class ReadFileTest(unittest.TestCase): 18 | def test_prprcs_wrt(self): 19 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop.bz2'): 20 | return 21 | with BZ2File('../data/corpus/wiki-sg-norm-lc-drop.bz2') as f: 22 | f.readline() 23 | line = f.readline() 24 | self.failUnless('it was shortlisted for the booker prize and won several other awards .'.strip() == line.strip(), 25 | 'read line: %s not as expected.\n' % line) 26 | 27 | def test_ixport(self): 28 | wpx, flag = export_wordmap() 29 | wpi = import_wordmap() 30 | 31 | self.failUnless(flag is True, 'Failure flag received from export map') 32 | if wpx is not None: 33 | self.failUnless('word2idx' in wpx, 'word2idx key lost for the wordmap.') 34 | self.failUnless('idx2word' in wpx, 'idx2word key lost for the wordmap.') 35 | self.failUnless('idx2wc' in wpx, 'idx2wc key lost for the wordmap.') 36 | 37 | self.failUnless('word2idx' in wpi, 'word2idx key lost for the wordmap.') 38 | self.failUnless('idx2word' in wpi, 'idx2word key lost for the wordmap.') 39 | self.failUnless('idx2wc' in wpi, 'idx2wc key lost for the wordmap.') 40 | 41 | 42 | def smart_open(fname, mode='rb', buffering=5*2**20): 43 | _, ext = os.path.splitext(fname) 44 | if ext == '.bz2': 45 | from bz2 import BZ2File 46 | return BZ2File(fname, mode, buffering) 47 | # if ext == '.gz': 48 | # from gzip import GzipFile 49 | # return GzipFile(fname, mode, buffering) 50 | return open(fname, mode, buffering) 51 | 52 | 53 | def export_wordmap(dist_file='../data/wiki-wordmap.wp', 54 | 
corpus_file='../data/corpus/wiki-sg-norm-lc.txt', rebuild=False): 55 | """ 56 | :param dist_file: file name to store the wordmap 57 | :param corpus_file: corpus source to build wordmap against 58 | :param rebuild: whether rebuild wordmap if it already exists. 59 | :return: exported model and a flag. 60 | """ 61 | if os.path.exists(dist_file) and not rebuild: 62 | return None, True 63 | word2cnt = dict() 64 | with smart_open(corpus_file, buffering=5*2**20) as f: 65 | for sent in f: 66 | words = sent.split() 67 | for w in words: 68 | try: 69 | word2cnt[w] += 1 70 | except KeyError: 71 | word2cnt[w] = 1 72 | kv = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True) 73 | idx2word = [w for w, _ in kv] 74 | idx2wc = [c for _, c in kv] 75 | word2idx = dict((w, idx) for idx, (w, _) in enumerate(kv)) 76 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 77 | with file(dist_file, 'wb') as f: 78 | pickle.dump(model, f, -1) 79 | return model, True 80 | 81 | 82 | def import_wordmap(fname='../data/wiki-wordmap.wp'): 83 | """ 84 | :param fname: a string indicate where the wordmap stores. 85 | :return: wordmap 86 | """ 87 | with file(fname, 'rb') as f: 88 | wp = pickle.load(f) 89 | return wp 90 | 91 | 92 | def preprocess_corpus(corpus_file='../data/corpus/wiki-sg-norm-lc.txt', 93 | dist_file='../data/corpus/wiki-sg-norm-lc-drop.bz2'): 94 | """ 95 | :param corpus_file: original corpus file name 96 | :type corpus_file: str 97 | :param dist_file: location to store the preprocessed corpus. 98 | :type dist_file: str 99 | :return: None 100 | Drop all sentences with length not in [3, 64]. Replace words that occurs less than 100 times 101 | with a special word __rare__. 102 | """ 103 | corpus_file = file(corpus_file) 104 | dist_file = smart_open(dist_file, mode='w') 105 | 106 | assert corpus_file is not None and dist_file is not None 107 | wp = import_wordmap() 108 | for line in corpus_file: 109 | words = line.split() 110 | if not (3 <= len(words) <= 64): 111 | continue 112 | words_ = copy(words) 113 | for idx, w in enumerate(words): 114 | if w not in wp['word2idx']: 115 | words_[idx] = '__rare__' 116 | sentence = ' '.join(words_) 117 | dist_file.writelines([sentence, '\n']) 118 | 119 | corpus_file.close() 120 | dist_file.close() 121 | 122 | 123 | def binarize_corpus(group_size=20000, corpus_file='../data/corpus/wiki-sg-norm-lc-drop.bz2', 124 | dist_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 125 | max_len=64, wordmap='../data/wiki-wordmap.wp'): 126 | """ 127 | :param group_size: group size. We repeatedly read group size of sentences and 128 | convert and store them into binary format as a batch. 129 | :type group_size: int 130 | :param corpus_file: the corpus to be converted 131 | :type corpus_file: str 132 | :param dist_file: the file to store the converted corpus 133 | :param max_len: maximum length of sentence. Sentences exceeds this length will be dropped. 134 | :param wordmap: wordmap. 135 | :return: None 136 | """ 137 | def _index_sentence(sent): 138 | """ 139 | :param sent: a sentence as a string 140 | :type sent: str 141 | :return: a list of word index 142 | Represents a sentence using word indexes. 
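        For example, with a hypothetical word2idx of {'the': 0, 'cat': 7, 'sat': 42},
        the sentence 'the cat sat' is converted to [0, 7, 42].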
143 | """ 144 | words = sent.split() 145 | return [word2idx[w] for w in words] 146 | 147 | def _commit_result(): 148 | for idx_sent in result[3:]: 149 | if len(idx_sent) > 0: 150 | sents = np.array(idx_sent, dtype=np.int32) 151 | shape = np.array(sents.shape, dtype=np.int32) 152 | dist_file.write(shape.tobytes()) 153 | dist_file.write(sents.tobytes()) 154 | 155 | for j in range(len(result)): 156 | result[j] = [] 157 | 158 | dist_file = smart_open(dist_file, 'wb') 159 | assert dist_file is not None 160 | if isinstance(wordmap, str): 161 | wp = import_wordmap(fname=wordmap) 162 | elif isinstance(wordmap, dict): 163 | wp = wordmap 164 | else: 165 | logging.error('can not recognize wordmap type') 166 | raise TypeError('wordamp must be dict or str') 167 | word2idx = wp['word2idx'] 168 | result = [[] for _ in range(max_len + 1)] 169 | with smart_open(corpus_file) as f: 170 | for i, sent in enumerate(f, start=1): 171 | idxs = _index_sentence(sent) 172 | try: 173 | result[len(idxs)].append(idxs) 174 | if i % group_size == 0: 175 | _commit_result() 176 | except IndexError: 177 | continue 178 | _commit_result() 179 | 180 | dist_file.close() 181 | 182 | 183 | def grouped_sentences(binary_corpus='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'): 184 | with smart_open(binary_corpus) as f: 185 | while True: 186 | shape_data = f.read(2*4) 187 | if shape_data == "": 188 | return 189 | shape = np.frombuffer(shape_data, dtype=np.uint32) 190 | siz = shape[0] * shape[1] * 4 191 | sents = np.frombuffer(f.read(siz), dtype=np.uint32) 192 | # noinspection PyTypeChecker 193 | sents_ = np.reshape(sents, shape) 194 | yield sents_.copy().astype('int32') 195 | 196 | 197 | def show_grouped_sentences(group_sents, wordmap='../data/wiki-wordmap.wp'): 198 | """ 199 | :param group_sents: a matrix represents a set of sentences' indexes 200 | :type group_sents: numpy.ndarray 201 | :param wordmap: word_ to index_ map and vise versa 202 | :return: list, a list of string representation of the sentences. 
203 | """ 204 | if isinstance(wordmap, str): 205 | # import logging 206 | logger = logging.getLogger('Preprocess') 207 | logger.warn('It would be inefficient if repeatedly call this function with wordmap name') 208 | wordmap = import_wordmap(fname=wordmap) 209 | idx2word = wordmap['idx2word'] 210 | elif isinstance(wordmap, dict): 211 | idx2word = wordmap['idx2word'] 212 | else: 213 | raise TypeError('wordmap must be a string representing the map location or ' 214 | 'a dictionary containing the map') 215 | ret = [None] * group_sents.shape[0] 216 | for i in range(len(ret)): 217 | ret[i] = [idx2word[j] for j in group_sents[i]] 218 | 219 | return ret 220 | 221 | 222 | def get_fake_data_meta(fname='../data/fake', trn_regex=re.compile(r'\d{3}.bz2')): 223 | data_path = os.path.abspath(fname) 224 | meta_file = os.path.join(data_path, 'meta.pkl') 225 | if not os.path.isfile(meta_file): 226 | train_files_ = [os.path.join(data_path, f) for f in os.listdir(data_path) if trn_regex.match(f)] 227 | train_files = [f for f in train_files_ if os.path.isfile(f)] 228 | nb_total = 0 229 | nb_bin = np.zeros((15,), dtype='int32') 230 | 231 | for f in train_files: 232 | X = np.loadtxt(f, dtype='int32') 233 | nb_bin += np.bincount(X.ravel(), minlength=15) 234 | nb_total += np.prod(X.shape) 235 | 236 | rel_freq = nb_bin.astype('float32')/nb_total 237 | ret = {'freq': nb_bin, 'rel_freq': rel_freq, 'nb_total': nb_total} 238 | with file(meta_file, 'wb') as mf: 239 | pickle.dump(ret, mf) 240 | else: 241 | with file(meta_file, 'rb') as mf: 242 | ret = pickle.load(mf) 243 | 244 | return ret 245 | 246 | if __name__ == '__main__': 247 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop.bz2'): 248 | export_wordmap() 249 | preprocess_corpus() 250 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'): 251 | binarize_corpus() 252 | 253 | unittest.main() -------------------------------------------------------------------------------- /utils/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from tree_util import load_brown_tree 6 | 7 | idx2cls, idx2bitstr, mask = load_brown_tree('../brown-cluster/fake-c15-p2.out/paths', dict((str(x), x) for x in range(15))) 8 | 9 | print idx2cls 10 | print idx2bitstr 11 | print mask 12 | -------------------------------------------------------------------------------- /utils/tree_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import numpy as np 5 | import cPickle as pickle 6 | import Queue 7 | 8 | 9 | def prefix_generator(s, start=0, end=None): 10 | if end is None: 11 | end = len(s) + 1 12 | for idx in range(start, end): 13 | yield s[:idx] 14 | 15 | 16 | #paths_line = re.compile(r'(\d+)\s+(\S+)\s+(\d+)') 17 | def load_brown_tree(paths_file, word2idx, start=0, to_end=False): 18 | """ 19 | :param paths_file: paths file which is the output of the wcluster program 20 | :param word2idx: a dictionary which maps each word in vocabulary to a index 21 | :type word2idx: dict 22 | :return: a tuple of word index to word cluster id and bit string and a mask 23 | """ 24 | bit_namespace = set() 25 | idx2bitstr = [None] * len(word2idx) 26 | idx2cls = [None] * len(word2idx) 27 | idx2cls_name = [None] * len(word2idx) 28 | # cls_idx = -1 29 | with file(paths_file, 'r') as f: 30 | for line in f: 31 | try: 32 | bitstr_, word, _ 
= line.split() 33 | except ValueError: 34 | continue 35 | word_clses = [] 36 | end_ = len(bitstr_) if not to_end else len(bitstr_) + 1 37 | for pre in prefix_generator(bitstr_, start=start, end=end_): 38 | if pre not in bit_namespace: 39 | bit_namespace.add(pre) 40 | # cls_idx += 1 41 | word_clses.append(pre) 42 | bitstr = [1 if x == '1' else -1 for x in bitstr_[:end_]] 43 | word_idx = word2idx[word] 44 | idx2bitstr[word_idx] = bitstr 45 | idx2cls_name[word_idx] = word_clses 46 | node_names = sorted(bit_namespace, key=lambda x: len(x)) 47 | clsname2idx = dict(((n, idx) for idx, n in enumerate(node_names))) 48 | for i in range(len(idx2cls)): 49 | idx2cls[i] = [clsname2idx[x] for x in idx2cls_name[i]] 50 | 51 | idx2cls = np.array(pad_virtual_class(idx2cls, pad_value=len(node_names)), dtype='int32') 52 | idx2bitstr = np.array(pad_bitstr(idx2bitstr), dtype='int8') 53 | return idx2cls, idx2bitstr, idx2bitstr != 0 54 | 55 | 56 | def pad_bitstr(bitstr): 57 | """ 58 | :param bitstr: 59 | :type bitstr: list 60 | :return: padded list of bits 61 | """ 62 | max_bit_len = 0 63 | for bits in bitstr: 64 | if len(bits) > max_bit_len: 65 | max_bit_len = len(bits) 66 | for bits in bitstr: 67 | bits.extend([0] * (max_bit_len-len(bits))) 68 | 69 | return bitstr 70 | 71 | 72 | def pad_virtual_class(clses, pad_value): 73 | max_cls_len = 0 74 | for nodes in clses: 75 | if len(nodes) > max_cls_len: 76 | max_cls_len = len(nodes) 77 | for nodes in clses: 78 | nodes.extend([pad_value] * (max_cls_len-len(nodes))) 79 | 80 | return clses 81 | 82 | 83 | def save_tree(fn, idx2cls, idx2bitstr, mask): 84 | with file(fn, 'wb') as f: 85 | pickle.dump({'idx2cls': idx2cls, 'idx2bitstr': idx2bitstr, 'mask': mask}, f) 86 | 87 | 88 | class HuffmanNode(object): 89 | def __init__(self, left=None, right=None, root=None): 90 | self.left = left 91 | self.right = right 92 | self.root = root # Why? Not needed for anything. 93 | 94 | def children(self): 95 | return self.left, self.right 96 | 97 | def preorder(self, path=None, left_code=0, right_code=1, collector=None): 98 | if collector is None: 99 | collector = [] 100 | if path is None: 101 | path = [] 102 | if self.left is not None: 103 | if isinstance(self.left[1], HuffmanNode): 104 | self.left[1].preorder(path+[left_code], left_code, right_code, collector) 105 | else: 106 | # print(self.left[1], path+[left_code]) 107 | collector.append((self.left[1], self.left[0], path+[left_code])) 108 | if self.right is not None: 109 | if isinstance(self.right[1], HuffmanNode): 110 | self.right[1].preorder(path+[right_code], left_code, right_code, collector) 111 | else: 112 | # print(self.right[1], path+[right_code]) 113 | collector.append((self.right[1], self.right[0], path+[right_code])) 114 | 115 | return collector 116 | 117 | 118 | def create_tree(frequencies): 119 | p = Queue.PriorityQueue() 120 | for value in frequencies: # 1. Create a leaf node for each symbol 121 | p.put(value) # and add it to the priority queue 122 | while p.qsize() > 1: # 2. While there is more than one node 123 | l, r = p.get(), p.get() # 2a. remove two highest nodes 124 | node = HuffmanNode(l, r) # 2b. create internal node with children 125 | p.put((l[0]+r[0], node)) # 2c. add new node to queue 126 | return p.get() # 3. 
tree is complete - return root node 127 | 128 | 129 | def load_huffman_tree(meta_file): 130 | import cPickle as pickle 131 | with file(meta_file, 'rb') as f: 132 | meta = pickle.load(f) 133 | rel_freq = meta['rel_freq'] 134 | freq = zip(rel_freq, range(len(rel_freq))) 135 | tree = create_tree(freq)[1] 136 | x = tree.preorder(left_code=-1, right_code=1) 137 | y = sorted(x, key=lambda z: z[1], reverse=True) 138 | bitstr = [] 139 | for _, _, bitstr_ in y: 140 | bitstr.append(bitstr_[:-1]) 141 | 142 | z = [(wrdidx, bits, list(prefix_generator(bits, end=len(bits)))) for wrdidx, _, bits in y] 143 | clses = set() 144 | for _, _, ele in z: 145 | for i in ele: 146 | clses.add(''.join('%+d' % j for j in i)) 147 | idx2clses = sorted(clses, key=lambda ele: len(ele)) 148 | cls2idx = dict(((cls, idx) for idx, cls in enumerate(idx2clses))) 149 | w = map(lambda x: (x[0], x[1], [cls2idx[''.join('%+d' % j for j in p)] for p in x[2]]), z) 150 | 151 | tmp1, tmp2 = [], [] 152 | for _, bits, cls_idx in w: 153 | tmp1.append(bits) 154 | tmp2.append(cls_idx) 155 | pad_bitstr(tmp1) 156 | pad_virtual_class(tmp2, pad_value=len(idx2clses)) 157 | assert len(freq) == len(w) 158 | idx2cls = [None] * len(freq) 159 | idx2bitstr = [None] * len(freq) 160 | for idx, bitstr_, cls_ in w: 161 | idx2cls[idx] = cls_ 162 | idx2bitstr[idx] = bitstr_ 163 | 164 | idx2cls = np.array(idx2cls, dtype='int32') 165 | idx2bitstr = np.array(idx2bitstr, dtype='int8') 166 | 167 | return idx2cls, idx2bitstr, idx2bitstr != 0 168 | 169 | if __name__ == '__main__': 170 | freq = [ 171 | (8.167, 'a'), (1.492, 'b'), (2.782, 'c'), (4.253, 'd'), 172 | (12.702, 'e'),(2.228, 'f'), (2.015, 'g'), (6.094, 'h'), 173 | (6.966, 'i'), (0.153, 'j'), (0.747, 'k'), (4.025, 'l'), 174 | (2.406, 'm'), (6.749, 'n'), (7.507, 'o'), (1.929, 'p'), 175 | (0.095, 'q'), (5.987, 'r'), (6.327, 's'), (9.056, 't'), 176 | (2.758, 'u'), (1.037, 'v'), (2.365, 'w'), (0.150, 'x'), 177 | (1.974, 'y'), (0.074, 'z')] 178 | node = create_tree(freq) 179 | print(node) 180 | 181 | --------------------------------------------------------------------------------
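Usage sketch (not a file in this repository): one plausible way to wire the fake-data statistics from utils/preprocess.py into the Huffman utilities in utils/tree_util.py. It assumes the fake corpus under ../data/fake has already been generated (e.g. by utils/fake_data.py) and that the package is importable as lm, as in real/workspace/gen_train_data.py; the meta.pkl and huffman.tree paths are illustrative only.

from lm.utils.preprocess import get_fake_data_meta
from lm.utils.tree_util import load_huffman_tree, save_tree

# Unigram statistics of the 15-symbol fake vocabulary; cached to ../data/fake/meta.pkl
# on the first call and reloaded from there afterwards.
meta = get_fake_data_meta(fname='../data/fake')
print meta['rel_freq']

# Build a Huffman coding over the vocabulary from the cached meta file and persist the
# class indexes, signed bit strings and mask for later use.
idx2cls, idx2bitstr, mask = load_huffman_tree('../data/fake/meta.pkl')
save_tree('../data/fake/huffman.tree', idx2cls, idx2bitstr, mask)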