├── .gitignore ├── ModelTest.py ├── __init__.py ├── debug.py ├── environ-pku1.sh ├── environ.sh ├── experiments.py ├── models.py ├── real ├── __init__.py ├── baseline │ ├── gen_vocab.sh │ ├── lm_main.sh │ ├── main.sh │ ├── main_kn.sh │ └── sri_env.sh ├── exp_nce0_norm.py ├── exp_nce2_norm.py ├── exp_nce2_zregression.py ├── exp_nce4_norm.py ├── layers.py ├── main_lblv1-pku1.sh ├── main_lblv1.py ├── main_lblv1.sh ├── main_lblv2.py ├── main_lblv2.sh ├── main_nce2.py ├── main_nce2.sh ├── main_nce4.py ├── main_nce4.sh ├── main_nce4_lab_proxy_pku2.sh ├── main_nce4_pku1.sh ├── main_nce4_pku1_proxy_pku2.sh ├── main_nce4_pku2.sh ├── main_nce4_pku3.sh ├── main_nce4_pku3_proxy_pku2.sh ├── main_nce7.py ├── main_nce7.sh ├── main_nce7_pku1.sh ├── main_nce7_pku2.sh ├── main_nce7_pku3.sh ├── main_nce8.py ├── main_nce8.sh ├── main_nce8_nodecay_lab.sh ├── main_nce8_pku1.sh ├── main_nce8_pku2.sh ├── main_nce8_pku3.sh ├── models.py ├── run_batch.sh ├── run_nce0.py ├── run_nce0_default.py ├── run_nce0_neg100_default.py ├── run_nce0_neg50_lr0.005.py ├── run_nce0_neg50_lr0.01.py ├── run_nce0_neg50_lr0.01_g0.001.py ├── run_nce1_neg50_default.py ├── run_nce2.py ├── run_nce2_neg50_lr0.01_g0.001.py ├── run_nce3.py ├── run_nce4.py ├── run_nce5.py ├── run_nce6.py ├── run_tree_huffman_lr0.01_g0.001.py ├── utils │ ├── __init__.py │ ├── check_maps.py │ ├── preprocess.py │ └── utils.py └── workspace │ ├── export_sri_data.py │ ├── extract_learning_curv_data.py │ ├── gen_train_data.py │ └── show_time_loss.m ├── stat ├── get_stat.py ├── read_stats.m └── show_stats.m ├── test ├── __init__.py ├── snippet.py └── test_io.py └── utils ├── __init__.py ├── fake_data.py ├── preprocess.py ├── test.py └── tree_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | brown-cluster 2 | srilm-1.7.1 3 | data 4 | stat/*.fig 5 | stat/*.jpg 6 | .idea 7 | .ipynb_checkpoints 8 | # Created by .ignore support plugin (hsz.mobi) 9 | ### Matlab template 10 | ##--------------------------------------------------- 11 | ## Remove autosaves generated by the Matlab editor 12 | ## We have git for backups! 13 | ##--------------------------------------------------- 14 | 15 | # Windows default autosave extension 16 | *.asv 17 | 18 | # OSX / *nix default autosave extension 19 | *.m~ 20 | 21 | # Compiled MEX binaries (all platforms) 22 | *.mex* 23 | 24 | # Simulink Code Generation 25 | slprj/ 26 | 27 | 28 | ### Python template 29 | # Byte-compiled / optimized / DLL files 30 | __pycache__/ 31 | *.py[cod] 32 | 33 | # C extensions 34 | *.so 35 | 36 | # Distribution / packaging 37 | .Python 38 | env/ 39 | build/ 40 | develop-eggs/ 41 | dist/ 42 | downloads/ 43 | eggs/ 44 | .eggs/ 45 | lib/ 46 | lib64/ 47 | parts/ 48 | sdist/ 49 | var/ 50 | *.egg-info/ 51 | .installed.cfg 52 | *.egg 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .coverage 68 | .coverage.* 69 | .cache 70 | nosetests.xml 71 | coverage.xml 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | 87 | ### Vim template 88 | [._]*.s[a-w][a-z] 89 | [._]s[a-w][a-z] 90 | *.un~ 91 | Session.vim 92 | .netrwhist 93 | *~ 94 | 95 | 96 | -------------------------------------------------------------------------------- /ModelTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import SimpleLangModel, NCELangModel, NCELangModelV1, TreeLangModel 6 | from keras.layers.core import Dropout, Dense 7 | import os 8 | import logging 9 | import optparse 10 | import cPickle as pickle 11 | import theano 12 | 13 | floatX = theano.config.floatX 14 | 15 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 16 | parser.add_option("-q", "--quiet", 17 | action="store_false", dest="verbose", default=True, 18 | help="don't print progress bar to stdout") 19 | parser.add_option("-s", "--simple", 20 | action="store_true", dest="train_simple", default=False, 21 | help="Train Simple language model") 22 | parser.add_option("-n", "--nce", 23 | action="store_true", dest="train_nce", default=False, 24 | help="Train NCE based language model") 25 | parser.add_option("-c", "--nce1", 26 | action="store_true", dest="train_nce1", default=False, 27 | help="Train NCE based language model V1") 28 | parser.add_option("-b", "--batch-size", type='int', dest="batch_size", default=256, 29 | help="Batch size") 30 | parser.add_option("-t", "--test", 31 | action="store_true", dest="test", default=False, 32 | help="train on small data set") 33 | parser.add_option("-r", "--tree", 34 | action="store_true", dest="tree", default=False, 35 | help="Train hierarchical softmax language model") 36 | parser.add_option("-m", "--tree-type", type='str', 37 | dest="tree_type", default='huffman', 38 | help="Specify the type of the tree") 39 | parser.add_option("-d", "--debug", 40 | action="store_true", dest="debug", default=False, 41 | help="show debug information") 42 | parser.add_option("-g", "--unigram", 43 | action="store_true", dest="unigram", default=False, 44 | help="Whether use unigram distribution for noise samples") 45 | parser.add_option("-z", "--optimizer", type='str', 46 | dest="optimizer", default='adam', 47 | help="Specify optimizer") 48 | parser.add_option('-a', "--attention", dest="attention", type='str', default='none', 49 | help='Specify attention model') 50 | parser.add_option('-l', '--attention-length', dest='att_len', type='int', default=10, 51 | help='Specify attention bias length') 52 | 53 | options, args = parser.parse_args() 54 | # ==================================================================================== 55 | # if TESTLM environment variable is defined, run the program on a small data set. 
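# A typical invocation (hypothetical example; flags as defined by the option parser above) is:
#     TESTLM=1 python ModelTest.py --nce -b 128 --unigram
# which trains the NCE model on the small data set under data/fake/test.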
56 | if os.environ.get('TESTLM') is not None or options.test: 57 | data_path = os.path.abspath('data/fake/test') 58 | else: 59 | data_path = os.path.abspath('data/fake') 60 | #data_path = os.path.abspath('data/fake/test') 61 | 62 | if options.debug: 63 | logging.basicConfig(level=logging.DEBUG) 64 | else: 65 | logging.basicConfig(level=logging.INFO) 66 | 67 | if options.unigram: 68 | import cPickle as pickle 69 | with file(os.path.join(data_path, 'meta.pkl'), 'rb') as mf: 70 | meta = pickle.load(mf) 71 | negprob_table = meta['rel_freq'] 72 | else: 73 | negprob_table = None 74 | 75 | if options.train_simple: 76 | logging.info('Train simple language model') 77 | model = SimpleLangModel(vocab_size=15, embed_dims=128, context_dims=128, optimizer=options.optimizer) 78 | model.compile() 79 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 80 | 81 | if options.train_nce: 82 | logging.info('Train NCE based language model') 83 | model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128, negprob_table=negprob_table, 84 | optimizer=options.optimizer) 85 | model.compile() 86 | logging.debug('compile success') 87 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 88 | 89 | if options.train_nce1: 90 | logging.info('Train NCE based language model (1)') 91 | model = NCELangModelV1(vocab_size=15, nb_negative=6, embed_dims=128, negprob_table=negprob_table, 92 | optimizer=options.optimizer) 93 | model.compile() 94 | logging.debug('compile success') 95 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 96 | 97 | if options.tree: 98 | logging.info('Train hierarchical softmax language model') 99 | 100 | if options.tree_type == 'huffman': 101 | logging.info('train with Huffman Tree') 102 | tree_file = 'data/fake/tree-info-huffman.pkl' 103 | else: 104 | tree_file = 'data/fake/tree-info.pkl' 105 | logging.info('Train with Brown tree') 106 | 107 | with file('data/fake/tree-info.pkl', 'rb') as f: 108 | tree_info = pickle.load(f) 109 | word2cls = tree_info['idx2cls'] 110 | word2bitstr = tree_info['idx2bitstr'].astype(floatX) 111 | 112 | model = TreeLangModel(vocab_size=15, embed_dim=128, cntx_dim=128, word2class=word2cls, word2bitstr=word2bitstr) 113 | model.compile(optimizer=options.optimizer) 114 | logging.debug('compile success') 115 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 116 | 117 | if options.attention == 'simple': 118 | from models import SimpAttLangModel 119 | model = SimpAttLangModel(vocab_size=15, embed_dims=128, context_dim=128, attention_len=options.att_len) 120 | model.compile() 121 | logging.debug('compile success') 122 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 123 | 124 | if options.attention == 'parallel': 125 | from models import ParallelAttLangModel 126 | model = ParallelAttLangModel(vocab_size=15, embed_dims=128, context_dim=128, attention_len=options.att_len) 127 | model.compile() 128 | logging.debug('compile success') 129 | model.train_from_dir(data_path, validation_split=0.05, batch_size=options.batch_size, verbose=options.verbose) 130 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 
3 | __author__ = 'Yunchuan Chen' 4 | 5 | -------------------------------------------------------------------------------- /debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import TreeLangModel 6 | import cPickle as pickle 7 | import theano 8 | import numpy as np 9 | 10 | floatX = theano.config.floatX 11 | train_file = 'data/fake/001.bz2' 12 | 13 | with file('data/fake/tree-info.pkl', 'rb') as f: 14 | tree_info = pickle.load(f) 15 | word2cls = tree_info['idx2cls'] 16 | word2bitstr = tree_info['idx2bitstr'] 17 | 18 | model = TreeLangModel(vocab_size=15, embed_dim=128, cntx_dim=128, word2class=word2cls, word2bitstr=word2bitstr) 19 | 20 | 21 | X = np.loadtxt(train_file, dtype='int32') 22 | data = X[:256] 23 | del X 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /environ-pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # source ~/bin/ch.gcc-4.8.4.sh 3 | export THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32,nvcc.fastmath=True,scan.allow_gc=True,allow_gc=True 4 | #export PYTHONPATH=/home/cyc/Documents/workspace:$PYTHONPATH 5 | export PYTHONPATH=$HOME/.chen/workspace:$PYTHONPATH 6 | export OMP_NUM_THREADS=4 7 | export SRILM="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 8 | export PATH=$PATH:$SRILM/bin/i686-m64 9 | export MANPATH=$MANPATH:$SRILM/man 10 | export LC_NUMERIC=C 11 | 12 | -------------------------------------------------------------------------------- /environ.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source ~/bin/ch.gcc-4.8.4.sh 3 | export THEANO_FLAGS=mode=FAST_RUN,device=cpu,floatX=float32,nvcc.fastmath=True,scan.allow_gc=True,allow_gc=True 4 | export PYTHONPATH=/home/cyc/Documents/workspace:$PYTHONPATH 5 | # export PYTHONPATH=$HOME/.chen/workspace:$PYTHONPATH 6 | export OMP_NUM_THREADS=4 7 | export SRILM="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 8 | export PATH=$PATH:$SRILM/bin/i686-m64 9 | export MANPATH=$MANPATH:$SRILM/man 10 | export LC_NUMERIC=C 11 | 12 | -------------------------------------------------------------------------------- /real/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' -------------------------------------------------------------------------------- /real/baseline/gen_vocab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | for (( nb_vocab=10000; nb_vocab<52000; nb_vocab+=2000 )); do 3 | let file_name=${nb_vocab}/1000 4 | for (( i=0; i<$nb_vocab; ++i )); do 5 | echo $i 6 | done > ../../data/corpus/sri/${file_name}k.vocab 7 | done 8 | 9 | -------------------------------------------------------------------------------- /real/baseline/lm_main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR="../../data/corpus/sri" 4 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V100k.txt" 5 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V100k.txt" 6 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-order4-gt1-3.count" 7 | OOV="900000" 8 | 9 | ngram-count -text $TEXTFILE -order 4 -write-binary $COUNT_FILE \ 10 | -gt1 ${DATA_DIR}/gt1.params \ 11 | -gt2 
${DATA_DIR}/gt2.params \ 12 | -gt3 ${DATA_DIR}/gt3.params 13 | 14 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 15 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt1-3.lm" 16 | echo "Begin: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 17 | 18 | ngram-count -read $COUNT_FILE -vocab ${nb_vocab}k.vocab -unk -map-unk $OOV \ 19 | -order 4 -write-binary-lm -lm $LM \ 20 | -gt1 ${DATA_DIR}/gt1.params \ 21 | -gt2 ${DATA_DIR}/gt2.params \ 22 | -gt3 ${DATA_DIR}/gt3.params 23 | ngram -unk -map-unk $OOV -lm ${LM} -ppl ${TEST_DATA} 24 | 25 | echo "END: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 26 | done 27 | 28 | # for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 29 | # TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 30 | # TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 31 | # for (( kn = 1; kn <=4; kn += 1 )); do 32 | # COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.count" 33 | # LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.lm" 34 | # echo "Begin: Test order 4, kn=${kn}, vocab: ${nb_vocab}" 35 | # ngram-count -text $TEXTFILE -kndiscount $kn -order 4 -write-binary $COUNT_FILE 36 | # ngram-count -read $COUNT_FILE -kn-counts-modified -write-binary-lm -lm $LM 37 | # ngram -lm ${LM} -ppl ${TEST_DATA} 38 | # echo "END: Test order 4, kn=${kn}" 39 | # done 40 | # done 41 | -------------------------------------------------------------------------------- /real/baseline/main.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DATA_DIR="../../data/corpus/sri" 4 | 5 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 6 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 7 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 8 | for (( kn = 1; kn <=4; kn += 1 )); do 9 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.count" 10 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-kn${kn}.lm" 11 | echo "Begin: Test order 4, kn=${kn}, vocab: ${nb_vocab}" 12 | ngram-count -text $TEXTFILE -kndiscount $kn -order 4 -write-binary $COUNT_FILE 13 | ngram-count -read $COUNT_FILE -kn-counts-modified -write-binary-lm -lm $LM 14 | ngram -lm ${LM} -ppl ${TEST_DATA} 15 | echo "END: Test order 4, kn=${kn}" 16 | done 17 | done 18 | 19 | for (( nb_vocab=10; nb_vocab <=50; nb_vocab+= 2 )); do 20 | TEXTFILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k.txt" 21 | TEST_DATA="${DATA_DIR}/wiki-val-R5m-V${nb_vocab}k.txt" 22 | for (( gt = 1; gt <=4; gt += 1 )); do 23 | COUNT_FILE="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.count" 24 | LM="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.lm" 25 | PAR="${DATA_DIR}/wiki-trn-R100m-V${nb_vocab}k-order4-gt${gt}.gt" 26 | echo "Begin: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 27 | ngram-count -text $TEXTFILE -order 4 -gt${gt} ${PAR} -write-binary $COUNT_FILE 28 | ngram-count -read $COUNT_FILE -write-binary-lm -lm ${LM} -gt${gt} ${PAR} 29 | ngram -lm ${LM} -ppl ${TEST_DATA} 30 | echo "END: Test order 4, gt=${gt}, vocab: ${nb_vocab}" 31 | done 32 | done -------------------------------------------------------------------------------- /real/baseline/main_kn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: prog logfile 3 | DATA_DIR="../../data/corpus/sri" 4 | TEXTFILE="$DATA_DIR/wiki-trn-R100m.txt" 5 | TESTFILE="$DATA_DIR/wiki-val-R5m.txt" 6 | 7 | KnParams="-kn1 ${DATA_DIR}/wiki-kn1.param -kn2 ${DATA_DIR}/wiki-kn2.param -kn3 
${DATA_DIR}/wiki-kn3.param -kn4 ${DATA_DIR}/wiki-kn4.param" 8 | CommParams="-unk -map-unk 10000000" 9 | ngram-count -order 4 -text $TEXTFILE $KnParams 10 | 11 | for (( nb_vocab=10; nb_vocab<=50; nb_vocab+=2 )); do 12 | NGRAMS=$DATA_DIR/wiki-V${nb_vocab}k.4grams 13 | VOCAB=$DATA_DIR/${nb_vocab}k.vocab 14 | 15 | ngram-count -order 4 -text $TEXTFILE $CommParams -vocab $VOCAB -write-binary $NGRAMS 16 | for order in 2 3 4; do 17 | LM=$DATA_DIR/wiki-V${nb_vocab}k-order${order}.lm 18 | ngram-count -order $order -read $NGRAMS $CommParams \ 19 | -kndiscount${order} -write-binary-lm -lm $LM -vocab $VOCAB $KnParams 20 | 21 | echo "PPL Results for V=$nb_vocab and order=$order: " | tee -a $1 22 | ngram -lm $LM -ppl $TESTFILE $CommParams | tee -a $1 23 | done 24 | done -------------------------------------------------------------------------------- /real/baseline/sri_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | export SRILM="$( cd "../../$( dirname "${BASH_SOURCE[0]}" )" && pwd )"/srilm-1.7.1 3 | export PATH=$PATH:$SRILM/bin/i686-m64 4 | export MANPATH=$MANPATH:$SRILM/man 5 | export LC_NUMERIC=C -------------------------------------------------------------------------------- /real/exp_nce0_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | # noinspection PyUnresolvedReferences 9 | from models import LangModel, Graph, optimizers, categorical_crossentropy, objective_fnc, slice_X, \ 10 | np, theano, TableSampler, Split, containers, T, LookupProb, logger, Embedding, math, make_batches, \ 11 | PartialSoftmax, Dense, LogInfo, MAX_SETN_LEN, grouped_sentences, time, chunk_sentences, LangLSTMLayer 12 | from layers import ActivationLayer 13 | 14 | 15 | class NCELangModelV0(Graph, LangModel): 16 | def __init__(self, vocab_size, nb_negative, embed_dims=128, context_dims=128, 17 | negprob_table=None, optimizer='adam'): 18 | super(NCELangModelV0, self).__init__(weighted_inputs=False) 19 | self.vocab_size = vocab_size 20 | self.embed_dim = embed_dims 21 | self.optimizer = optimizers.get(optimizer) 22 | self.nb_negative = nb_negative 23 | self.loss = categorical_crossentropy 24 | self.loss_fnc = objective_fnc(self.loss) 25 | 26 | if negprob_table is None: 27 | negprob_table_ = np.ones(shape=(vocab_size,), dtype=theano.config.floatX)/vocab_size 28 | negprob_table = theano.shared(negprob_table_) 29 | self.neg_prob_table = negprob_table_ 30 | else: 31 | self.neg_prob_table = negprob_table.astype(theano.config.floatX) 32 | negprob_table = theano.shared(negprob_table.astype(theano.config.floatX)) 33 | 34 | self.sampler = TableSampler(self.neg_prob_table) 35 | 36 | self.add_input(name='idxes', ndim=3, dtype='int32') 37 | self.add_node(Split(split_at=1, split_axis=0), name=('pos_sents', ''), inputs='idxes') 38 | 39 | seq = containers.Sequential() 40 | seq.add(self.nodes['pos_sents']) 41 | seq.add(Embedding(vocab_size, embed_dims)) 42 | seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims)) 43 | # seq.add(Dropout(0.5)) 44 | 45 | self.add_node(seq, name='seq') 46 | self.add_node(PartialSoftmax(input_dim=context_dims, output_dim=vocab_size), 47 | name='part_prob', inputs=('idxes', 'seq')) 48 | self.add_node(LookupProb(negprob_table), name='lookup_prob', inputs='idxes') 49 | 50 | test_node = 
Dense(input_dim=context_dims, output_dim=vocab_size, activation='exponential') 51 | test_node.params = [] 52 | test_node.W = self.nodes['part_prob'].W 53 | test_node.b = self.nodes['part_prob'].b 54 | self.add_node(test_node, name='true_unrm_prob', inputs='seq') 55 | self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob') 56 | 57 | self.add_output('pos_prob', node='part_prob') 58 | self.add_output('neg_prob', node='lookup_prob') 59 | self.add_output('pred_prob', node='true_prob') 60 | self.add_output('unrm_prob', node='true_unrm_prob') 61 | 62 | # noinspection PyMethodOverriding 63 | def compile(self): 64 | pos_prob_layer = self.outputs['pos_prob'] 65 | neg_prob_layer = self.outputs['neg_prob'] 66 | pre_prob_layer = self.outputs['pred_prob'] 67 | unrm_pro_layer = self.outputs['unrm_prob'] 68 | 69 | pos_prob_trn = pos_prob_layer.get_output(train=True) 70 | neg_prob_trn = neg_prob_layer.get_output(train=True) * self.nb_negative 71 | pos_prob_tst = pos_prob_layer.get_output(train=False) 72 | neg_prob_tst = neg_prob_layer.get_output(train=False) * self.nb_negative 73 | pre_prob_tst = pre_prob_layer.get_output(train=False) 74 | unrm_pro_tst = unrm_pro_layer.get_output(train=False) 75 | 76 | partition = T.sum(unrm_pro_tst, axis=-1) 77 | sum_unrm = T.sum(partition) 78 | squre_urm = T.sum(partition * partition) 79 | 80 | eps = 1.0e-37 81 | #TODO: mask not supported here 82 | nb_words = pos_prob_trn[0].size.astype(theano.config.floatX) 83 | sum_pos_neg_trn = pos_prob_trn + neg_prob_trn 84 | sum_pos_neg_tst = pos_prob_tst + neg_prob_tst 85 | y_train = T.sum(T.log(eps + pos_prob_trn[0] / sum_pos_neg_trn[0])) / nb_words 86 | y_train += T.sum(T.log(eps + neg_prob_trn[1:] / sum_pos_neg_trn[1:])) / nb_words 87 | y_test = T.sum(T.log(eps + pos_prob_tst[0] / sum_pos_neg_tst[0])) / nb_words 88 | y_test += T.sum(T.log(eps + neg_prob_tst[1:] / sum_pos_neg_tst[1:])) / nb_words 89 | 90 | true_labels = self.inputs['idxes'].get_output()[0] 91 | encode_len, nb_words = self.encode_length(true_labels, pre_prob_tst) 92 | 93 | train_loss = -y_train 94 | test_loss = -y_test 95 | for r in self.regularizers: 96 | train_loss = r(train_loss) 97 | updates = self.optimizer.get_updates(self.params, self.constraints, train_loss) 98 | updates += self.updates 99 | 100 | self._train = theano.function([self.inputs['idxes'].get_output(True)], outputs=train_loss, 101 | updates=updates) 102 | self._test = theano.function([self.inputs['idxes'].get_output(False)], 103 | outputs=[test_loss, encode_len, nb_words, sum_unrm, squre_urm]) 104 | 105 | self._train.out_labels = ('loss', ) 106 | self._test.out_labels = ('loss', 'encode_len', 'nb_words') 107 | self.all_metrics = ['loss', 'val_loss', 'val_ppl'] 108 | 109 | def __summarize_outputs(outs, batch_sizes): 110 | """ 111 | :param outs: outputs of the _test* function. It is a list, and each element a list of 112 | values of the outputs of the _test* function on corresponding batch. 113 | :type outs: list 114 | :param batch_sizes: batch sizes. A list with the same length with outs. Each element 115 | is a size of corresponding batch. 116 | :type batch_sizes: list 117 | Aggregate outputs of batches as if the test function evaluates 118 | the metric values on the union of the batches. 
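For example (hypothetical numbers), batch losses [2.0, 4.0] over batches of
size [100, 300] summarize to (2.0*100 + 4.0*300)/400 = 3.5, whereas
encode_len, nb_words and the partition sums are simply added up.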
119 | Note this function must be redefined for each specific problem 120 | """ 121 | out = np.array(outs, dtype=theano.config.floatX) 122 | loss, encode_len, nb_words, sum_unrm, squre_urm = out 123 | batch_size = np.array(batch_sizes, dtype=theano.config.floatX) 124 | 125 | smry_loss = np.sum(loss * batch_size)/batch_size.sum() 126 | smry_encode_len = encode_len.sum() 127 | smry_nb_words = nb_words.sum() 128 | smry_unrm = sum_unrm.sum() 129 | smry_sq_unrm = squre_urm.sum() 130 | return [smry_loss, smry_encode_len, smry_nb_words, smry_unrm, smry_sq_unrm] 131 | 132 | self._test.summarize_outputs = __summarize_outputs 133 | 134 | def negative_sample(self, X, order=0): 135 | if order == 0: 136 | ret = np.empty(shape=(self.nb_negative+1,) + X.shape, dtype=X.dtype) 137 | ret[0] = X 138 | ret[1:] = self.sampler.sample(shape=ret[1:].shape) 139 | else: 140 | raise NotImplementedError('Only support order=0 now') 141 | return ret 142 | 143 | def _loop_train(self, data, batch_size): 144 | nb = data.shape[1] 145 | nb_words = data[0].size 146 | loss = 0.0 147 | for start in xrange(0, nb, batch_size): 148 | end = start + batch_size 149 | ins = data[:, start:end] 150 | loss_ = self._train(ins) 151 | loss += loss_ * ins[0].size 152 | 153 | loss /= nb_words 154 | return loss 155 | 156 | def train(self, data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=None, 157 | batch_size=256, train_nb_words=100000000, val_nb_words=100000, train_val_nb=100000, 158 | validation_interval=1800, log_file=None): 159 | opt_info = self.optimizer.get_config() 160 | opt_info = ', '.join(["{}: {}".format(n, v) for n, v in opt_info.items()]) 161 | 162 | logger.info('training with file: %s' % data_file) 163 | logger.info('training with batch size %d' % batch_size) 164 | logger.info('training with %d words; validate with %d words during training; ' 165 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 166 | logger.info('validate every %f seconds' % float(validation_interval)) 167 | logger.info('optimizer: %s' % opt_info) 168 | 169 | log_file = LogInfo(log_file) 170 | log_file.info('training with file: %s' % data_file) 171 | log_file.info('training with batch size %d' % batch_size) 172 | log_file.info('training with %d words; validate with %d words during training; ' 173 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 174 | log_file.info('validate every %f seconds' % float(validation_interval)) 175 | log_file.info('optimizer: %s' % opt_info) 176 | 177 | sentences = [None for _ in range(MAX_SETN_LEN)] # TODO: sentences longer than 64 are ignored. 178 | 179 | max_vocab = self.vocab_size - 1 180 | nb_trained = 0. 
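# running counters: nb_trained accumulates sentences and nb_words_trained accumulates
# words; both feed the speed and ETA estimates reported in the progress log below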
181 | nb_words_trained = 0.0 182 | sent_gen = grouped_sentences(data_file) 183 | val_sents = self.get_val_data(sent_gen, val_nb_words) 184 | train_val_sents = self.get_val_data(sent_gen, train_val_nb) 185 | 186 | self.validation(train_val_sents, batch_size, log_file) 187 | start_ = time() 188 | next_val_time = start_ + validation_interval 189 | for sents in sent_gen: 190 | mask = (sents > max_vocab) 191 | sents[mask] = max_vocab 192 | chunk = chunk_sentences(sentences, sents, batch_size) 193 | if chunk is None: 194 | continue 195 | 196 | # loss, ce, nb_wrd = self._train(chunk, chunk) 197 | x = self.negative_sample(chunk) 198 | loss = self._loop_train(x, batch_size) 199 | nb_trained += chunk.shape[0] 200 | nb_words_trained += chunk.size 201 | end_ = time() 202 | elapsed = float(end_ - start_) 203 | speed1 = nb_trained/elapsed 204 | speed2 = nb_words_trained/elapsed 205 | eta = (train_nb_words - nb_words_trained) / speed2 206 | eta_h = int(math.floor(eta/3600)) 207 | eta_m = int(math.ceil((eta - eta_h * 3600)/60.)) 208 | logger.info('%s:Train - ETA: %02d:%02d - loss: %5.1f - speed: %.1f sent/s %.1f words/s' % 209 | (self.__class__.__name__, eta_h, eta_m, loss, speed1, speed2)) 210 | log_file.info('%s:Train - time: %f - loss: %.6f' % (self.__class__.__name__, end_, loss)) 211 | 212 | if end_ > next_val_time: 213 | # noinspection PyUnresolvedReferences 214 | self.validation(train_val_sents, batch_size, log_file) 215 | next_val_time = time() + validation_interval 216 | 217 | if nb_words_trained >= train_nb_words: 218 | logger.info('Training finished. Evaluating ...') 219 | log_file.info('Training finished. Evaluating ...') 220 | self.validation(val_sents, batch_size, log_file) 221 | if save_path is not None: 222 | self.save_params(save_path) 223 | break 224 | 225 | log_file.close() 226 | 227 | def validation(self, val_sents, batch_size, log_file=None): 228 | """ 229 | :param val_sents: validation sentences. 230 | :type val_sents: a list, each element a ndarray 231 | :return: tuple 232 | """ 233 | code_len = 0. 234 | nb_words = 0. 
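# accumulators over the validation batches: loss is re-weighted by word count,
# code_len/nb_words gives the perplexity exponent, and unrm/sq_unrm track the
# mean and spread of the un-normalized softmax partition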
235 | loss = 0.0 236 | unrm = 0.0 237 | sq_unrm = 0.0 238 | 239 | for sents in val_sents: 240 | x = [self.negative_sample(sents)] 241 | loss_, code_len_, nb_words_, unrm_, sq_unrm_ = self._test_loop(self._test, x, batch_size) 242 | nb_words += nb_words_ 243 | code_len += code_len_ 244 | loss += loss_ * nb_words_ 245 | unrm += unrm_ 246 | sq_unrm += sq_unrm_ 247 | 248 | loss /= nb_words 249 | ppl = math.exp(code_len/nb_words) 250 | mean_unrm = unrm / nb_words 251 | mean_sq_unrm = sq_unrm / nb_words 252 | std_unrm = mean_sq_unrm - mean_unrm * mean_unrm 253 | logger.info('%s:Val val_loss: %.2f - val_ppl: %.2f - partition: mean: %.2f std: %.2f' % 254 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm)) 255 | log_file.info('%s:Val val_loss: %.6f - val_ppl: %.6f - partition: mean: %.6f std: %.6f' % 256 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm)) 257 | 258 | return loss, ppl, mean_unrm, std_unrm 259 | 260 | @staticmethod 261 | def _test_loop(f, ins, batch_size=128, verbose=0): 262 | nb_sample = ins[0].shape[1] 263 | outs = [[] for _ in range(f.n_returned_outputs)] 264 | batch_info = [] 265 | batches = make_batches(nb_sample, batch_size) 266 | for batch_index, (batch_start, batch_end) in enumerate(batches): 267 | ins_batch = slice_X(ins, start_=batch_start, end_=batch_end, axis=1) 268 | batch_outs = f(*ins_batch) 269 | for idx, v in enumerate(batch_outs): 270 | outs[idx].append(v) 271 | batch_info.append(batch_end - batch_start) 272 | 273 | outs = f.summarize_outputs(outs, batch_info) 274 | return outs 275 | 276 | 277 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 278 | NB_RUN_WORDS = 100000000 279 | NB_VOCAB = 10000 280 | NB_RUN_VAL = 100000 281 | NB_EVALUATE = 5000000 282 | BATCH_SIZE = 256 283 | 284 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 285 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 286 | help="learning rate") 287 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 288 | help="amount of training data (number of words)") 289 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 290 | help="vocabulary size") 291 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 292 | help="running validation words") 293 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 294 | help="running validation words") 295 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 296 | help="decaying rate") 297 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 298 | help="decaying rate") 299 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 300 | help="decay lr or not") 301 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 302 | help="amount of training data (number of words)") 303 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 304 | help="amount of training data (number of words)") 305 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 306 | help="amount of training data (number of words)") 307 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 308 | help="amount of training data (number of words)") 309 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 310 | help="decaying rate") 311 | 
parser.add_option("-s", "--save", type="str", dest="save", default='', 312 | help="amount of training data (number of words)") 313 | options, args = parser.parse_args() 314 | 315 | nb_run_words = options.running_words 316 | nb_vocab = options.vocab_size 317 | nb_run_val = options.val_run 318 | nb_evaluate = options.nb_evaluation 319 | 320 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 321 | 322 | if options.decay: 323 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 324 | else: 325 | opt = adam(lr=options.lr) 326 | 327 | if options.log_file == '': 328 | log_file = None 329 | else: 330 | log_file = options.log_file 331 | 332 | if options.save == '': 333 | save_path = None 334 | else: 335 | save_path = options.save 336 | 337 | model = NCELangModelV0(vocab_size=nb_vocab, nb_negative=options.negative, 338 | embed_dims=options.embed_size, context_dims=options.context_size, 339 | negprob_table=unigram_table, optimizer=opt) 340 | model.compile() 341 | model.train(data_file=DATA_PATH, 342 | save_path=save_path, 343 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 344 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 345 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/exp_nce2_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from keras.optimizers import AdamAnneal, adam 7 | import optparse 8 | from layers import ActivationLayer 9 | # noinspection PyUnresolvedReferences 10 | from models import Graph, LangModel, LogInfo, optimizers, categorical_crossentropy, \ 11 | objective_fnc, np, theano, T, TableSampler, logger, grouped_sentences, MAX_SETN_LEN, \ 12 | chunk_sentences, time, math, make_batches, slice_X, containers, Embedding, PartialSoftmax,\ 13 | Split, LangLSTMLayer, LookupProb, Dense 14 | 15 | 16 | class NCELangModelV2(Graph, LangModel): 17 | def __init__(self, vocab_size, nb_negative, embed_dims=128, context_dims=128, 18 | negprob_table=None, optimizer='adam'): 19 | super(NCELangModelV2, self).__init__(weighted_inputs=False) 20 | self.vocab_size = vocab_size 21 | self.embed_dim = embed_dims 22 | self.optimizer = optimizers.get(optimizer) 23 | self.nb_negative = nb_negative 24 | self.loss = categorical_crossentropy 25 | self.loss_fnc = objective_fnc(self.loss) 26 | 27 | if negprob_table is None: 28 | negprob_table_ = np.ones(shape=(vocab_size,), dtype=theano.config.floatX)/vocab_size 29 | negprob_table = theano.shared(negprob_table_) 30 | self.neg_prob_table = negprob_table_ 31 | else: 32 | self.neg_prob_table = negprob_table.astype(theano.config.floatX) 33 | negprob_table = theano.shared(negprob_table.astype(theano.config.floatX)) 34 | 35 | self.sampler = TableSampler(self.neg_prob_table) 36 | 37 | self.add_input(name='idxes', ndim=3, dtype='int32') 38 | self.add_node(Split(split_at=1, split_axis=0), name=('pos_sents', ''), inputs='idxes') 39 | 40 | seq = containers.Sequential() 41 | seq.add(self.nodes['pos_sents']) 42 | seq.add(Embedding(vocab_size, embed_dims)) 43 | seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims)) 44 | # seq.add(Dropout(0.5)) 45 | 46 | self.add_node(seq, name='seq') 47 | self.add_node(PartialSoftmax(input_dim=context_dims, output_dim=vocab_size), 48 | name='part_prob', inputs=('idxes', 'seq')) 49 | self.add_node(Dense(input_dim=context_dims, 
output_dim=1, activation='exponential'), 50 | name='normalizer', inputs='seq') 51 | self.add_node(LookupProb(negprob_table), name='lookup_prob', inputs='idxes') 52 | 53 | test_node = Dense(input_dim=context_dims, output_dim=vocab_size, activation='exponential') 54 | test_node.params = [] 55 | test_node.W = self.nodes['part_prob'].W 56 | test_node.b = self.nodes['part_prob'].b 57 | self.add_node(test_node, name='true_unrm_prob', inputs='seq') 58 | # self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob') 59 | 60 | self.add_output('pos_prob', node='part_prob') 61 | self.add_output('neg_prob', node='lookup_prob') 62 | # self.add_output('pred_prob', node='true_prob') 63 | self.add_output('normalizer', node='normalizer') 64 | self.add_output('unrm_prob', node='true_unrm_prob') 65 | 66 | # noinspection PyMethodOverriding 67 | def compile(self): 68 | pos_prob_layer = self.outputs['pos_prob'] 69 | neg_prob_layer = self.outputs['neg_prob'] 70 | # pre_prob_layer = self.outputs['pred_prob'] 71 | normlzer_layer = self.outputs['normalizer'] 72 | unrm_pro_layer = self.outputs['unrm_prob'] 73 | 74 | 75 | pos_prob_trn = pos_prob_layer.get_output(train=True) 76 | neg_prob_trn = neg_prob_layer.get_output(train=True) * self.nb_negative 77 | pos_prob_tst = pos_prob_layer.get_output(train=False) 78 | neg_prob_tst = neg_prob_layer.get_output(train=False) * self.nb_negative 79 | # pre_prob_tst = pre_prob_layer.get_output(train=False) 80 | unrm_pro_tst = unrm_pro_layer.get_output(train=False) 81 | 82 | nrm_const = normlzer_layer.get_output(train=True) 83 | nrm_const = T.reshape(nrm_const, (nrm_const.shape[0], nrm_const.shape[1])) 84 | nrm_const = nrm_const.dimshuffle('x', 0, 1) 85 | pos_prob_trn *= nrm_const 86 | 87 | nrm_const_tst_ = normlzer_layer.get_output(train=False) 88 | nrm_const_tst = T.reshape(nrm_const_tst_, (nrm_const_tst_.shape[0], nrm_const_tst_.shape[1])) 89 | nrm_const_tst = nrm_const_tst.dimshuffle('x', 0, 1) 90 | pos_prob_tst *= nrm_const_tst 91 | 92 | true_nrm = T.sum(unrm_pro_tst, axis=-1, keepdims=True) 93 | pre_prob_tst = unrm_pro_tst / true_nrm 94 | 95 | unrm_pro_tst *= T.addbroadcast(nrm_const_tst_, 2) 96 | partition = T.sum(unrm_pro_tst, axis=-1) 97 | sum_unrm = T.sum(partition) 98 | squre_urm = T.sum(partition * partition) 99 | 100 | eps = 1.0e-37 101 | z = 1./(nrm_const_tst.ravel() + eps) 102 | z_pred = T.sum(z) 103 | z_true = T.sum(true_nrm.ravel()) 104 | z_err = T.sum(T.abs_(z - true_nrm.ravel())) 105 | z_sq = T.sum(true_nrm * true_nrm) 106 | 107 | #TODO: mask not supported here 108 | nb_words = pos_prob_trn[0].size.astype(theano.config.floatX) 109 | sum_pos_neg_trn = pos_prob_trn + neg_prob_trn 110 | sum_pos_neg_tst = pos_prob_tst + neg_prob_tst 111 | y_train = T.sum(T.log(eps + pos_prob_trn[0] / sum_pos_neg_trn[0])) / nb_words 112 | y_train += T.sum(T.log(eps + neg_prob_trn[1:] / sum_pos_neg_trn[1:])) / nb_words 113 | y_test = T.sum(T.log(eps + pos_prob_tst[0] / sum_pos_neg_tst[0])) / nb_words 114 | y_test += T.sum(T.log(eps + neg_prob_tst[1:] / sum_pos_neg_tst[1:])) / nb_words 115 | 116 | true_labels = self.inputs['idxes'].get_output()[0] 117 | encode_len, nb_words = self.encode_length(true_labels, pre_prob_tst) 118 | 119 | train_loss = -y_train 120 | test_loss = -y_test 121 | for r in self.regularizers: 122 | train_loss = r(train_loss) 123 | updates = self.optimizer.get_updates(self.params, self.constraints, train_loss) 124 | updates += self.updates 125 | 126 | self._train = theano.function([self.inputs['idxes'].get_output(True)], 
outputs=train_loss, 127 | updates=updates) 128 | self._test = theano.function([self.inputs['idxes'].get_output(False)], 129 | outputs=[test_loss, encode_len, nb_words, sum_unrm, 130 | squre_urm, z_pred, z_true, z_err, z_sq]) 131 | 132 | self._train.out_labels = ('loss', ) 133 | self._test.out_labels = ('loss', 'encode_len', 'nb_words', 'unrm', 'square_unrm', 134 | 'z_pred', 'z_true', 'z_err', 'z_sq') 135 | self.all_metrics = ['loss', 'val_loss', 'val_ppl'] 136 | 137 | def __summarize_outputs(outs, batch_sizes): 138 | """ 139 | :param outs: outputs of the _test* function. It is a list, and each element a list of 140 | values of the outputs of the _test* function on corresponding batch. 141 | :type outs: list 142 | :param batch_sizes: batch sizes. A list with the same length with outs. Each element 143 | is a size of corresponding batch. 144 | :type batch_sizes: list 145 | Aggregate outputs of batches as if the test function evaluates 146 | the metric values on the union of the batches. 147 | Note this function must be redefined for each specific problem 148 | """ 149 | out = np.array(outs, dtype=theano.config.floatX) 150 | loss, encode_len, nb_words, unrm, sq_urm, z_pred, z_true, z_err, z_sq = out 151 | batch_size = np.array(batch_sizes, dtype=theano.config.floatX) 152 | 153 | smry_loss = np.sum(loss * batch_size)/batch_size.sum() 154 | smry_encode_len = encode_len.sum() 155 | smry_nb_words = nb_words.sum() 156 | smry_sum_urm = unrm.sum() 157 | smry_sq_urm = sq_urm.sum() 158 | smry_z_pred = z_pred.sum() 159 | smry_z_true = z_true.sum() 160 | smry_z_err = z_err.sum() 161 | smry_z_sq = z_sq.sum() 162 | return [smry_loss, smry_encode_len, smry_nb_words, smry_sum_urm, 163 | smry_sq_urm, smry_z_pred, smry_z_true, smry_z_err, smry_z_sq] 164 | 165 | self._test.summarize_outputs = __summarize_outputs 166 | 167 | def negative_sample(self, X, order=0): 168 | if order == 0: 169 | ret = np.empty(shape=(self.nb_negative+1,) + X.shape, dtype=X.dtype) 170 | ret[0] = X 171 | ret[1:] = self.sampler.sample(shape=ret[1:].shape) 172 | else: 173 | raise NotImplementedError('Only support order=0 now') 174 | return ret 175 | 176 | def _loop_train(self, data, batch_size): 177 | nb = data.shape[1] 178 | nb_words = data[0].size 179 | loss = 0.0 180 | for start in xrange(0, nb, batch_size): 181 | end = start + batch_size 182 | ins = data[:, start:end] 183 | loss_ = self._train(ins) 184 | loss += loss_ * ins[0].size 185 | 186 | loss /= nb_words 187 | return loss 188 | 189 | def train(self, data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=None, 190 | batch_size=256, train_nb_words=100000000, val_nb_words=100000, train_val_nb=100000, 191 | validation_interval=1800, log_file=None): 192 | opt_info = self.optimizer.get_config() 193 | opt_info = ', '.join(["{}: {}".format(n, v) for n, v in opt_info.items()]) 194 | 195 | logger.info('training with file: %s' % data_file) 196 | logger.info('training with batch size %d' % batch_size) 197 | logger.info('training with %d words; validate with %d words during training; ' 198 | 'evaluate with %d words after training' % (train_nb_words, train_val_nb, val_nb_words)) 199 | logger.info('validate every %f seconds' % float(validation_interval)) 200 | logger.info('optimizer: %s' % opt_info) 201 | 202 | log_file = LogInfo(log_file) 203 | log_file.info('training with file: %s' % data_file) 204 | log_file.info('training with batch size %d' % batch_size) 205 | log_file.info('training with %d words; validate with %d words during training; ' 206 | 'evaluate with %d words after 
training' % (train_nb_words, train_val_nb, val_nb_words)) 207 | log_file.info('validate every %f seconds' % float(validation_interval)) 208 | log_file.info('optimizer: %s' % opt_info) 209 | 210 | sentences = [None for _ in range(MAX_SETN_LEN)] # TODO: sentences longer than 64 are ignored. 211 | 212 | max_vocab = self.vocab_size - 1 213 | nb_trained = 0. 214 | nb_words_trained = 0.0 215 | sent_gen = grouped_sentences(data_file) 216 | val_sents = self.get_val_data(sent_gen, val_nb_words) 217 | train_val_sents = self.get_val_data(sent_gen, train_val_nb) 218 | 219 | self.validation(train_val_sents, batch_size, log_file) 220 | start_ = time() 221 | next_val_time = start_ + validation_interval 222 | for sents in sent_gen: 223 | mask = (sents > max_vocab) 224 | sents[mask] = max_vocab 225 | chunk = chunk_sentences(sentences, sents, batch_size) 226 | if chunk is None: 227 | continue 228 | 229 | # loss, ce, nb_wrd = self._train(chunk, chunk) 230 | x = self.negative_sample(chunk) 231 | loss = self._loop_train(x, batch_size) 232 | nb_trained += chunk.shape[0] 233 | nb_words_trained += chunk.size 234 | end_ = time() 235 | elapsed = float(end_ - start_) 236 | speed1 = nb_trained/elapsed 237 | speed2 = nb_words_trained/elapsed 238 | eta = (train_nb_words - nb_words_trained) / speed2 239 | eta_h = int(math.floor(eta/3600)) 240 | eta_m = int(math.ceil((eta - eta_h * 3600)/60.)) 241 | logger.info('%s:Train - ETA: %02d:%02d - loss: %5.1f - speed: %.1f sent/s %.1f words/s' % 242 | (self.__class__.__name__, eta_h, eta_m, loss, speed1, speed2)) 243 | log_file.info('%s:Train - time: %f - loss: %.6f' % (self.__class__.__name__, end_, loss)) 244 | 245 | if end_ > next_val_time: 246 | # noinspection PyUnresolvedReferences 247 | self.validation(train_val_sents, batch_size, log_file) 248 | next_val_time = time() + validation_interval 249 | 250 | if nb_words_trained >= train_nb_words: 251 | logger.info('Training finished. Evaluating ...') 252 | log_file.info('Training finished. Evaluating ...') 253 | self.validation(val_sents, batch_size, log_file) 254 | if save_path is not None: 255 | self.save_params(save_path) 256 | break 257 | log_file.close() 258 | 259 | def validation(self, val_sents, batch_size, log_file=None): 260 | """ 261 | :param val_sents: validation sentences. 262 | :type val_sents: a list, each element a ndarray 263 | :return: tuple 264 | """ 265 | code_len = 0. 266 | nb_words = 0. 
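# accumulators over the validation batches: besides loss, perplexity and the
# partition statistics, this variant also compares the learned normalizer
# (z_pred) against the true partition (z_true) and records their absolute error (z_err)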
267 | loss = 0.0 268 | unrm = 0.0 269 | sq_unrm = 0.0 270 | z_pred = 0.0 271 | z_true = 0.0 272 | z_err = 0.0 273 | z_sq = 0.0 274 | 275 | for sents in val_sents: 276 | x = [self.negative_sample(sents)] 277 | loss_, code_len_, nb_words_, unrm_, sq_unrm_, z_pred_, z_true_, z_err_, z_sq_ = \ 278 | self._test_loop(self._test, x, batch_size) 279 | nb_words += nb_words_ 280 | code_len += code_len_ 281 | loss += loss_ * nb_words_ 282 | unrm += unrm_ 283 | sq_unrm += sq_unrm_ 284 | z_pred += z_pred_ 285 | z_true += z_true_ 286 | z_err += z_err_ 287 | z_sq += z_sq_ 288 | 289 | loss /= nb_words 290 | ppl = math.exp(code_len/nb_words) 291 | mean_unrm = unrm / nb_words 292 | mean_sq_unrm = sq_unrm / nb_words 293 | std_unrm = mean_sq_unrm - mean_unrm * mean_unrm 294 | z_pred /= nb_words 295 | z_true /= nb_words 296 | z_err /= nb_words 297 | mean_sq_z = z_sq / nb_words 298 | std_z = mean_sq_z - z_true * z_true 299 | logger.info('%s:Val val_loss: %.2f - val_ppl: %.2f - partition: mean: %.2f std: %.2f - ' 300 | 'z: pred: %.2f true: %.2f err: %.2f std: %.2f' % 301 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm, z_pred, z_true, z_err, std_z)) 302 | log_file.info('%s:Val val_loss: %.6f - val_ppl: %.6f - partition: mean: %.6f std: %.6f - ' 303 | 'z: pred: %.6f true: %.6f err: %.6f std: %.6f' % 304 | (self.__class__.__name__, loss, ppl, mean_sq_unrm, std_unrm, z_pred, z_true, z_err, std_z)) 305 | 306 | return loss, ppl, mean_unrm, std_unrm, z_pred, z_true, z_err 307 | 308 | @staticmethod 309 | def _test_loop(f, ins, batch_size=128, verbose=0): 310 | nb_sample = ins[0].shape[1] 311 | outs = [[] for _ in range(f.n_returned_outputs)] 312 | batch_info = [] 313 | batches = make_batches(nb_sample, batch_size) 314 | for batch_index, (batch_start, batch_end) in enumerate(batches): 315 | ins_batch = slice_X(ins, start_=batch_start, end_=batch_end, axis=1) 316 | batch_outs = f(*ins_batch) 317 | for idx, v in enumerate(batch_outs): 318 | outs[idx].append(v) 319 | batch_info.append(batch_end - batch_start) 320 | 321 | outs = f.summarize_outputs(outs, batch_info) 322 | return outs 323 | 324 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 325 | NB_RUN_WORDS = 100000000 326 | NB_VOCAB = 10000 327 | NB_RUN_VAL = 100000 328 | NB_EVALUATE = 5000000 329 | BATCH_SIZE = 256 330 | 331 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 332 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 333 | help="learning rate") 334 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 335 | help="amount of training data (number of words)") 336 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 337 | help="vocabulary size") 338 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 339 | help="running validation words") 340 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 341 | help="running validation words") 342 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 343 | help="decaying rate") 344 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 345 | help="decaying rate") 346 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 347 | help="decay lr or not") 348 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 349 | help="amount of training data (number of words)") 350 | parser.add_option("-C", 
"--context-size", type="int", dest="context_size", default=128, 351 | help="amount of training data (number of words)") 352 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 353 | help="amount of training data (number of words)") 354 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 355 | help="amount of training data (number of words)") 356 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 357 | help="decaying rate") 358 | parser.add_option("-s", "--save", type="str", dest="save", default='', 359 | help="amount of training data (number of words)") 360 | options, args = parser.parse_args() 361 | 362 | nb_run_words = options.running_words 363 | nb_vocab = options.vocab_size 364 | nb_run_val = options.val_run 365 | nb_evaluate = options.nb_evaluation 366 | 367 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 368 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 369 | nb_vocab) 370 | if options.decay: 371 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 372 | else: 373 | opt = adam(lr=options.lr) 374 | 375 | if options.log_file == '': 376 | log_file = None 377 | else: 378 | log_file = options.log_file 379 | 380 | if options.save == '': 381 | save_path = None 382 | else: 383 | save_path = options.save 384 | 385 | model = NCELangModelV2(vocab_size=nb_vocab, nb_negative=options.negative, 386 | embed_dims=options.embed_size, context_dims=options.context_size, 387 | negprob_table=unigram_table, optimizer=opt) 388 | model.compile() 389 | model.train(data_file=DATA_PATH, 390 | save_path=save_path, 391 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 392 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 393 | validation_interval=options.interval, log_file=log_file) 394 | -------------------------------------------------------------------------------- /real/main_lblv1-pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv1.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=5 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.02' 19 | lr_min='0.002' 20 | gamma='0.03' 21 | for ((nb_vocab=10000; nb_vocab<32000; nb_vocab+=2000)); do 22 | log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 23 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} --lr-min=${lr_min} \ 25 | -d --gamma=${gamma} \ 26 | --log-file $log_file \ 27 | -D $data_file -V $nb_vocab " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 40 31 | done 32 | 33 | 34 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 35 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 36 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 37 | # --lr=${lr} --lr-min=${lr_min} \ 38 | # -d --gamma=${gamma} \ 39 | # --log-file $log_file \ 40 | # -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" 
& 43 | # sleep 40 44 | #done 45 | 46 | # test different lr: 47 | #lr_min='0.002' 48 | #gamma='0.003' 49 | #nb_neg=50 50 | #nb_vocab=30000 51 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 52 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 53 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 54 | # --lr=${lr} --lr-min=${lr_min} \ 55 | # -d --gamma=${gamma} \ 56 | # --log-file $log_file \ 57 | # -D $data_file -V $nb_vocab " 58 | # command_line=`echo "$command_line_" | tr -s " "` 59 | # ${command_prefix} nohup sh -c "$command_line" & 60 | # sleep 40 61 | #done 62 | 63 | # 64 | #lr='0.01' 65 | #for gamma in 0.001 0.002 0.004; do 66 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 67 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 68 | # --lr=${lr} --lr-min=${lr_min} \ 69 | # -d --gamma=${gamma} \ 70 | # --log-file $log_file \ 71 | # -D $data_file -V $nb_vocab " 72 | # command_line=`echo "$command_line_" | tr -s " "` 73 | # ${command_prefix} nohup sh -c "$command_line" & 74 | # sleep 40 75 | #done 76 | -------------------------------------------------------------------------------- /real/main_lblv1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import optparse 5 | from keras.optimizers import adam, AdamAnneal 6 | from models import LBLangModelV1 7 | # noinspection PyUnresolvedReferences 8 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 512 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 24 | help="running validation words") 25 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 26 | help="running validation words") 27 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 28 | help="decaying rate") 29 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 30 | help="decaying rate") 31 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 32 | help="decay lr or not") 33 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=5, 34 | help="amount of training data (number of words)") 35 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=200, 36 | help="amount of training data (number of words)") 37 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 38 | help="amount of training data (number of words)") 39 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=900., 40 | help="decaying rate") 41 | parser.add_option("-s", "--save", type="str", dest="save", default='', 42 | help="amount of training 
data (number of words)") 43 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 44 | help="Number of vocabulary") 45 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 46 | help="binarized corpus file") 47 | options, args = parser.parse_args() 48 | 49 | nb_run_words = options.running_words 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | if options.decay: 54 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 55 | else: 56 | opt = adam(lr=options.lr) 57 | 58 | if options.log_file == '': 59 | log_file = None 60 | else: 61 | log_file = options.log_file 62 | 63 | if options.save == '': 64 | save_path = None 65 | else: 66 | save_path = options.save 67 | 68 | model = LBLangModelV1(vocab_size=options.nb_vocab, 69 | context_size=options.context_size, 70 | embed_dims=options.embed_size) 71 | model.compile(opt) 72 | model.train(data_file=options.corpus, 73 | save_path=save_path, 74 | batch_size=BATCH_SIZE, 75 | train_nb_words=nb_run_words, 76 | val_nb_words=nb_evaluate, 77 | train_val_nb=nb_run_val, 78 | validation_interval=options.interval, 79 | log_file=log_file) -------------------------------------------------------------------------------- /real/main_lblv1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv1.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=5 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.02' 19 | lr_min='0.002' 20 | gamma='0.03' 21 | for ((nb_vocab=10000; nb_vocab<32000; nb_vocab+=2000)); do 22 | log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 23 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} --lr-min=${lr_min} \ 25 | -d --gamma=${gamma} \ 26 | --log-file $log_file \ 27 | -D $data_file -V $nb_vocab " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 40 31 | done 32 | 33 | 34 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 35 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 36 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 37 | # --lr=${lr} --lr-min=${lr_min} \ 38 | # -d --gamma=${gamma} \ 39 | # --log-file $log_file \ 40 | # -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" & 43 | # sleep 40 44 | #done 45 | 46 | # test different lr: 47 | #lr_min='0.002' 48 | #gamma='0.003' 49 | #nb_neg=50 50 | #nb_vocab=30000 51 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 52 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 53 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 54 | # --lr=${lr} --lr-min=${lr_min} \ 55 | # -d --gamma=${gamma} \ 56 | # --log-file $log_file \ 57 | # -D $data_file -V $nb_vocab " 58 | # command_line=`echo "$command_line_" | tr -s " "` 59 | # ${command_prefix} nohup sh -c "$command_line" & 60 | # sleep 40 61 | 
#done 62 | 63 | # 64 | #lr='0.01' 65 | #for gamma in 0.001 0.002 0.004; do 66 | # log_file="../logs/main-lblv1-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}.log" 67 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 68 | # --lr=${lr} --lr-min=${lr_min} \ 69 | # -d --gamma=${gamma} \ 70 | # --log-file $log_file \ 71 | # -D $data_file -V $nb_vocab " 72 | # command_line=`echo "$command_line_" | tr -s " "` 73 | # ${command_prefix} nohup sh -c "$command_line" & 74 | # sleep 40 75 | #done 76 | -------------------------------------------------------------------------------- /real/main_lblv2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import LBLangModelV2, logger 9 | import cPickle as pickle 10 | import numpy as np 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 512 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=5, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=200, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=900., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of 
vocabulary") 55 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 56 | help="binarized corpus file") 57 | parser.add_option("-w", "--nb-workers", type="int", dest="nb_workers", default=3, 58 | help="number of data workers") 59 | options, args = parser.parse_args() 60 | 61 | nb_run_words = options.running_words 62 | nb_run_val = options.val_run 63 | nb_evaluate = options.nb_evaluation 64 | embedding_file = options.embedding_file 65 | 66 | with file(options.coding_file, 'rb') as f: 67 | sparse_coding = pickle.load(f) 68 | # print sparse_coding.dtype 69 | 70 | nb_vocab = options.nb_vocab 71 | sparse_coding = sparse_coding[nb_vocab//1000] 72 | nb_vocab, nb_base = sparse_coding.shape 73 | nb_base -= 1 74 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 75 | 76 | if embedding_file != '': 77 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 78 | wp = pickle.load(f) 79 | freq = wp['idx2wc'] 80 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 81 | embed = compose_dense_repr(nb_base, nb_vocab, freq, embedding_file) 82 | embed = np.vstack([embed, np.zeros((options.context_size, options.embed_size))]) 83 | ini_embeds = [embed] 84 | else: 85 | ini_embeds = None 86 | 87 | if options.decay: 88 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 89 | else: 90 | opt = adam(lr=options.lr) 91 | 92 | if options.log_file == '': 93 | log_file = None 94 | else: 95 | log_file = options.log_file 96 | 97 | if options.save == '': 98 | save_path = None 99 | else: 100 | save_path = options.save 101 | 102 | model = LBLangModelV2(sparse_coding=sparse_coding, 103 | context_size=options.context_size, 104 | nb_negative=options.negative, 105 | embed_dims=options.embed_size, 106 | init_embeddings=ini_embeds, 107 | negprob_table=unigram_table, 108 | optimizer=opt) 109 | model.compile() 110 | model.train(data_file=options.corpus, 111 | save_path=save_path, 112 | batch_size=BATCH_SIZE, 113 | train_nb_words=nb_run_words, 114 | val_nb_words=nb_evaluate, 115 | train_val_nb=nb_run_val, 116 | validation_interval=options.interval, 117 | log_file=log_file, 118 | nb_data_workers=options.nb_workers) -------------------------------------------------------------------------------- /real/main_lblv2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_lblv2.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=5 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.004' 21 | lr_min='0.002' 22 | gamma='0.03' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | 
command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | sleep 40 34 | done 35 | 36 | 37 | #for ((nb_vocab=30000; nb_vocab<=50000; nb_vocab+=2000)); do 38 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 39 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 40 | # --lr=${lr} --lr-min=${lr_min} \ 41 | # -d --gamma=${gamma} -N ${nb_neg} \ 42 | # -S $coding_file -e $embed_file --log-file $log_file \ 43 | # -D $data_file -V $nb_vocab " 44 | # command_line=`echo "$command_line_" | tr -s " "` 45 | # ${command_prefix} nohup sh -c "$command_line" & 46 | # sleep 40 47 | #done 48 | 49 | # test different lr: 50 | #lr_min='0.002' 51 | #gamma='0.003' 52 | #nb_neg=50 53 | #nb_vocab=30000 54 | #for lr in 0.04 0.03 0.02; do #0.01 0.008 0.006; do 55 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 56 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 57 | # --lr=${lr} --lr-min=${lr_min} \ 58 | # -d --gamma=${gamma} -N ${nb_neg} \ 59 | # -S $coding_file -e $embed_file --log-file $log_file \ 60 | # -D $data_file -V $nb_vocab " 61 | # command_line=`echo "$command_line_" | tr -s " "` 62 | # ${command_prefix} nohup sh -c "$command_line" & 63 | # sleep 40 64 | #done 65 | 66 | # 67 | #lr='0.01' 68 | #for gamma in 0.001 0.002 0.004; do 69 | # log_file="../logs/main-lblv2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 70 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 71 | # --lr=${lr} --lr-min=${lr_min} \ 72 | # -d --gamma=${gamma} -N ${nb_neg} \ 73 | # -S $coding_file -e $embed_file --log-file $log_file \ 74 | # -D $data_file -V $nb_vocab " 75 | # command_line=`echo "$command_line_" | tr -s " "` 76 | # ${command_prefix} nohup sh -c "$command_line" & 77 | # sleep 40 78 | #done 79 | -------------------------------------------------------------------------------- /real/main_nce2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal, adam 8 | import optparse 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | 
help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 47 | help="binarized corpus file") 48 | options, args = parser.parse_args() 49 | 50 | nb_run_words = options.running_words 51 | nb_vocab = options.vocab_size 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 56 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 57 | nb_vocab) 58 | if options.decay: 59 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 60 | else: 61 | opt = adam(lr=options.lr) 62 | 63 | if options.log_file == '': 64 | log_file = None 65 | else: 66 | log_file = options.log_file 67 | 68 | if options.save == '': 69 | save_path = None 70 | else: 71 | save_path = options.save 72 | 73 | model = NCELangModelV2(vocab_size=nb_vocab, 74 | nb_negative=options.negative, 75 | embed_dims=options.embed_size, 76 | context_dims=options.context_size, 77 | negprob_table=unigram_table, 78 | optimizer=opt) 79 | model.compile() 80 | model.train(data_file=options.corpus, 81 | save_path=save_path, 82 | batch_size=BATCH_SIZE, 83 | train_nb_words=nb_run_words, 84 | val_nb_words=nb_evaluate, 85 | train_val_nb=nb_run_val, 86 | validation_interval=options.interval, 87 | log_file=log_file) 88 | 89 | -------------------------------------------------------------------------------- /real/main_nce2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../../lm:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce2.py" 13 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 14 | context_size=200 15 | embed_size=200 16 | 17 | # test different vocab size 18 | lr='0.01' 19 | lr_min='0.002' 20 | gamma='0.003' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 26 | --log-file $log_file -D $data_file -V $nb_vocab " 27 | command_line=`echo "$command_line_" | tr -s " "` 28 | ${command_prefix} nohup sh -c "$command_line" & 29 | done 30 | 31 | ## test different lr: 32 | #lr_min='0.002' 33 | #gamma='0.003' 34 
| #nb_neg=50 35 | #nb_vocab=30000 36 | #for lr in 0.04 0.03 0.02 0.01 0.008 0.006; do 37 | # log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 38 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 39 | # --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 40 | # --log-file $log_file -D $data_file -V $nb_vocab " 41 | # command_line=`echo "$command_line_" | tr -s " "` 42 | # ${command_prefix} nohup sh -c "$command_line" & 43 | #done 44 | ## 45 | #lr='0.01' 46 | #for gamma in 0.001 0.002 0.004; do 47 | # log_file="../logs/main-nce2-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 48 | # command_line_="$python_command -C ${context_size} -E ${embed_size} \ 49 | # --lr=${lr} --lr-min=${lr_min} -d --gamma=${gamma} -N ${nb_neg} \ 50 | # --log-file $log_file -D $data_file -V $nb_vocab " 51 | # command_line=`echo "$command_line_" | tr -s " "` 52 | # ${command_prefix} nohup sh -c "$command_line" & 53 | #done 54 | -------------------------------------------------------------------------------- /real/main_nce4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV4, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of 
words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV4(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce4.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.01' 21 | lr_min='0.002' 22 | gamma='0.003' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E 
${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | done 34 | 35 | ## test different lr: 36 | #lr_min='0.002' 37 | #gamma='0.003' 38 | #nb_neg=50 39 | #nb_vocab=30000 40 | #for lr in 0.04 0.03 0.02 0.01 0.008 0.006; do 41 | # log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 42 | # $command_prefix $python_command -C ${context_size} -E ${embed_size} \ 43 | # --lr=${lr} --lr-min=${lr_min} \ 44 | # -d --gamma=${gamma} -N ${nb_neg} \ 45 | # -S $coding_file -e $embed_file --log-file $log_file \ 46 | # -D $data_file -V $nb_vocab " 47 | #done 48 | # 49 | #lr='0.01' 50 | #for gamma in 0.001 0.002 0.004; do 51 | # log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-N${nb_neg}.log" 52 | # $command_prefix $python_command -C ${context_size} -E ${embed_size} \ 53 | # --lr=${lr} --lr-min=${lr_min} \ 54 | # -d --gamma=${gamma} -N ${nb_neg} \ 55 | # -S $coding_file -e $embed_file --log-file $log_file \ 56 | # -D $data_file -V $nb_vocab \ 57 | # $command_postfix 58 | #done 59 | -------------------------------------------------------------------------------- /real/main_nce4_lab_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=32000; nb_vocab<36000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce4_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=16000; nb_vocab<30000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | 
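# Assemble the training command: -S passes the sparse-coding pickle, -e the pretrained
# embeddings used to initialize the model, and the fixed lr of 0.002 is used as-is
# (no -d flag, so no learning-rate annealing).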
command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku1_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=40000; nb_vocab<44000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce4_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=30000; nb_vocab<44000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | 
data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=44000; nb_vocab<52000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done -------------------------------------------------------------------------------- /real/main_nce4_pku3_proxy_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce4.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | lr='0.002' 20 | nb_neg=50 21 | for ((nb_vocab=36000; nb_vocab<40000; nb_vocab+=2000)); do 22 | log_file="../logs/main-nce4-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 23 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 24 | --lr=${lr} \ 25 | -N ${nb_neg} \ 26 | -S $coding_file -e $embed_file --log-file $log_file \ 27 | -D $data_file " 28 | command_line=`echo "$command_line_" | tr -s " "` 29 | ${command_prefix} nohup sh -c "$command_line" & 30 | sleep 80 31 | done 32 | -------------------------------------------------------------------------------- /real/main_nce7.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV7, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", 
dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV7(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce7.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" 
]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.006' 21 | lr_min='0.002' 22 | gamma='0.003' 23 | nb_neg=50 24 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 25 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-lr_min${lr_min}-g${gamma}-V${nb_vocab}-N${nb_neg}.log" 26 | command_line_="$python_command -C ${context_size} -E ${embed_size} \ 27 | --lr=${lr} --lr-min=${lr_min} \ 28 | -d --gamma=${gamma} -N ${nb_neg} \ 29 | -S $coding_file -e $embed_file --log-file $log_file \ 30 | -D $data_file -V $nb_vocab " 31 | command_line=`echo "$command_line_" | tr -s " "` 32 | ${command_prefix} nohup sh -c "$command_line" & 33 | sleep 120 34 | done 35 | -------------------------------------------------------------------------------- /real/main_nce7_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=16000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce7_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=30000; nb_vocab<44000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N 
${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce7_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce7.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=44000; nb_vocab<52000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce7-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | -------------------------------------------------------------------------------- /real/main_nce8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV8, logger 9 | import cPickle as pickle 10 | import sys 11 | # noinspection PyUnresolvedReferences 12 | from SparseEmbed.cu_gen_sparse import compose_dense_repr 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | EMBED_FILE = '../data/models/embeddings/rw2vec_embeddings-size200.pkl' 16 | NB_RUN_WORDS = 100000000 17 | NB_VOCAB = 10000 18 | NB_RUN_VAL = 100000 19 | NB_EVALUATE = 5000000 20 | BATCH_SIZE = 256 21 | 22 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 23 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 24 | help="learning rate") 25 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 26 | help="amount of training data (number of words)") 27 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 28 | help="sparse coding file (pickle)") 29 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", default='', 30 | help="initial embedding file (pickle)") 31 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 32 | help="running validation words") 33 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 34 | help="running validation words") 35 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 36 | help="decaying rate") 37 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 38 | help="decaying rate") 39 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 
40 | help="decay lr or not") 41 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 44 | help="amount of training data (number of words)") 45 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 46 | help="amount of training data (number of words)") 47 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 48 | help="amount of training data (number of words)") 49 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 50 | help="decaying rate") 51 | parser.add_option("-s", "--save", type="str", dest="save", default='', 52 | help="amount of training data (number of words)") 53 | parser.add_option("-V", "--nb-vocab", type="int", dest="nb_vocab", default=30000, 54 | help="Number of vocabulary") 55 | 56 | parser.add_option("-D", "--corpus", type="str", dest="corpus", default=DATA_PATH, 57 | help="binarized corpus file") 58 | options, args = parser.parse_args() 59 | 60 | nb_run_words = options.running_words 61 | nb_run_val = options.val_run 62 | nb_evaluate = options.nb_evaluation 63 | embedding_file = options.embedding_file 64 | 65 | with file(options.coding_file, 'rb') as f: 66 | sparse_coding = pickle.load(f) 67 | # print sparse_coding.dtype 68 | 69 | nb_vocab = options.nb_vocab 70 | sparse_coding = sparse_coding[nb_vocab//1000] 71 | nb_vocab, nb_base = sparse_coding.shape 72 | nb_base -= 1 73 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 74 | 75 | if embedding_file != '': 76 | with file('../data/wiki-wordmap-trunc300k.wp', 'rb') as f: 77 | wp = pickle.load(f) 78 | freq = wp['idx2wc'] 79 | logger.info('Using word2vec to initialize word embeddings %s ' % embedding_file) 80 | ini_embeds = [compose_dense_repr(nb_base, nb_vocab, freq, embedding_file)] 81 | else: 82 | ini_embeds = None 83 | 84 | if options.decay: 85 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 86 | else: 87 | opt = adam(lr=options.lr) 88 | 89 | if options.log_file == '': 90 | log_file = None 91 | else: 92 | log_file = options.log_file 93 | 94 | if options.save == '': 95 | save_path = None 96 | else: 97 | save_path = options.save 98 | 99 | model = NCELangModelV8(sparse_coding=sparse_coding, nb_negative=options.negative, 100 | embed_dims=options.embed_size, context_dims=options.context_size, 101 | init_embeddings=ini_embeds, negprob_table=unigram_table, optimizer=opt) 102 | model.compile() 103 | model.train(data_file=options.corpus, 104 | save_path=save_path, 105 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 106 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 107 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/main_nce8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | 
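# The *-sample.bz2 file used below is presumably a subsample of the full
# wiki-sg-norm-lc-drop-bin.bz2 corpus that main_nce8.py falls back to by default.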
data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<30000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_nodecay_lab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=10000; nb_vocab<16000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr}\ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} sh -c "$command_line" & 31 | sleep 120 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=38000; nb_vocab<46000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 
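# Dry-run mode: prefix each launch with echo so the assembled commands are printed, not run.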
3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=16000; nb_vocab<38000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/main_nce8_pku3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ "x$1" = "x--dry-run" ]; then 3 | command_prefix="echo " 4 | else 5 | command_prefix= 6 | fi 7 | 8 | export PYTHONPATH="${PWD}/../..:${PYTHONPATH}" 9 | export OMP_NUM_THREADS=2 10 | export MKL_NUM_THREADS=2 11 | 12 | python_command="python main_nce8.py" 13 | coding_file="../data/sparse/total-app-a0.1-b0.1-w1-0.1-15000.pkl" 14 | embed_file="../data/models/embeddings/rw2vec_embeddings-size200.pkl" 15 | data_file="../data/corpus/wiki-sg-norm-lc-drop-bin-sample.bz2" 16 | context_size=200 17 | embed_size=200 18 | 19 | # test different vocab size 20 | lr='0.002' 21 | nb_neg=50 22 | for ((nb_vocab=46000; nb_vocab<52000; nb_vocab+=2000)); do 23 | log_file="../logs/main-nce8-C${context_size}-E${embed_size}-lr${lr}-V${nb_vocab}-N${nb_neg}.log" 24 | command_line_="$python_command -V $nb_vocab -C ${context_size} -E ${embed_size} \ 25 | --lr=${lr} \ 26 | -N ${nb_neg} \ 27 | -S $coding_file -e $embed_file --log-file $log_file \ 28 | -D $data_file " 29 | command_line=`echo "$command_line_" | tr -s " "` 30 | ${command_prefix} nohup sh -c "$command_line" & 31 | sleep 80 32 | done 33 | 34 | -------------------------------------------------------------------------------- /real/run_batch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source ../environ.sh 3 | 4 | models_dir="../data/models/lang" 5 | log_dir="../logs" 6 | 7 | python run_nce0.py --lr 0.04 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.04.pkl \ 8 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.04.log 9 | python run_nce0.py --lr 0.02 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.02.pkl \ 10 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.02.log 11 | python run_nce0.py --lr 0.01 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.01.pkl \ 12 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.01.log 13 | python run_nce0.py --lr 0.005 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.005.pkl \ 14 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.005.log 15 | 16 | python run_nce0.py --lr 0.04 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.04-d.pkl \ 17 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.04-d.log -d --lr-min 0.005 
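# Continue the annealed sweep at lr 0.02 and 0.01 (same --lr-min floor of 0.005).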
18 | python run_nce0.py --lr 0.02 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.02-d.pkl \ 19 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.02-d.log -d --lr-min 0.005 20 | python run_nce0.py --lr 0.01 -C 128 -E 128 --save ${models_dir}/nce0-lstm-c128-e128-neg50-lr0.01-d.pkl \ 21 | --log-file ${log_dir}/nce0-lstm-c128-e128-neg50-lr0.01-d.log -d --lr-min 0.005 -------------------------------------------------------------------------------- /real/run_nce0.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModel 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | options, args = parser.parse_args() 47 | 48 | nb_run_words = options.running_words 49 | nb_vocab = options.vocab_size 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | # unigram_table = get_unigram_probtable(nb_words=nb_vocab) 54 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 55 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 56 | nb_vocab) 57 | 58 | if options.decay: 59 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 60 | else: 61 | opt = adam(lr=options.lr) 62 | 63 | if options.log_file == '': 64 | log_file = None 65 | else: 66 | log_file = options.log_file 67 | 
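# With -d, the custom AdamAnneal optimizer decays the learning rate from --lr toward --lr-min
# at rate --gamma; otherwise a fixed-rate adam is used. The NCELangModel constructed below is
# trained with noise-contrastive estimation, drawing nb_negative noise samples per target
# from the unigram table built above (presumably the empirical unigram distribution over the
# nb_vocab most frequent words), which serves as the noise distribution.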
68 | if options.save == '': 69 | save_path = None 70 | else: 71 | save_path = options.save 72 | 73 | model = NCELangModel(vocab_size=nb_vocab, nb_negative=options.negative, 74 | embed_dims=options.embed_size, context_dims=options.context_size, 75 | negprob_table=unigram_table, optimizer=opt) 76 | model.compile() 77 | model.train(data_file=DATA_PATH, 78 | save_path=save_path, 79 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 80 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 81 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce0_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 19 | 20 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 21 | negprob_table=unigram_table, optimizer='adam') 22 | model.compile() 23 | model.train(data_file=DATA_PATH, 24 | save_path=SAVE_PATH, 25 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 26 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg100_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = '../data/models/lang/nce0-neg100-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 19 | 20 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=100, embed_dims=128, context_dims=128, 21 | negprob_table=unigram_table, optimizer='adam') 22 | model.compile() 23 | # model.train(data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 24 | # save_path='../data/models/lang/nce-neg50-e128-c128.pkl', 25 | # batch_size=256, train_nb_words=NB_RUN_WORDS//100, 26 | # val_nb_words=NB_EVALUATE//10, train_val_nb=NB_RUN_VAL//5, validation_interval=40) 27 | model.train(data_file=DATA_PATH, save_path=SAVE_PATH, 28 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 29 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.005.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import adam 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = 
'../data/models/lang/nce0-neg50-e128-c128-lr0.005.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = adam(lr=0.005) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.01.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import adam 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = adam(lr=0.01) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce0_neg50_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModel 7 | from keras.optimizers import AdamAnneal 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce0-neg50-e128-c128-lr0.01-gamma0.001.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 22 | model = NCELangModel(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce1_neg50_default.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV1 7 | 8 | NB_RUN_WORDS = 100000000 9 | NB_VOCAB = 10000 10 | NB_RUN_VAL = 100000 11 | NB_EVALUATE = 5000000 12 | SAVE_PATH = 
'../data/models/lang/nce1-neg50-e128-c128.pkl' 13 | 14 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 15 | BATCH_SIZE = 256 16 | VAL_INTER = 1200 17 | 18 | # NB_RUN_WORDS = 5000000 19 | # NB_VOCAB = 10000 20 | # NB_RUN_VAL = 100000 21 | # NB_EVALUATE = 500000 22 | 23 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 24 | 25 | model = NCELangModelV1(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 26 | negprob_table=unigram_table, optimizer='adam') 27 | model.compile() 28 | # model.train(data_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 29 | # save_path='../data/models/lang/nce-neg50-e128-c128.pkl', 30 | # batch_size=256, train_nb_words=NB_RUN_WORDS//100, 31 | # val_nb_words=NB_EVALUATE//10, train_val_nb=NB_RUN_VAL//5, validation_interval=40) 32 | model.train(data_file=DATA_PATH, save_path=SAVE_PATH, 33 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 34 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal, adam 8 | import optparse 9 | 10 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 11 | NB_RUN_WORDS = 100000000 12 | NB_VOCAB = 10000 13 | NB_RUN_VAL = 100000 14 | NB_EVALUATE = 5000000 15 | BATCH_SIZE = 256 16 | 17 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 18 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 19 | help="learning rate") 20 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 21 | help="amount of training data (number of words)") 22 | parser.add_option("-V", "--vocab-size", type="int", dest="vocab_size", default=NB_VOCAB, 23 | help="vocabulary size") 24 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 25 | help="running validation words") 26 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 27 | help="running validation words") 28 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 29 | help="decaying rate") 30 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 31 | help="decaying rate") 32 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 33 | help="decay lr or not") 34 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 35 | help="amount of training data (number of words)") 36 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 37 | help="amount of training data (number of words)") 38 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 39 | help="amount of training data (number of words)") 40 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 41 | help="amount of training data (number of words)") 42 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 43 | help="decaying rate") 44 | parser.add_option("-s", "--save", type="str", dest="save", default='', 45 | help="amount of training data (number of words)") 46 | options, args = 
parser.parse_args() 47 | 48 | nb_run_words = options.running_words 49 | nb_vocab = options.vocab_size 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | # unigram_table = get_unigram_probtable(nb_words=nb_vocab) 54 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, 55 | save_path='../data/wiki-unigram-prob-size%d.pkl' % 56 | nb_vocab) 57 | if options.decay: 58 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 59 | else: 60 | opt = adam(lr=options.lr) 61 | 62 | if options.log_file == '': 63 | log_file = None 64 | else: 65 | log_file = options.log_file 66 | 67 | if options.save == '': 68 | save_path = None 69 | else: 70 | save_path = options.save 71 | 72 | model = NCELangModelV2(vocab_size=nb_vocab, nb_negative=options.negative, 73 | embed_dims=options.embed_size, context_dims=options.context_size, 74 | negprob_table=unigram_table, optimizer=opt) 75 | model.compile() 76 | model.train(data_file=DATA_PATH, 77 | save_path=save_path, 78 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 79 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 80 | validation_interval=options.interval, log_file=log_file) 81 | 82 | -------------------------------------------------------------------------------- /real/run_nce2_neg50_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | from models import NCELangModelV2 7 | from keras.optimizers import AdamAnneal 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | SAVE_PATH = '../data/models/lang/nce2-neg50-e128-c128-lr0.01-gamma0.001.pkl' 14 | 15 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 16 | BATCH_SIZE = 256 17 | VAL_INTER = 1200 18 | 19 | unigram_table = get_unigram_probtable(nb_words=NB_VOCAB) 20 | 21 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 22 | model = NCELangModelV2(vocab_size=NB_VOCAB, nb_negative=50, embed_dims=128, context_dims=128, 23 | negprob_table=unigram_table, optimizer=opt) 24 | model.compile() 25 | model.train(data_file=DATA_PATH, 26 | save_path=SAVE_PATH, 27 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 28 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/run_nce3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV3 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", 
"--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 44 | help="amount of training data (number of words)") 45 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 46 | help="decaying rate") 47 | parser.add_option("-s", "--save", type="str", dest="save", default='', 48 | help="amount of training data (number of words)") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | # print ini_embeds.dtype 67 | # print ini_embeds.shape 68 | # import sys 69 | # sys.exit(0) 70 | else: 71 | ini_embeds = None 72 | 73 | if options.decay: 74 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 75 | else: 76 | opt = adam(lr=options.lr) 77 | 78 | if options.log_file == '': 79 | log_file = None 80 | else: 81 | log_file = options.log_file 82 | 83 | if options.save == '': 84 | save_path = None 85 | else: 86 | save_path = options.save 87 | 88 | model = NCELangModelV3(sparse_coding=sparse_coding, nb_negative=options.negative, 89 | embed_dims=options.embed_size, context_dims=options.context_size, 90 | init_embeddings=[ini_embeds], negprob_table=unigram_table, optimizer=opt) 91 | model.compile() 92 | model.train(data_file=DATA_PATH, 93 | save_path=save_path, 94 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 95 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 96 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV4 9 | import cPickle as 
pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-C", "--context-size", type="int", dest="context_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 42 | help="amount of training data (number of words)") 43 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 44 | help="amount of training data (number of words)") 45 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 46 | help="decaying rate") 47 | parser.add_option("-s", "--save", type="str", dest="save", default='', 48 | help="amount of training data (number of words)") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | # print ini_embeds.dtype 67 | # print ini_embeds.shape 68 | # import sys 69 | # sys.exit(0) 70 | else: 71 | ini_embeds = None 72 | 73 | if options.decay: 74 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 75 | else: 76 | opt = adam(lr=options.lr) 77 | 78 | if options.log_file == '': 79 | log_file = None 80 | else: 81 | log_file = options.log_file 82 | 83 | if options.save == '': 84 | save_path = None 85 | else: 86 | save_path = options.save 87 | 88 | model = NCELangModelV4(sparse_coding=sparse_coding, nb_negative=options.negative, 89 | embed_dims=options.embed_size, context_dims=options.context_size, 90 | init_embeddings=[ini_embeds], negprob_table=unigram_table, optimizer=opt) 91 | model.compile() 92 | 
model.train(data_file=DATA_PATH, 93 | save_path=save_path, 94 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 95 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 96 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV5 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 42 | help="amount of training data (number of words)") 43 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 44 | help="decaying rate") 45 | parser.add_option("-s", "--save", type="str", dest="save", default='', 46 | help="amount of training data (number of words)") 47 | options, args = parser.parse_args() 48 | 49 | nb_run_words = options.running_words 50 | nb_run_val = options.val_run 51 | nb_evaluate = options.nb_evaluation 52 | 53 | 54 | with file(options.coding_file, 'rb') as f: 55 | sparse_coding = pickle.load(f) 56 | # print sparse_coding.dtype 57 | 58 | nb_vocab = sparse_coding.shape[0] 59 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 60 | 61 | if options.embedding_file != '': 62 | with file(options.embedding_file, 'rb') as f: 63 | ini_embeds = pickle.load(f) 64 | # print ini_embeds.dtype 65 | # print ini_embeds.shape 66 | # import sys 67 | # sys.exit(0) 68 | else: 69 | ini_embeds = None 70 | 71 | if options.decay: 72 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, 
gamma=options.gamma) 73 | else: 74 | opt = adam(lr=options.lr) 75 | 76 | if options.log_file == '': 77 | log_file = None 78 | else: 79 | log_file = options.log_file 80 | 81 | if options.save == '': 82 | save_path = None 83 | else: 84 | save_path = options.save 85 | 86 | model = NCELangModelV5(sparse_coding=sparse_coding, nb_negative=options.negative, 87 | embed_dims=options.embed_size, init_embeddings=[ini_embeds], 88 | negprob_table=unigram_table, optimizer=opt) 89 | model.compile() 90 | model.train(data_file=DATA_PATH, 91 | save_path=save_path, 92 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 93 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 94 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_nce6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from utils import get_unigram_probtable 6 | import optparse 7 | from keras.optimizers import adam, AdamAnneal 8 | from models import NCELangModelV6 9 | import cPickle as pickle 10 | 11 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 12 | NB_RUN_WORDS = 100000000 13 | NB_VOCAB = 10000 14 | NB_RUN_VAL = 100000 15 | NB_EVALUATE = 5000000 16 | BATCH_SIZE = 256 17 | 18 | parser = optparse.OptionParser(usage="%prog [OPTIONS]") 19 | parser.add_option("-a", "--lr", type="float", dest="lr", default=0.01, 20 | help="learning rate") 21 | parser.add_option("-R", "--running-words", type="int", dest="running_words", default=NB_RUN_WORDS, 22 | help="amount of training data (number of words)") 23 | parser.add_option("-S", "--coding-file", type="str", dest="coding_file", 24 | help="sparse coding file (pickle)") 25 | parser.add_option("-e", "--embedding-file", type="str", dest="embedding_file", 26 | help="initial embedding file (pickle)") 27 | parser.add_option("-m", "--val-run", type="int", dest="val_run", default=NB_RUN_VAL, 28 | help="running validation words") 29 | parser.add_option("-n", "--nb-evaluation", type="int", dest="nb_evaluation", default=NB_EVALUATE, 30 | help="running validation words") 31 | parser.add_option("-g", "--gamma", type="float", dest="gamma", default=0.001, 32 | help="decaying rate") 33 | parser.add_option("-b", "--lr-min", type="float", dest="lr_min", default=0.005, 34 | help="decaying rate") 35 | parser.add_option("-d", "--decay", action="store_true", dest="decay", default=False, 36 | help="decay lr or not") 37 | parser.add_option("-N", "--nb-negative", type="int", dest="negative", default=50, 38 | help="amount of training data (number of words)") 39 | parser.add_option("-E", "--embedding-size", type="int", dest="embed_size", default=128, 40 | help="amount of training data (number of words)") 41 | parser.add_option("-l", "--log-file", type="str", dest="log_file", default='', 42 | help="amount of training data (number of words)") 43 | parser.add_option("-r", "--report-interval", type="float", dest="interval", default=1200., 44 | help="decaying rate") 45 | parser.add_option("-s", "--save", type="str", dest="save", default='', 46 | help="amount of training data (number of words)") 47 | parser.add_option("-p", "--init", type="str", dest="init", default='first', 48 | help="init scheme") 49 | options, args = parser.parse_args() 50 | 51 | nb_run_words = options.running_words 52 | nb_run_val = options.val_run 53 | nb_evaluate = options.nb_evaluation 54 | 55 | 56 | with 
file(options.coding_file, 'rb') as f: 57 | sparse_coding = pickle.load(f) 58 | # print sparse_coding.dtype 59 | 60 | nb_vocab = sparse_coding.shape[0] 61 | unigram_table = get_unigram_probtable(nb_words=nb_vocab, save_path='../data/wiki-unigram-prob-size%d.pkl' % nb_vocab) 62 | 63 | if options.embedding_file != '': 64 | with file(options.embedding_file, 'rb') as f: 65 | ini_embeds = pickle.load(f) 66 | 67 | if options.init == 'first': 68 | init_e = [ini_embeds] 69 | else: 70 | init_e = [ini_embeds] * 4 71 | # print ini_embeds.dtype 72 | # print ini_embeds.shape 73 | # import sys 74 | # sys.exit(0) 75 | else: 76 | init_e = None 77 | 78 | if options.decay: 79 | opt = AdamAnneal(lr=options.lr, lr_min=options.lr_min, gamma=options.gamma) 80 | else: 81 | opt = adam(lr=options.lr) 82 | 83 | if options.log_file == '': 84 | log_file = None 85 | else: 86 | log_file = options.log_file 87 | 88 | if options.save == '': 89 | save_path = None 90 | else: 91 | save_path = options.save 92 | 93 | model = NCELangModelV6(sparse_coding=sparse_coding, nb_negative=options.negative, 94 | embed_dims=options.embed_size, init_embeddings=init_e, 95 | negprob_table=unigram_table, optimizer=opt) 96 | model.compile() 97 | model.train(data_file=DATA_PATH, 98 | save_path=save_path, 99 | batch_size=BATCH_SIZE, train_nb_words=nb_run_words, 100 | val_nb_words=nb_evaluate, train_val_nb=nb_run_val, 101 | validation_interval=options.interval, log_file=log_file) -------------------------------------------------------------------------------- /real/run_tree_huffman_lr0.01_g0.001.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from models import TreeLangModel 6 | from keras.optimizers import AdamAnneal 7 | import cPickle as pickle 8 | 9 | NB_RUN_WORDS = 100000000 10 | NB_VOCAB = 10000 11 | NB_RUN_VAL = 100000 12 | NB_EVALUATE = 5000000 13 | 14 | # NB_RUN_WORDS = 1000000 15 | # NB_VOCAB = 10000 16 | # NB_RUN_VAL = 10000 17 | # NB_EVALUATE = 50000 18 | SAVE_PATH = '../data/models/lang/huffman-e128-c128-lr0.01-gamma0.001.pkl' 19 | 20 | DATA_PATH = '../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 21 | BATCH_SIZE = 256 22 | VAL_INTER = 1200 23 | 24 | with file('../data/wiki-huffman-tree-info-Vsize10000.pkl', 'rb') as f: 25 | tree_info = pickle.load(f) 26 | 27 | wrd2cls = tree_info['idx2cls'] 28 | wrd2bitstr = tree_info['idx2bitstr'] 29 | 30 | opt = AdamAnneal(lr=0.01, lr_min=0.0045, gamma=0.001) 31 | model = TreeLangModel(vocab_size=NB_VOCAB, embed_dim=128, cntx_dim=128, 32 | word2class=wrd2cls, word2bitstr=wrd2bitstr, optimizer=opt) 33 | model.compile() 34 | model.train(data_file=DATA_PATH, 35 | save_path=SAVE_PATH, 36 | batch_size=BATCH_SIZE, train_nb_words=NB_RUN_WORDS, 37 | val_nb_words=NB_EVALUATE, train_val_nb=NB_RUN_VAL, validation_interval=VAL_INTER) -------------------------------------------------------------------------------- /real/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from .utils import floatX, categorical_crossentropy, objective_fnc, chunk_sentences,\ 6 | slice_X, get_unigram_probtable, TableSampler, load_huffman_tree, save_tree, create_tree,\ 7 | LangModelLogger, LangHistory, epsilon 8 | from .preprocess import data4sri -------------------------------------------------------------------------------- /real/utils/check_maps.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | 6 | def check_maps(map1, map2): 7 | for w1, w2 in zip(map1['idx2word'], map2['idx2word']): 8 | if w1 != w2: 9 | raise Exception('idx2word: map not equal') 10 | 11 | for i, m in enumerate([map1, map2]): 12 | for idx, w in enumerate(m['idx2word']): 13 | if idx != m['word2idx'][w]: 14 | raise Exception('map%d not consistent' % i) 15 | 16 | 17 | if __name__ == '__main__': 18 | import cPickle as pickle 19 | wp_file = '../../data/wiki-wordmap-trunc300k.wp' 20 | embeds_file = '/home/cyc/Data/models/embeddings/rw2vec_embeddings-size200.pkl' 21 | 22 | with file(wp_file, 'rb') as f: 23 | wp = pickle.load(f) 24 | 25 | with file(embeds_file, 'rb') as f: 26 | em = pickle.load(f) 27 | 28 | check_maps(wp, em) 29 | 30 | -------------------------------------------------------------------------------- /real/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bz2 import BZ2File 5 | import unittest 6 | import os 7 | import numpy as np 8 | import cPickle as pickle 9 | import logging 10 | import re 11 | from utils import chunk_sentences 12 | 13 | __author__ = 'Yunchuan Chen' 14 | MAX_SETN_LEN = 65 15 | DATA_ROOT = '../../data/' 16 | 17 | 18 | class ReadFileTest(unittest.TestCase): 19 | def test_prprcs_wrt(self): 20 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2'): 21 | return 22 | with BZ2File(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2') as f: 23 | f.readline() 24 | line = f.readline() 25 | self.failUnless('it was shortlisted for the booker prize and won several other awards .'.strip() == line.strip(), 26 | 'read line: %s not as expected.\n' % line) 27 | 28 | def test_ixport(self): 29 | wpx, flag = export_wordmap() 30 | wpi = import_wordmap() 31 | 32 | self.failUnless(flag is True, 'Failure flag received from export map') 33 | if wpx is not None: 34 | self.failUnless('word2idx' in wpx, 'word2idx key lost for the wordmap.') 35 | self.failUnless('idx2word' in wpx, 'idx2word key lost for the wordmap.') 36 | self.failUnless('idx2wc' in wpx, 'idx2wc key lost for the wordmap.') 37 | 38 | self.failUnless('word2idx' in wpi, 'word2idx key lost for the wordmap.') 39 | self.failUnless('idx2word' in wpi, 'idx2word key lost for the wordmap.') 40 | self.failUnless('idx2wc' in wpi, 'idx2wc key lost for the wordmap.') 41 | 42 | 43 | def smart_open(fname, mode='rb', buffering=5*2**20): 44 | _, ext = os.path.splitext(fname) 45 | if ext == '.bz2': 46 | from bz2 import BZ2File 47 | return BZ2File(fname, mode, buffering) 48 | # if ext == '.gz': 49 | # from gzip import GzipFile 50 | # return GzipFile(fname, mode, buffering) 51 | return open(fname, mode, buffering) 52 | 53 | 54 | def export_wordmap(dist_file=DATA_ROOT+'wiki-wordmap.wp', 55 | corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc.txt', rebuild=False): 56 | """ 57 | :param dist_file: file name to store the wordmap 58 | :param corpus_file: corpus source to build wordmap against 59 | :param rebuild: whether rebuild wordmap if it already exists. 60 | :return: exported model and a flag. 
61 | """ 62 | if os.path.exists(dist_file) and not rebuild: 63 | return None, True 64 | word2cnt = dict() 65 | with smart_open(corpus_file, buffering=5*2**20) as f: 66 | for sent in f: 67 | words = sent.split() 68 | for w in words: 69 | try: 70 | word2cnt[w] += 1 71 | except KeyError: 72 | word2cnt[w] = 1 73 | kv = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True) 74 | idx2word = [w for w, _ in kv] 75 | idx2wc = [c for _, c in kv] 76 | word2idx = dict((w, idx) for idx, (w, _) in enumerate(kv)) 77 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 78 | with file(dist_file, 'wb') as f: 79 | pickle.dump(model, f, -1) 80 | return model, True 81 | 82 | 83 | def import_wordmap(fname=DATA_ROOT+'wiki-wordmap.wp'): 84 | """ 85 | :param fname: a string indicate where the wordmap stores. 86 | :return: wordmap 87 | """ 88 | with file(fname, 'rb') as f: 89 | wp = pickle.load(f) 90 | return wp 91 | 92 | 93 | def preprocess_corpus(corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc.txt', 94 | dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2'): 95 | """ 96 | :param corpus_file: original corpus file name 97 | :type corpus_file: str 98 | :param dist_file: location to store the preprocessed corpus. 99 | :type dist_file: str 100 | :return: None 101 | Drop all sentences with length not in [3, 64]. 102 | """ 103 | corpus_file = file(corpus_file) 104 | dist_file = smart_open(dist_file, mode='w') 105 | 106 | assert corpus_file is not None and dist_file is not None 107 | for line in corpus_file: 108 | words = line.split() 109 | if not (3 <= len(words) <= 64): 110 | continue 111 | dist_file.write(line) 112 | 113 | corpus_file.close() 114 | dist_file.close() 115 | 116 | 117 | def binarize_corpus(group_size=20000, corpus_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop.bz2', 118 | dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', 119 | max_len=64, wordmap=DATA_ROOT+'wiki-wordmap.wp'): 120 | """ 121 | :param group_size: group size. We repeatedly read group size of sentences and 122 | convert and store them into binary format as a batch. 123 | :type group_size: int 124 | :param corpus_file: the corpus to be converted 125 | :type corpus_file: str 126 | :param dist_file: the file to store the converted corpus 127 | :param max_len: maximum length of sentence. Sentences exceeds this length will be dropped. 128 | :param wordmap: wordmap. 129 | :return: None 130 | """ 131 | def _index_sentence(sent): 132 | """ 133 | :param sent: a sentence as a string 134 | :type sent: str 135 | :return: a list of word index 136 | Represents a sentence using word indexes. 
137 | """ 138 | words = sent.split() 139 | return [word2idx[w] for w in words] 140 | 141 | def _commit_result(): 142 | for idx_sent in result[3:]: 143 | if len(idx_sent) > 0: 144 | sents = np.array(idx_sent, dtype=np.int32) 145 | shape = np.array(sents.shape, dtype=np.int32) 146 | dist_file.write(shape.tobytes()) 147 | dist_file.write(sents.tobytes()) 148 | 149 | for j in range(len(result)): 150 | result[j] = [] 151 | 152 | dist_file = smart_open(dist_file, 'wb') 153 | assert dist_file is not None 154 | if isinstance(wordmap, str): 155 | wp = import_wordmap(fname=wordmap) 156 | elif isinstance(wordmap, dict): 157 | wp = wordmap 158 | else: 159 | logging.error('can not recognize wordmap type') 160 | raise TypeError('wordamp must be dict or str') 161 | word2idx = wp['word2idx'] 162 | result = [[] for _ in range(max_len + 1)] 163 | with smart_open(corpus_file) as f: 164 | for i, sent in enumerate(f, start=1): 165 | idxs = _index_sentence(sent) 166 | try: 167 | result[len(idxs)].append(idxs) 168 | if i % group_size == 0: 169 | _commit_result() 170 | except IndexError: 171 | continue 172 | _commit_result() 173 | 174 | dist_file.close() 175 | 176 | 177 | def grouped_sentences(binary_corpus=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2'): 178 | with smart_open(binary_corpus) as f: 179 | while True: 180 | shape_data = f.read(2*4) 181 | if shape_data == "": 182 | return 183 | shape = np.frombuffer(shape_data, dtype=np.uint32) 184 | siz = shape[0] * shape[1] * 4 185 | sents = np.frombuffer(f.read(siz), dtype=np.uint32) 186 | # noinspection PyTypeChecker 187 | sents_ = np.reshape(sents, shape) 188 | yield sents_.copy().astype('int32') 189 | 190 | 191 | def show_grouped_sentences(group_sents, wordmap=DATA_ROOT+'wiki-wordmap.wp'): 192 | """ 193 | :param group_sents: a matrix represents a set of sentences' indexes 194 | :type group_sents: numpy.ndarray 195 | :param wordmap: word_ to index_ map and vise versa 196 | :return: list, a list of string representation of the sentences. 
197 | """ 198 | if isinstance(wordmap, str): 199 | # import logging 200 | logger = logging.getLogger('Preprocess') 201 | logger.warn('It would be inefficient if repeatedly call this function with wordmap name') 202 | wordmap = import_wordmap(fname=wordmap) 203 | idx2word = wordmap['idx2word'] 204 | elif isinstance(wordmap, dict): 205 | idx2word = wordmap['idx2word'] 206 | else: 207 | raise TypeError('wordmap must be a string representing the map location or ' 208 | 'a dictionary containing the map') 209 | ret = [None] * group_sents.shape[0] 210 | for i in range(len(ret)): 211 | ret[i] = [idx2word[j] for j in group_sents[i]] 212 | 213 | return ret 214 | 215 | 216 | def get_fake_data_meta(fname=DATA_ROOT+'fake', trn_regex=re.compile(r'\d{3}.bz2')): 217 | data_path = os.path.abspath(fname) 218 | meta_file = os.path.join(data_path, 'meta.pkl') 219 | if not os.path.isfile(meta_file): 220 | train_files_ = [os.path.join(data_path, f) for f in os.listdir(data_path) if trn_regex.match(f)] 221 | train_files = [f for f in train_files_ if os.path.isfile(f)] 222 | nb_total = 0 223 | nb_bin = np.zeros((15,), dtype='int32') 224 | 225 | for f in train_files: 226 | X = np.loadtxt(f, dtype='int32') 227 | nb_bin += np.bincount(X.ravel(), minlength=15) 228 | nb_total += np.prod(X.shape) 229 | 230 | rel_freq = nb_bin.astype('float32')/nb_total 231 | ret = {'freq': nb_bin, 'rel_freq': rel_freq, 'nb_total': nb_total} 232 | with file(meta_file, 'wb') as mf: 233 | pickle.dump(ret, mf) 234 | else: 235 | with file(meta_file, 'rb') as mf: 236 | ret = pickle.load(mf) 237 | 238 | return ret 239 | 240 | 241 | def truncate_wordmap(wp, max_size=300000, dist=DATA_ROOT+'wiki-wordmap-trunc300k.wp'): 242 | idx2word = wp['idx2word'][:max_size] 243 | idx2wc = wp['idx2wc'][:max_size] 244 | 245 | word2idx = dict((w, idx) for idx, w in enumerate(idx2word)) 246 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 247 | with file(dist, 'wb') as f: 248 | pickle.dump(model, f, -1) 249 | return model 250 | 251 | 252 | def get_val_data(data_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', val_nb_words=100000, max_vocab=10000): 253 | """ 254 | :param data_file: 255 | :type data_file: basestring | str | unicode | __generator 256 | :param val_nb_words: 257 | :param max_vocab: 258 | :return: 259 | """ 260 | if isinstance(data_file, basestring): 261 | sent_gen = grouped_sentences(data_file) 262 | else: 263 | sent_gen = data_file 264 | 265 | val_sents = [None for _ in range(MAX_SETN_LEN)] 266 | val_nb = 0 267 | for sents in sent_gen: 268 | val_nb += sents.size 269 | chunk_sentences(val_sents, sents, 1000000, no_return=True) 270 | if val_nb >= val_nb_words: 271 | break 272 | val_sents_ = [None for _ in range(MAX_SETN_LEN)] 273 | for idx in range(MAX_SETN_LEN): 274 | if val_sents[idx]: 275 | val_sents_[idx] = np.vstack(val_sents[idx]['sents']) 276 | 277 | val_sents = [sents for sents in val_sents_ if sents is not None] 278 | for sents in val_sents: 279 | mask = (sents > max_vocab) 280 | sents[mask] = max_vocab 281 | 282 | return val_sents 283 | 284 | 285 | def data4sri(src_corpus=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 286 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=10000): 287 | def bin2txt(sents, dist_file): 288 | for i in xrange(sents.shape[0]): 289 | words = [str(idx) for idx in sents[i]] 290 | sent = ' '.join(words) 291 | dist_file.writelines([sent, '\n']) 292 | 293 | sent_gen = grouped_sentences(src_corpus) 294 | val_sents = 
get_val_data(sent_gen, val_nb_words, max_vocab) 295 | get_val_data(sent_gen, train_val_nb) 296 | 297 | if train_nb_words >= 1000000: 298 | trn_name = 'wiki-trn-R%dm-V%dk.txt' % (train_nb_words // 1000000, max_vocab//1000) 299 | elif train_nb_words >= 1000: 300 | trn_name = 'wiki-trn-R%dk-V%dk.txt' % (train_nb_words // 1000, max_vocab//1000) 301 | else: 302 | trn_name = 'wiki-trn-R%d-V%dk.txt' % (train_nb_words, max_vocab//1000) 303 | 304 | if val_nb_words >= 1000000: 305 | val_name = 'wiki-val-R%dm-V%dk.txt' % (val_nb_words // 1000000, max_vocab//1000) 306 | elif val_nb_words >= 1000: 307 | val_name = 'wiki-val-R%dk-V%dk.txt' % (val_nb_words // 1000, max_vocab//1000) 308 | else: 309 | val_name = 'wiki-val-R%d-V%dk.txt' % (val_nb_words, max_vocab//1000) 310 | 311 | val_file = file(os.path.join(save_path, val_name), 'w') 312 | for sents in val_sents: 313 | bin2txt(sents, val_file) 314 | 315 | trn_file = file(os.path.join(save_path, trn_name), 'w') 316 | nb_exported = 0 317 | for sents in sent_gen: 318 | mask = (sents > max_vocab) 319 | sents[mask] = max_vocab 320 | bin2txt(sents, trn_file) 321 | nb_exported += sents.size 322 | if nb_exported >= train_nb_words: 323 | break 324 | 325 | val_file.close() 326 | trn_file.close() 327 | 328 | if __name__ == '__main__': 329 | logging.basicConfig(level=logging.INFO) 330 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0.bz2'): 331 | export_wordmap() 332 | preprocess_corpus(dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0.bz2') 333 | if not os.path.exists(DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0-bin.bz2'): 334 | binarize_corpus(dist_file=DATA_ROOT+'corpus/wiki-sg-norm-lc-drop-0-bin.bz2') 335 | 336 | # unittest.main() -------------------------------------------------------------------------------- /real/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | import math 6 | import os 7 | import cPickle as pickle 8 | from scipy.stats import rv_discrete 9 | from keras.callbacks import History, BaseLogger 10 | from keras.utils.generic_utils import Progbar 11 | import theano 12 | import theano.tensor as T 13 | import numpy as np 14 | import Queue 15 | import re 16 | 17 | floatX = theano.config.floatX 18 | epsilon = 1.0e-9 19 | # if floatX == 'float64': 20 | # epsilon = 1.0e-9 21 | # else: 22 | # epsilon = 1.0e-7 23 | 24 | 25 | def categorical_crossentropy2d(y_true, y_pred): 26 | """ 27 | :param y_true: true index labels with shape (ns, nt) 28 | :param y_pred: predicted probabilities with shape (ns, nt, V) 29 | :return: cce 30 | """ 31 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 32 | # scale preds so that the class probas of each sample sum to 1 33 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 34 | 35 | ns = y_true.shape[0] 36 | nt = y_true.shape[1] 37 | sample_idx = T.reshape(T.arange(ns), (ns, 1)) 38 | time_idx = T.reshape(T.arange(nt), (1, nt)) 39 | probs_ = y_pred[sample_idx, time_idx, y_true] 40 | return -T.log(probs_) 41 | 42 | 43 | def categorical_crossentropy1d(y_true, y_pred): 44 | """ 45 | :param y_true: true index labels with shape (n, ) 46 | :param y_pred: predicted probabilities with shape (n, V) 47 | :return: cce 48 | """ 49 | y_pred = T.clip(y_pred, epsilon, 1.0 - epsilon) 50 | # scale preds so that the class probas of each sample sum to 1 51 | y_pred /= y_pred.sum(axis=-1, keepdims=True) 52 | 53 | n = y_true.shape[0] 54 | sample_idx = T.arange(n) 55 | probs_ = y_pred[sample_idx, y_true] 
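    # advanced indexing: probs_[i] = y_pred[i, y_true[i]], i.e. the predicted probability of the
    # true word for sample i; its negative log below is the per-word cross-entropy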
56 | return -T.log(probs_) 57 | 58 | 59 | def categorical_crossentropy(y_true, y_pred): 60 | if y_true.ndim == 1: 61 | return categorical_crossentropy1d(y_true, y_pred) 62 | elif y_true.ndim == 2: 63 | return categorical_crossentropy2d(y_true, y_pred) 64 | else: 65 | raise NotImplementedError('not implemented for 3D or larger dimensions') 66 | 67 | 68 | def objective_fnc(fn): 69 | def symvar(y_true, y_pred, mask=None): 70 | obj_output = fn(y_true, y_pred) 71 | if mask is None: 72 | # return obj_output.mean(dtype=theano.config.floatX) 73 | return T.sum(obj_output) / obj_output.shape[0].astype(floatX) 74 | else: 75 | # obj_output = obj_output[mask.nonzero()] 76 | # return obj_output.mean(dtype=theano.config.floatX) 77 | obj_output = T.sum(obj_output * mask) 78 | return obj_output / mask.shape[0].astype(floatX) 79 | return symvar 80 | 81 | 82 | def chunk_sentences(old_sentences, new_sentences, chunk_size, no_return=False, min_nb_ch=5): 83 | """ 84 | :param old_sentences: [{nb_sents: x, sents: [...]}, ...] 85 | :param new_sentences: 86 | :param chunk_size: 87 | :param no_return: 88 | :return: 89 | """ 90 | sent_len = new_sentences.shape[1] 91 | 92 | if old_sentences[sent_len]: 93 | nb_sents = old_sentences[sent_len]['nb_sents'] + new_sentences.shape[0] 94 | old_sentences[sent_len]['nb_sents'] = nb_sents 95 | old_sentences[sent_len]['sents'].append(new_sentences) 96 | 97 | else: 98 | nb_sents = new_sentences.shape[0] 99 | old_sentences[sent_len] = {'nb_sents': nb_sents, 100 | 'sents': [new_sentences]} 101 | 102 | if nb_sents >= chunk_size*min_nb_ch and not no_return: 103 | nb_chunks = nb_sents // chunk_size 104 | nb_ret = nb_chunks * chunk_size 105 | tmp = np.vstack(old_sentences[sent_len]['sents']) 106 | old_sentences[sent_len]['sents'] = [tmp[nb_ret:]] 107 | old_sentences[sent_len]['nb_sents'] = old_sentences[sent_len]['sents'][0].shape[0] 108 | return tmp[:nb_ret] 109 | else: 110 | return None 111 | 112 | 113 | def slice_X(X, start_, end_=None, axis=1): 114 | if end_ is None: 115 | return [x.take(start_, axis=axis) for x in X] 116 | else: 117 | ret = [] 118 | for y in X: 119 | s = [slice(None) for _ in range(y.ndim)] 120 | s[axis] = slice(start_, end_) 121 | s = tuple(s) 122 | ret.append(y[s]) 123 | return ret 124 | 125 | 126 | def get_unigram_probtable(nb_words, wordmap='../data/wiki-wordmap-trunc300k.wp', 127 | save_path='../data/wiki-unigram-prob-size10000.pkl'): 128 | if os.path.exists(save_path): 129 | with file(save_path, 'rb') as f: 130 | freq = pickle.load(f) 131 | return freq 132 | 133 | with file(wordmap, 'rb') as f: 134 | wp = pickle.load(f) 135 | 136 | idx2wc = wp['idx2wc'] 137 | idx2wc[nb_words-1] = sum(idx2wc[nb_words-1:]) 138 | nb_total = sum(idx2wc[:nb_words]) 139 | 140 | freq = np.array(idx2wc[:nb_words], dtype=floatX)/nb_total 141 | freq_reduce = freq[nb_words-1] * 2.0/3.0 142 | freq[nb_words-1] -= freq_reduce 143 | pivot = nb_words // 2 144 | nb = nb_words - pivot 145 | gain = freq_reduce / nb 146 | freq[pivot:nb_words] += gain 147 | freq = freq / freq.sum() 148 | with file(save_path, 'wb') as f: 149 | pickle.dump(freq, f, -1) 150 | 151 | return freq 152 | 153 | 154 | def prefix_generator(s, start=0, end=None): 155 | if end is None: 156 | end = len(s) + 1 157 | for idx in range(start, end): 158 | yield s[:idx] 159 | 160 | 161 | def pad_bitstr(bitstr): 162 | """ 163 | :param bitstr: 164 | :type bitstr: list 165 | :return: padded list of bits 166 | """ 167 | max_bit_len = 0 168 | for bits in bitstr: 169 | if len(bits) > max_bit_len: 170 | max_bit_len = len(bits) 171 | for 
bits in bitstr: 172 | bits.extend([0] * (max_bit_len-len(bits))) 173 | 174 | return bitstr 175 | 176 | 177 | def pad_virtual_class(clses, pad_value): 178 | max_cls_len = 0 179 | for nodes in clses: 180 | if len(nodes) > max_cls_len: 181 | max_cls_len = len(nodes) 182 | for nodes in clses: 183 | nodes.extend([pad_value] * (max_cls_len-len(nodes))) 184 | 185 | return clses 186 | 187 | 188 | class HuffmanNode(object): 189 | def __init__(self, left=None, right=None, root=None): 190 | self.left = left 191 | self.right = right 192 | self.root = root # Why? Not needed for anything. 193 | 194 | def children(self): 195 | return self.left, self.right 196 | 197 | def preorder(self, path=None, left_code=0, right_code=1, collector=None): 198 | if collector is None: 199 | collector = [] 200 | if path is None: 201 | path = [] 202 | if self.left is not None: 203 | if isinstance(self.left[1], HuffmanNode): 204 | self.left[1].preorder(path+[left_code], left_code, right_code, collector) 205 | else: 206 | # print(self.left[1], path+[left_code]) 207 | collector.append((self.left[1], self.left[0], path+[left_code])) 208 | if self.right is not None: 209 | if isinstance(self.right[1], HuffmanNode): 210 | self.right[1].preorder(path+[right_code], left_code, right_code, collector) 211 | else: 212 | # print(self.right[1], path+[right_code]) 213 | collector.append((self.right[1], self.right[0], path+[right_code])) 214 | 215 | return collector 216 | 217 | 218 | def create_tree(frequencies): 219 | p = Queue.PriorityQueue() 220 | for value in frequencies: # 1. Create a leaf node for each symbol 221 | p.put(value) # and add it to the priority queue 222 | while p.qsize() > 1: # 2. While there is more than one node 223 | l, r = p.get(), p.get() # 2a. remove two highest nodes 224 | node = HuffmanNode(l, r) # 2b. create internal node with children 225 | p.put((l[0]+r[0], node)) # 2c. add new node to queue 226 | return p.get() # 3. 
tree is complete - return root node 227 | 228 | 229 | def load_huffman_tree(prob_table): 230 | rel_freq = prob_table 231 | freq = zip(rel_freq, range(len(rel_freq))) 232 | tree = create_tree(freq)[1] 233 | x = tree.preorder(left_code=-1, right_code=1) 234 | y = sorted(x, key=lambda z: z[1], reverse=True) 235 | # bitstr = [] 236 | # for _, _, bitstr_ in y: 237 | # bitstr.append(bitstr_[:-1]) 238 | 239 | z = [(wrdidx, bits, list(prefix_generator(bits, end=len(bits)))) for wrdidx, _, bits in y] 240 | clses = set() 241 | for _, _, ele in z: 242 | for i in ele: 243 | clses.add(''.join('%+d' % j for j in i)) 244 | idx2clses = sorted(clses, key=lambda ele: len(ele)) 245 | cls2idx = dict(((cls, idx) for idx, cls in enumerate(idx2clses))) 246 | w = map(lambda x: (x[0], x[1], [cls2idx[''.join('%+d' % j for j in p)] for p in x[2]]), z) 247 | 248 | tmp1, tmp2 = [], [] 249 | for _, bits, cls_idx in w: 250 | tmp1.append(bits) 251 | tmp2.append(cls_idx) 252 | pad_bitstr(tmp1) 253 | pad_virtual_class(tmp2, pad_value=len(idx2clses)-1) 254 | assert len(freq) == len(w) 255 | idx2cls = [None] * len(freq) 256 | idx2bitstr = [None] * len(freq) 257 | for idx, bitstr_, cls_ in w: 258 | idx2cls[idx] = cls_ 259 | idx2bitstr[idx] = bitstr_ 260 | 261 | idx2cls = np.array(idx2cls, dtype='int32') 262 | idx2bitstr = np.array(idx2bitstr, dtype='int8') 263 | 264 | return idx2cls, idx2bitstr, idx2bitstr != 0 265 | 266 | 267 | def save_tree(fn, idx2cls, idx2bitstr, mask): 268 | with file(fn, 'wb') as f: 269 | pickle.dump({'idx2cls': idx2cls, 'idx2bitstr': idx2bitstr, 'mask': mask}, f) 270 | 271 | 272 | _VAL_LINE = re.compile(r'INFO:.*:Val val_loss: (\d*\.\d*) - val_ppl: (\d*\.\d)') 273 | _TRAIN_LINE = re.compile(r'INFO:NCELangModelV4:Train - time: (\d*\.\d*) - loss: (\d*\.\d*)') 274 | 275 | 276 | def convert_logs(log, val_line=_VAL_LINE, trn_line=_TRAIN_LINE): 277 | f = file(log, 'r') 278 | val_loss = [] 279 | val_ppl = [] 280 | t_trn = [] 281 | trn_loss = [] 282 | 283 | for line in f: 284 | val_mat = val_line.match(line) 285 | if val_mat is not None: 286 | loss = float(val_mat.group(1)) 287 | ppl = float(val_mat.group(2)) 288 | val_loss.append(loss) 289 | val_ppl.append(ppl) 290 | continue 291 | trn_mat = trn_line.match(line) 292 | if trn_mat is not None: 293 | t = float(trn_mat.group(1)) 294 | loss = float(trn_mat.group(2)) 295 | t_trn.append(t) 296 | trn_loss.append(loss) 297 | f.close() 298 | 299 | t_trn = np.array(t_trn) 300 | t_trn -= t_trn[0] 301 | trn_loss = np.array(trn_loss) 302 | val_loss = np.array(val_loss[:-1]) 303 | val_ppl = np.array(val_ppl[:-1]) 304 | 305 | return t_trn, trn_loss, val_loss, val_ppl 306 | 307 | 308 | class TableSampler(rv_discrete): 309 | def __init__(self, table): 310 | nk = np.arange(len(table)) 311 | super(TableSampler, self).__init__(b=len(table)-1, values=(nk, table)) 312 | 313 | def sample(self, shape, dtype='int32'): 314 | return self.rvs(size=shape).astype(dtype) 315 | 316 | 317 | class LangHistory(History): 318 | 319 | # def on_train_begin(self, logs=None): 320 | # # logs = {} if logs is None else logs 321 | # self.epoch = [] 322 | # self.history = {} 323 | # 324 | # def on_epoch_begin(self, epoch, logs=None): 325 | # self.seen = 0 326 | # self.totals = {} 327 | 328 | def on_batch_end(self, batch, logs=None): 329 | logs = {} if logs is None else logs 330 | batch_size = logs.get('size', 0) 331 | self.seen += batch_size 332 | 333 | for k, v in logs.items(): 334 | if k in ('encode_len', 'nb_words'):  # running token totals for perplexity: summed as-is 335 | try: 336 | self.totals[k] += v 337 | except KeyError: 338 | self.totals[k] = v
339 | continue 340 | 341 | try: 342 | self.totals[k] += v * batch_size 343 | except KeyError: 344 | self.totals[k] = v * batch_size 345 | 346 | def on_epoch_end(self, epoch, logs=None): 347 | if 'encode_len' in self.totals and 'nb_words' in self.totals:  # self.totals is a dict: test key membership 348 | ppl = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 349 | k = 'ppl' 350 | try: 351 | self.history[k].append(ppl) 352 | except KeyError: 353 | self.history[k] = [ppl] 354 | 355 | if 'val_encode_len' in self.totals and 'val_nb_words' in self.totals: 356 | val_ppl = math.exp(self.totals['val_encode_len']/float(self.totals['val_nb_words'])) 357 | k = 'val_ppl' 358 | try: 359 | self.history[k].append(val_ppl) 360 | except KeyError: 361 | self.history[k] = [val_ppl] 362 | 363 | k = 'loss' 364 | v = self.totals[k] 365 | try: 366 | self.history[k].append(v/float(self.seen)) 367 | except KeyError: 368 | self.history[k] = [v/float(self.seen)] 369 | 370 | 371 | class LangModelLogger(BaseLogger): 372 | def __init__(self): 373 | super(LangModelLogger, self).__init__() 374 | self.verbose = None 375 | self.nb_epoch = None 376 | self.seen = 0 377 | self.totals = {} 378 | self.progbar = None 379 | self.log_values = [] 380 | 381 | # def on_train_begin(self, logs=None): 382 | # logger.debug('Begin training...') 383 | # self.verbose = self.params['verbose'] 384 | # self.nb_epoch = self.params['nb_epoch'] 385 | # 386 | # def on_epoch_begin(self, epoch, logs=None): 387 | # # print('Epoch %d/%d' % (epoch + 1, self.nb_epoch)) 388 | # self.progbar = Progbar(target=self.params['nb_sample'], verbose=1) 389 | # self.seen = 0 390 | # self.totals = {} 391 | # 392 | # def on_batch_begin(self, batch, logs=None): 393 | # if self.seen < self.params['nb_sample']: 394 | # self.log_values = [] 395 | # self.params['metrics'] = ['loss', 'ppl', 'val_loss', 'val_ppl'] 396 | 397 | def on_batch_end(self, batch, logs=None): 398 | logs = {} if logs is None else logs 399 | batch_size = logs.get('size', 0) 400 | self.seen += batch_size 401 | 402 | for k, v in logs.items(): 403 | if k in ('encode_len', 'nb_words'):  # running token totals for perplexity: summed as-is 404 | try: 405 | self.totals[k] += v 406 | except KeyError: 407 | self.totals[k] = v 408 | continue 409 | 410 | try: 411 | self.totals[k] += v * batch_size 412 | except KeyError: 413 | self.totals[k] = v * batch_size 414 | 415 | if 'encode_len' in self.totals and 'nb_words' in self.totals and 'ppl' in self.params['metrics']: 416 | self.totals['ppl'] = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 417 | self.log_values.append(('ppl', self.totals['ppl'])) 418 | for k in self.params['metrics']: 419 | if k in logs: 420 | self.log_values.append((k, logs[k])) 421 | 422 | # skip progbar update for the last batch; will be handled by on_epoch_end 423 | if self.seen < self.params['nb_sample']: 424 | self.progbar.update(self.seen, self.log_values) 425 | 426 | def on_epoch_begin(self, epoch, logs=None): 427 | if self.verbose: 428 | self.progbar = Progbar(target=self.params['nb_sample'], 429 | verbose=self.verbose) 430 | self.seen = 0 431 | self.totals = {} 432 | 433 | def on_epoch_end(self, epoch, logs=None): 434 | logs = {} if logs is None else logs 435 | # logger.debug('log keys: %s' % str(logs.keys())) 436 | for k in self.params['metrics']: 437 | if k in self.totals: 438 | if k != 'ppl': 439 | self.log_values.append((k, self.totals[k] / self.seen)) 440 | else: 441 | self.totals['ppl'] = math.exp(self.totals['encode_len']/float(self.totals['nb_words'])) 442 | self.log_values.append((k, self.totals['ppl'])) 443 | if k in logs:
444 | self.log_values.append((k, logs[k])) 445 | if 'val_encode_len' in logs and 'val_nb_words' in logs: 446 | val_ppl = math.exp(logs['val_encode_len']/float(logs['val_nb_words'])) 447 | self.log_values.append(('val_ppl', val_ppl)) 448 | 449 | self.progbar.update(self.seen, self.log_values) -------------------------------------------------------------------------------- /real/workspace/export_sri_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | # noinspection PyUnresolvedReferences 5 | from lm.real.utils import data4sri 6 | 7 | DATA_ROOT = '../../data/' 8 | # data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 9 | # train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=10000) 10 | 11 | # data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 12 | # train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=50000) 13 | 14 | for k in range(10000, 52000, 2000): 15 | data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 16 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=k) 17 | 18 | data4sri(src_corpus=DATA_ROOT+'/corpus/wiki-sg-norm-lc-drop-bin.bz2', save_path=DATA_ROOT+'corpus/sri', 19 | train_nb_words=100000000, val_nb_words=5000000, train_val_nb=100000, max_vocab=100000000) -------------------------------------------------------------------------------- /real/workspace/extract_learning_curv_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | import sys 6 | import os 7 | import re 8 | from scipy.io import savemat 9 | import numpy as np 10 | 11 | log_dir = sys.argv[1] 12 | pat = re.compile(r"main-nce4-.*-V(\d+)-N\d+.log") 13 | file_pat = re.compile(sys.argv[2]) if len(sys.argv) >= 3 else pat 14 | # INFO:NCELangModelV4:Train - time: 1453042236.299597 - loss: 4.672819 15 | # INFO:NCELangModelV4:Val val_loss: 4.653410 - val_ppl: 351.053158 16 | trn_pat = re.compile(r'.*:Train - time: (\d+\.\d+) - loss: (\d+\.\d+)') 17 | val_pat = re.compile(r'.*:Val val_loss: (\d+\.\d+) - val_ppl: (\d+\.\d+)') 18 | log_files = os.listdir(log_dir) 19 | 20 | loss_data = {} 21 | val_data = {} 22 | for file_name in os.listdir(log_dir): 23 | m_k = pat.match(file_name) 24 | if m_k is None: 25 | continue 26 | k = m_k.group(1) 27 | loss_key = 'lossV'+k 28 | val_key = 'pplV' + k 29 | loss_data[loss_key] = [] 30 | val_data[val_key] = [] 31 | with file(log_dir+'/'+file_name, 'r') as f: 32 | for line in f: 33 | m = trn_pat.match(line) 34 | if m: 35 | loss_data[loss_key].append([float(m.group(1)), float(m.group(2))]) 36 | continue 37 | m = val_pat.match(line) 38 | if m: 39 | val_data[val_key].append([float(m.group(1)), float(m.group(2))]) 40 | 41 | data = {} 42 | for k in loss_data: 43 | data[k] = np.array(loss_data[k]) 44 | for k in val_data: 45 | data[k] = np.array(val_data[k]) 46 | 47 | savemat(log_dir+'/loss.mat', data) 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /real/workspace/gen_train_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 
| # noinspection PyUnresolvedReferences 5 | from lm.utils.preprocess import grouped_sentences, smart_open 6 | import numpy as np 7 | 8 | DATA_PATH = '../../data/corpus/wiki-sg-norm-lc-drop-bin.bz2' 9 | DATA_DIST = '../../data/corpus/wiki-sg-norm-lc-drop-bin-sample0.2B.bz2' 10 | 11 | 12 | def _commit_result(dist_file, sents): 13 | shape = np.array(sents.shape, dtype=np.int32) 14 | dist_file.write(shape.tobytes()) 15 | dist_file.write(sents.tobytes()) 16 | 17 | first_chunk_size = 50000000 18 | next_chunk_start = first_chunk_size * 2 19 | total_size = 200000000 20 | nb_words = 0 21 | 22 | dist_file = smart_open(DATA_DIST, 'wb') 23 | sents = grouped_sentences(DATA_PATH) 24 | for chunk in sents: 25 | if nb_words > first_chunk_size: 26 | break 27 | nb_words += chunk.size 28 | _commit_result(dist_file, chunk) 29 | 30 | nb_words_ = nb_words 31 | for chunk in sents: 32 | nb_words_ += chunk.size 33 | if nb_words_ > next_chunk_start: 34 | break 35 | 36 | for chunk in sents: 37 | if nb_words >= total_size: 38 | break 39 | nb_words += chunk.size 40 | _commit_result(dist_file, chunk) 41 | 42 | dist_file.close() 43 | 44 | -------------------------------------------------------------------------------- /real/workspace/show_time_loss.m: -------------------------------------------------------------------------------- 1 | % show_time_loss 2 | 3 | % load('logs/loss.mat'); 4 | 5 | nb = length(10000:2000:28000); 6 | loss_data = cell(nb, 1); 7 | % k = 1; 8 | % for i=10000:2000:28000 9 | % val_name = strcat('lossV', num2str(i)); 10 | % tmp = eval(val_name); 11 | % loss_data{k} = [tmp(:, 1) - tmp(1, 1), tmp(:, 2)]; 12 | % k = k + 1; 13 | % end 14 | % 15 | % colmap = hsv(nb); 16 | % figure; hold on; 17 | % for i = 1:nb 18 | % plot(loss_data{i}(3:20:end, 1), loss_data{i}(3:20:end, 2),... 
19 | % 'Color', colmap(i,:)); 20 | % end 21 | k = 1; 22 | ppl_data = cell(nb, 1); 23 | for i=10000:2000:28000 24 | val_name = strcat('pplV', num2str(i)); 25 | tmp = eval(val_name); 26 | ppl_data{k} = [tmp(:, 1) - tmp(1, 1), tmp(:, 2)]; 27 | k = k + 1; 28 | end 29 | 30 | hold off; 31 | for i = 1:nb 32 | figure; 33 | t = linspace(0, 12, size(ppl_data{i},1)); 34 | plotyy(t, ppl_data{i}(:, 1), t, ppl_data{i}(:, 2)); 35 | end -------------------------------------------------------------------------------- /stat/get_stat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import numpy as np 5 | from scipy.io import savemat 6 | 7 | 8 | def get_sample_sent(number=10, min_len=200, corpus=r'../data/wiki-sg-norm-lc.txt'): 9 | samples = [] 10 | with file(corpus) as f: 11 | for line in f: 12 | if len(line.split()) >= min_len: 13 | samples.append(line) 14 | if len(samples) == number: 15 | break 16 | return samples 17 | 18 | if __name__ == '__main__': 19 | datafile = r'../data/wiki-sg-norm-lc.txt' 20 | max_line = 5000000 21 | len_stat = np.zeros(max_line, dtype='int32') 22 | with file(datafile) as f: 23 | for idx, line in enumerate(f): 24 | if idx == max_line: 25 | break 26 | len_stat[idx] = len(line.split()) 27 | 28 | savemat('../data/wiki-stats.mat', {'len_stat': len_stat}, oned_as='column') 29 | -------------------------------------------------------------------------------- /stat/read_stats.m: -------------------------------------------------------------------------------- 1 | function stat = read_stats(filename) 2 | if nargin == 0 3 | filename = '../data/wiki-stats.mat'; 4 | end 5 | s = load(filename); 6 | stat = s.len_stat; -------------------------------------------------------------------------------- /stat/show_stats.m: -------------------------------------------------------------------------------- 1 | stat_all = read_stats; 2 | stat_lt600 = stat_all(stat_all < 600); 3 | stat_le96 = stat_lt600(stat_lt600 <= 96); 4 | sstat_eq1 = sum(stat_le96 == 1); 5 | sstat_eq2 = sum(stat_le96 == 2); 6 | sstat_eq3 = sum(stat_le96 == 3); 7 | sstat_gt96 = sum(stat_all > 96); 8 | sstat_gt80 = sum(stat_all > 80); 9 | sstat_gt64 = sum(stat_all > 64); 10 | stat_3t96 = stat_lt600(stat_lt600 <= 96 & stat_lt600 >=3); 11 | stat_3t64 = stat_le96(stat_le96 <=64 & stat_le96 >=3); 12 | sstat_3t8 = sum(stat_3t64 <= 8); 13 | sstat_9t12 = sum(stat_3t64 <= 12 & stat_3t64 >=9); 14 | 15 | fprintf('percentage of length 1: %.2f%%\n', 100*double(sstat_eq1)/length(stat_all)); 16 | fprintf('percentage of length 2: %.2f%%\n', 100*double(sstat_eq2)/length(stat_all)); 17 | fprintf('percentage of length 3: %.2f%%\n', 100*double(sstat_eq3)/length(stat_all)); 18 | fprintf('percentage of length above 96: %.2f%%\n', 100*double(sstat_gt96)/length(stat_all)); 19 | fprintf('percentage of length above 80: %.2f%%\n', 100*double(sstat_gt80)/length(stat_all)); 20 | fprintf('percentage of length above 64: %.2f%%\n', 100*double(sstat_gt64)/length(stat_all)); 21 | fprintf('percentage of length 3 to 96: %.2f%%\n', 100*double(length(stat_3t96))/length(stat_all)); 22 | fprintf('percentage of length 3 to 8: %.2f%%\n', 100*double(sstat_3t8)/length(stat_all)); 23 | fprintf('percentage of length 9 to 12: %.2f%%\n', 100*double(sstat_9t12)/length(stat_all)); 24 | fprintf('percentage of length 3 to 64: %.2f%%\n', 100*double(length(stat_3t64))/length(stat_all)); 25 | 26 | figure; 27 | hist(stat_all, 1:2500); 28 | title('histogram of
sentences'' lengths between 1 to max'); 29 | 30 | figure; 31 | hist(stat_lt600, 1:600); 32 | title('histogram of sentences'' lengths between 1 to 599'); 33 | 34 | figure; 35 | hist(stat_le96, 1:96); 36 | title('histogram of sentences'' length between 1 to 96'); 37 | 38 | figure; 39 | hist(stat_3t96, 3:96); 40 | title('histogram of sentences'' length between 3 to 96'); 41 | 42 | figure; 43 | hist(stat_3t64, 3:64); 44 | title('histogram of sentences'' length between 3 to 64'); 45 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | -------------------------------------------------------------------------------- /test/snippet.py: -------------------------------------------------------------------------------- 1 | from models import NCELangModel 2 | import os, re 3 | import logging 4 | import numpy as np 5 | 6 | logging.basicConfig(level=logging.DEBUG) 7 | 8 | trn_regex=re.compile(r'\d{3}.bz2') 9 | dir_ = 'data/fake/test' 10 | train_files = [os.path.join(dir_, f) for f in os.listdir(dir_) if trn_regex.match(f)] 11 | X = np.loadtxt(train_files[0], dtype='int32') 12 | 13 | model = NCELangModel(vocab_size=15, nb_negative=2, embed_dims=128) 14 | ins, _ = model.prepare_input(X, 0, None) 15 | data = {model.input['idxes']: ins[0]} 16 | model.compile() 17 | -------------------------------------------------------------------------------- /test/test_io.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import unittest 4 | __author__ = 'Yunchuan Chen' 5 | 6 | 7 | class ReadFileTest(unittest.TestCase): 8 | def test_readlines(self): 9 | iter_lines = [] 10 | read_lines = [] 11 | with file('../data/test_data') as f: 12 | for line in f: 13 | iter_lines.append(line) 14 | 15 | f.seek(0) 16 | while True: 17 | line = f.readline() 18 | if len(line) == 0: 19 | break 20 | read_lines.append(line) 21 | self.failUnless(len(iter_lines) == len(read_lines), 22 | 'Iterating over file is different from readlines\n' 23 | 'The result of iterating over lines: %s\n' 24 | 'The result of readlines: %s' % (str(iter_lines), str(read_lines))) 25 | 26 | 27 | if __name__ == '__main__': 28 | unittest.main() 29 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | -------------------------------------------------------------------------------- /utils/fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | from preprocess import smart_open 5 | import sys 6 | import os 7 | 8 | 9 | def chunks(l, n): 10 | """Yield successive n-sized chunks from l.""" 11 | for i in xrange(0, len(l), n): 12 | yield l[i:i+n] 13 | 14 | 15 | def generate(dist_dir, corpus_file='../data/corpus/wiki-sg-norm-lc.tar.bz2', sent_len=64, 16 | max_size=100*2**20, file_size=2**20): 17 | def sentence_generator(): 18 | with smart_open(corpus_file) as f: 19 | for sent in f: 20 | words_ = sent.split() 21 | words = [w for w in words_ if not w.startswith('_')] 22 | chars_ = list(''.join(words)) 23 | chars = [c for c in 
chars_ if ord('a') <= ord(c) <= ord('z')] 24 | chunk_len = sent_len - 1 25 | if len(chars) < chunk_len: 26 | continue 27 | 28 | num_chars = [(ord(x)-ord('a'))//2 + 1 for x in chars] 29 | 30 | def prefix_line(prefix_char, line): 31 | tmp = [prefix_char] 32 | for c in line: 33 | tmp.append(str(c)) 34 | return ' '.join(tmp) + '\n' 35 | 36 | cnks = list(chunks(num_chars, chunk_len)) 37 | line = cnks[0] 38 | yield prefix_line('0', line) 39 | for line in cnks[:-1]: 40 | yield prefix_line('14', line) 41 | line = cnks[-1] 42 | if len(line) == chunk_len: 43 | yield prefix_line('14', line) 44 | 45 | def file_name_generator(max_nb_file=100000, spec='%03d.bz2'): 46 | for idx in xrange(max_nb_file): 47 | dist_file_ = spec % idx 48 | yield os.path.join(dist_dir, dist_file_) 49 | 50 | dfn_gen = file_name_generator() 51 | dist_file_name = dfn_gen.next() 52 | dist_file = smart_open(dist_file_name, mode='wb', buffering=2**10) 53 | sentences = sentence_generator() 54 | 55 | last_size = 0 56 | nb_line = 0 57 | while True: 58 | try: 59 | next_line = sentences.next() 60 | except StopIteration: 61 | break 62 | dist_file.write(next_line) 63 | nb_line += 1 64 | if nb_line % 100 == 0: 65 | if dist_file.tell() >= file_size: 66 | last_size += dist_file.tell() 67 | dist_file.close() 68 | 69 | if last_size >= max_size: 70 | break 71 | 72 | dist_file_name = dfn_gen.next() 73 | dist_file = smart_open(dist_file_name, mode='wb', buffering=2**10) 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /utils/preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bz2 import BZ2File 5 | from copy import copy 6 | import unittest 7 | import os 8 | import numpy as np 9 | import cPickle as pickle 10 | import logging 11 | import re 12 | 13 | __author__ = 'Yunchuan Chen' 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | 17 | class ReadFileTest(unittest.TestCase): 18 | def test_prprcs_wrt(self): 19 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop.bz2'): 20 | return 21 | with BZ2File('../data/corpus/wiki-sg-norm-lc-drop.bz2') as f: 22 | f.readline() 23 | line = f.readline() 24 | self.failUnless('it was shortlisted for the booker prize and won several other awards .'.strip() == line.strip(), 25 | 'read line: %s not as expected.\n' % line) 26 | 27 | def test_ixport(self): 28 | wpx, flag = export_wordmap() 29 | wpi = import_wordmap() 30 | 31 | self.failUnless(flag is True, 'Failure flag received from export map') 32 | if wpx is not None: 33 | self.failUnless('word2idx' in wpx, 'word2idx key lost for the wordmap.') 34 | self.failUnless('idx2word' in wpx, 'idx2word key lost for the wordmap.') 35 | self.failUnless('idx2wc' in wpx, 'idx2wc key lost for the wordmap.') 36 | 37 | self.failUnless('word2idx' in wpi, 'word2idx key lost for the wordmap.') 38 | self.failUnless('idx2word' in wpi, 'idx2word key lost for the wordmap.') 39 | self.failUnless('idx2wc' in wpi, 'idx2wc key lost for the wordmap.') 40 | 41 | 42 | def smart_open(fname, mode='rb', buffering=5*2**20): 43 | _, ext = os.path.splitext(fname) 44 | if ext == '.bz2': 45 | from bz2 import BZ2File 46 | return BZ2File(fname, mode, buffering) 47 | # if ext == '.gz': 48 | # from gzip import GzipFile 49 | # return GzipFile(fname, mode, buffering) 50 | return open(fname, mode, buffering) 51 | 52 | 53 | def export_wordmap(dist_file='../data/wiki-wordmap.wp', 54 | 
corpus_file='../data/corpus/wiki-sg-norm-lc.txt', rebuild=False): 55 | """ 56 | :param dist_file: file name to store the wordmap 57 | :param corpus_file: corpus source to build wordmap against 58 | :param rebuild: whether rebuild wordmap if it already exists. 59 | :return: exported model and a flag. 60 | """ 61 | if os.path.exists(dist_file) and not rebuild: 62 | return None, True 63 | word2cnt = dict() 64 | with smart_open(corpus_file, buffering=5*2**20) as f: 65 | for sent in f: 66 | words = sent.split() 67 | for w in words: 68 | try: 69 | word2cnt[w] += 1 70 | except KeyError: 71 | word2cnt[w] = 1 72 | kv = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True) 73 | idx2word = [w for w, _ in kv] 74 | idx2wc = [c for _, c in kv] 75 | word2idx = dict((w, idx) for idx, (w, _) in enumerate(kv)) 76 | model = {'idx2word': idx2word, 'idx2wc': idx2wc, 'word2idx': word2idx} 77 | with file(dist_file, 'wb') as f: 78 | pickle.dump(model, f, -1) 79 | return model, True 80 | 81 | 82 | def import_wordmap(fname='../data/wiki-wordmap.wp'): 83 | """ 84 | :param fname: a string indicate where the wordmap stores. 85 | :return: wordmap 86 | """ 87 | with file(fname, 'rb') as f: 88 | wp = pickle.load(f) 89 | return wp 90 | 91 | 92 | def preprocess_corpus(corpus_file='../data/corpus/wiki-sg-norm-lc.txt', 93 | dist_file='../data/corpus/wiki-sg-norm-lc-drop.bz2'): 94 | """ 95 | :param corpus_file: original corpus file name 96 | :type corpus_file: str 97 | :param dist_file: location to store the preprocessed corpus. 98 | :type dist_file: str 99 | :return: None 100 | Drop all sentences with length not in [3, 64]. Replace words that occurs less than 100 times 101 | with a special word __rare__. 102 | """ 103 | corpus_file = file(corpus_file) 104 | dist_file = smart_open(dist_file, mode='w') 105 | 106 | assert corpus_file is not None and dist_file is not None 107 | wp = import_wordmap() 108 | for line in corpus_file: 109 | words = line.split() 110 | if not (3 <= len(words) <= 64): 111 | continue 112 | words_ = copy(words) 113 | for idx, w in enumerate(words): 114 | if w not in wp['word2idx']: 115 | words_[idx] = '__rare__' 116 | sentence = ' '.join(words_) 117 | dist_file.writelines([sentence, '\n']) 118 | 119 | corpus_file.close() 120 | dist_file.close() 121 | 122 | 123 | def binarize_corpus(group_size=20000, corpus_file='../data/corpus/wiki-sg-norm-lc-drop.bz2', 124 | dist_file='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2', 125 | max_len=64, wordmap='../data/wiki-wordmap.wp'): 126 | """ 127 | :param group_size: group size. We repeatedly read group size of sentences and 128 | convert and store them into binary format as a batch. 129 | :type group_size: int 130 | :param corpus_file: the corpus to be converted 131 | :type corpus_file: str 132 | :param dist_file: the file to store the converted corpus 133 | :param max_len: maximum length of sentence. Sentences exceeds this length will be dropped. 134 | :param wordmap: wordmap. 135 | :return: None 136 | """ 137 | def _index_sentence(sent): 138 | """ 139 | :param sent: a sentence as a string 140 | :type sent: str 141 | :return: a list of word index 142 | Represents a sentence using word indexes. 
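        For example, with a hypothetical word2idx of {'the': 0, 'cat': 7, 'sat': 42},
        the sentence 'the cat sat' is converted to [0, 7, 42].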
143 | """ 144 | words = sent.split() 145 | return [word2idx[w] for w in words] 146 | 147 | def _commit_result(): 148 | for idx_sent in result[3:]: 149 | if len(idx_sent) > 0: 150 | sents = np.array(idx_sent, dtype=np.int32) 151 | shape = np.array(sents.shape, dtype=np.int32) 152 | dist_file.write(shape.tobytes()) 153 | dist_file.write(sents.tobytes()) 154 | 155 | for j in range(len(result)): 156 | result[j] = [] 157 | 158 | dist_file = smart_open(dist_file, 'wb') 159 | assert dist_file is not None 160 | if isinstance(wordmap, str): 161 | wp = import_wordmap(fname=wordmap) 162 | elif isinstance(wordmap, dict): 163 | wp = wordmap 164 | else: 165 | logging.error('can not recognize wordmap type') 166 | raise TypeError('wordamp must be dict or str') 167 | word2idx = wp['word2idx'] 168 | result = [[] for _ in range(max_len + 1)] 169 | with smart_open(corpus_file) as f: 170 | for i, sent in enumerate(f, start=1): 171 | idxs = _index_sentence(sent) 172 | try: 173 | result[len(idxs)].append(idxs) 174 | if i % group_size == 0: 175 | _commit_result() 176 | except IndexError: 177 | continue 178 | _commit_result() 179 | 180 | dist_file.close() 181 | 182 | 183 | def grouped_sentences(binary_corpus='../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'): 184 | with smart_open(binary_corpus) as f: 185 | while True: 186 | shape_data = f.read(2*4) 187 | if shape_data == "": 188 | return 189 | shape = np.frombuffer(shape_data, dtype=np.uint32) 190 | siz = shape[0] * shape[1] * 4 191 | sents = np.frombuffer(f.read(siz), dtype=np.uint32) 192 | # noinspection PyTypeChecker 193 | sents_ = np.reshape(sents, shape) 194 | yield sents_.copy().astype('int32') 195 | 196 | 197 | def show_grouped_sentences(group_sents, wordmap='../data/wiki-wordmap.wp'): 198 | """ 199 | :param group_sents: a matrix represents a set of sentences' indexes 200 | :type group_sents: numpy.ndarray 201 | :param wordmap: word_ to index_ map and vise versa 202 | :return: list, a list of string representation of the sentences. 
203 | """ 204 | if isinstance(wordmap, str): 205 | # import logging 206 | logger = logging.getLogger('Preprocess') 207 | logger.warn('It would be inefficient if repeatedly call this function with wordmap name') 208 | wordmap = import_wordmap(fname=wordmap) 209 | idx2word = wordmap['idx2word'] 210 | elif isinstance(wordmap, dict): 211 | idx2word = wordmap['idx2word'] 212 | else: 213 | raise TypeError('wordmap must be a string representing the map location or ' 214 | 'a dictionary containing the map') 215 | ret = [None] * group_sents.shape[0] 216 | for i in range(len(ret)): 217 | ret[i] = [idx2word[j] for j in group_sents[i]] 218 | 219 | return ret 220 | 221 | 222 | def get_fake_data_meta(fname='../data/fake', trn_regex=re.compile(r'\d{3}.bz2')): 223 | data_path = os.path.abspath(fname) 224 | meta_file = os.path.join(data_path, 'meta.pkl') 225 | if not os.path.isfile(meta_file): 226 | train_files_ = [os.path.join(data_path, f) for f in os.listdir(data_path) if trn_regex.match(f)] 227 | train_files = [f for f in train_files_ if os.path.isfile(f)] 228 | nb_total = 0 229 | nb_bin = np.zeros((15,), dtype='int32') 230 | 231 | for f in train_files: 232 | X = np.loadtxt(f, dtype='int32') 233 | nb_bin += np.bincount(X.ravel(), minlength=15) 234 | nb_total += np.prod(X.shape) 235 | 236 | rel_freq = nb_bin.astype('float32')/nb_total 237 | ret = {'freq': nb_bin, 'rel_freq': rel_freq, 'nb_total': nb_total} 238 | with file(meta_file, 'wb') as mf: 239 | pickle.dump(ret, mf) 240 | else: 241 | with file(meta_file, 'rb') as mf: 242 | ret = pickle.load(mf) 243 | 244 | return ret 245 | 246 | if __name__ == '__main__': 247 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop.bz2'): 248 | export_wordmap() 249 | preprocess_corpus() 250 | if not os.path.exists('../data/corpus/wiki-sg-norm-lc-drop-bin.bz2'): 251 | binarize_corpus() 252 | 253 | unittest.main() -------------------------------------------------------------------------------- /utils/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | 5 | from tree_util import load_brown_tree 6 | 7 | idx2cls, idx2bitstr, mask = load_brown_tree('../brown-cluster/fake-c15-p2.out/paths', dict((str(x), x) for x in range(15))) 8 | 9 | print idx2cls 10 | print idx2bitstr 11 | print mask 12 | -------------------------------------------------------------------------------- /utils/tree_util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'Yunchuan Chen' 4 | import numpy as np 5 | import cPickle as pickle 6 | import Queue 7 | 8 | 9 | def prefix_generator(s, start=0, end=None): 10 | if end is None: 11 | end = len(s) + 1 12 | for idx in range(start, end): 13 | yield s[:idx] 14 | 15 | 16 | #paths_line = re.compile(r'(\d+)\s+(\S+)\s+(\d+)') 17 | def load_brown_tree(paths_file, word2idx, start=0, to_end=False): 18 | """ 19 | :param paths_file: paths file which is the output of the wcluster program 20 | :param word2idx: a dictionary which maps each word in vocabulary to a index 21 | :type word2idx: dict 22 | :return: a tuple of word index to word cluster id and bit string and a mask 23 | """ 24 | bit_namespace = set() 25 | idx2bitstr = [None] * len(word2idx) 26 | idx2cls = [None] * len(word2idx) 27 | idx2cls_name = [None] * len(word2idx) 28 | # cls_idx = -1 29 | with file(paths_file, 'r') as f: 30 | for line in f: 31 | try: 32 | bitstr_, word, _ 
= line.split() 33 | except ValueError: 34 | continue 35 | word_clses = [] 36 | end_ = len(bitstr_) if not to_end else len(bitstr_) + 1 37 | for pre in prefix_generator(bitstr_, start=start, end=end_): 38 | if pre not in bit_namespace: 39 | bit_namespace.add(pre) 40 | # cls_idx += 1 41 | word_clses.append(pre) 42 | bitstr = [1 if x == '1' else -1 for x in bitstr_[:end_]] 43 | word_idx = word2idx[word] 44 | idx2bitstr[word_idx] = bitstr 45 | idx2cls_name[word_idx] = word_clses 46 | node_names = sorted(bit_namespace, key=lambda x: len(x)) 47 | clsname2idx = dict(((n, idx) for idx, n in enumerate(node_names))) 48 | for i in range(len(idx2cls)): 49 | idx2cls[i] = [clsname2idx[x] for x in idx2cls_name[i]] 50 | 51 | idx2cls = np.array(pad_virtual_class(idx2cls, pad_value=len(node_names)), dtype='int32') 52 | idx2bitstr = np.array(pad_bitstr(idx2bitstr), dtype='int8') 53 | return idx2cls, idx2bitstr, idx2bitstr != 0 54 | 55 | 56 | def pad_bitstr(bitstr): 57 | """ 58 | :param bitstr: 59 | :type bitstr: list 60 | :return: padded list of bits 61 | """ 62 | max_bit_len = 0 63 | for bits in bitstr: 64 | if len(bits) > max_bit_len: 65 | max_bit_len = len(bits) 66 | for bits in bitstr: 67 | bits.extend([0] * (max_bit_len-len(bits))) 68 | 69 | return bitstr 70 | 71 | 72 | def pad_virtual_class(clses, pad_value): 73 | max_cls_len = 0 74 | for nodes in clses: 75 | if len(nodes) > max_cls_len: 76 | max_cls_len = len(nodes) 77 | for nodes in clses: 78 | nodes.extend([pad_value] * (max_cls_len-len(nodes))) 79 | 80 | return clses 81 | 82 | 83 | def save_tree(fn, idx2cls, idx2bitstr, mask): 84 | with file(fn, 'wb') as f: 85 | pickle.dump({'idx2cls': idx2cls, 'idx2bitstr': idx2bitstr, 'mask': mask}, f) 86 | 87 | 88 | class HuffmanNode(object): 89 | def __init__(self, left=None, right=None, root=None): 90 | self.left = left 91 | self.right = right 92 | self.root = root # Why? Not needed for anything. 93 | 94 | def children(self): 95 | return self.left, self.right 96 | 97 | def preorder(self, path=None, left_code=0, right_code=1, collector=None): 98 | if collector is None: 99 | collector = [] 100 | if path is None: 101 | path = [] 102 | if self.left is not None: 103 | if isinstance(self.left[1], HuffmanNode): 104 | self.left[1].preorder(path+[left_code], left_code, right_code, collector) 105 | else: 106 | # print(self.left[1], path+[left_code]) 107 | collector.append((self.left[1], self.left[0], path+[left_code])) 108 | if self.right is not None: 109 | if isinstance(self.right[1], HuffmanNode): 110 | self.right[1].preorder(path+[right_code], left_code, right_code, collector) 111 | else: 112 | # print(self.right[1], path+[right_code]) 113 | collector.append((self.right[1], self.right[0], path+[right_code])) 114 | 115 | return collector 116 | 117 | 118 | def create_tree(frequencies): 119 | p = Queue.PriorityQueue() 120 | for value in frequencies: # 1. Create a leaf node for each symbol 121 | p.put(value) # and add it to the priority queue 122 | while p.qsize() > 1: # 2. While there is more than one node 123 | l, r = p.get(), p.get() # 2a. remove two highest nodes 124 | node = HuffmanNode(l, r) # 2b. create internal node with children 125 | p.put((l[0]+r[0], node)) # 2c. add new node to queue 126 | return p.get() # 3. 
tree is complete - return root node 127 | 128 | 129 | def load_huffman_tree(meta_file): 130 | import cPickle as pickle 131 | with file(meta_file, 'rb') as f: 132 | meta = pickle.load(f) 133 | rel_freq = meta['rel_freq'] 134 | freq = zip(rel_freq, range(len(rel_freq))) 135 | tree = create_tree(freq)[1] 136 | x = tree.preorder(left_code=-1, right_code=1) 137 | y = sorted(x, key=lambda z: z[1], reverse=True) 138 | bitstr = [] 139 | for _, _, bitstr_ in y: 140 | bitstr.append(bitstr_[:-1]) 141 | 142 | z = [(wrdidx, bits, list(prefix_generator(bits, end=len(bits)))) for wrdidx, _, bits in y] 143 | clses = set() 144 | for _, _, ele in z: 145 | for i in ele: 146 | clses.add(''.join('%+d' % j for j in i)) 147 | idx2clses = sorted(clses, key=lambda ele: len(ele)) 148 | cls2idx = dict(((cls, idx) for idx, cls in enumerate(idx2clses))) 149 | w = map(lambda x: (x[0], x[1], [cls2idx[''.join('%+d' % j for j in p)] for p in x[2]]), z) 150 | 151 | tmp1, tmp2 = [], [] 152 | for _, bits, cls_idx in w: 153 | tmp1.append(bits) 154 | tmp2.append(cls_idx) 155 | pad_bitstr(tmp1) 156 | pad_virtual_class(tmp2, pad_value=len(idx2clses)) 157 | assert len(freq) == len(w) 158 | idx2cls = [None] * len(freq) 159 | idx2bitstr = [None] * len(freq) 160 | for idx, bitstr_, cls_ in w: 161 | idx2cls[idx] = cls_ 162 | idx2bitstr[idx] = bitstr_ 163 | 164 | idx2cls = np.array(idx2cls, dtype='int32') 165 | idx2bitstr = np.array(idx2bitstr, dtype='int8') 166 | 167 | return idx2cls, idx2bitstr, idx2bitstr != 0 168 | 169 | if __name__ == '__main__': 170 | freq = [ 171 | (8.167, 'a'), (1.492, 'b'), (2.782, 'c'), (4.253, 'd'), 172 | (12.702, 'e'),(2.228, 'f'), (2.015, 'g'), (6.094, 'h'), 173 | (6.966, 'i'), (0.153, 'j'), (0.747, 'k'), (4.025, 'l'), 174 | (2.406, 'm'), (6.749, 'n'), (7.507, 'o'), (1.929, 'p'), 175 | (0.095, 'q'), (5.987, 'r'), (6.327, 's'), (9.056, 't'), 176 | (2.758, 'u'), (1.037, 'v'), (2.365, 'w'), (0.150, 'x'), 177 | (1.974, 'y'), (0.074, 'z')] 178 | node = create_tree(freq) 179 | print(node) 180 | 181 | --------------------------------------------------------------------------------
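Usage sketch (not a file in this repository): one plausible way to wire the fake-data statistics from utils/preprocess.py into the Huffman utilities in utils/tree_util.py. It assumes the fake corpus under ../data/fake has already been generated (e.g. by utils/fake_data.py) and that the package is importable as lm, as in real/workspace/gen_train_data.py; the meta.pkl and huffman.tree paths are illustrative only.

from lm.utils.preprocess import get_fake_data_meta
from lm.utils.tree_util import load_huffman_tree, save_tree

# Unigram statistics of the 15-symbol fake vocabulary; cached to ../data/fake/meta.pkl
# on the first call and reloaded from there afterwards.
meta = get_fake_data_meta(fname='../data/fake')
print meta['rel_freq']

# Build a Huffman coding over the vocabulary from the cached meta file and persist the
# class indexes, signed bit strings and mask for later use.
idx2cls, idx2bitstr, mask = load_huffman_tree('../data/fake/meta.pkl')
save_tree('../data/fake/huffman.tree', idx2cls, idx2bitstr, mask)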