├── .gitignore ├── .hgignore ├── README.txt ├── data-sample-bilingual ├── en-de │ ├── training.align.en-de │ ├── training.de │ └── training.en ├── en-es │ ├── training.align.en-es │ ├── training.align.es-en │ ├── training.en │ └── training.es ├── en-fr │ ├── training2.align.en-fr │ ├── training2.align.fr-en │ ├── training2.en │ └── training2.fr ├── en-it │ ├── training.align.en-it │ ├── training.en │ └── training.it └── en-nl │ ├── training.align.en-nl │ ├── training.en │ └── training.nl ├── data.txt ├── data ├── README.txt ├── allwords.gz ├── allwords.vocabulary-200.txt ├── allwords.vocabulary.txt.gz └── batch └── scripts ├── LOGS ├── LOGS.NOBACKUP └── .keep ├── batch ├── batch-build-examples ├── batch-short ├── batch-w2w ├── batch-w2w2 ├── batch_ngrams ├── diagnostics.py ├── dump-embeddings.py ├── eda ├── badrun.py ├── batch-make-curves.sh ├── make-graphs-trainerror.pl ├── old │ ├── batch-make-curves.sh │ ├── make-graphs-trainloss.pl │ └── make-graphs-validationlogrankloss.pl └── remove-nonfinal-models.pl ├── hyperparameters.language-model.full.yaml ├── hyperparameters.language-model.sample.yaml ├── hyperparameters.py ├── lemmatizer.py ├── miscglobals.py ├── model ├── __init__.py ├── graphcw.py ├── graphlbl.py ├── model.py └── parameters.py ├── monolingual ├── __init__.py ├── build-vocabulary.py ├── corrupt.py ├── examples.py ├── noise.py ├── state.py ├── train.py └── vocabulary.py ├── ngrams.py ├── preprocess ├── filter-sentences-by-lemma.py ├── lemmatizer.py ├── lowercase.pl ├── preprocess-validation.pl └── reverse-alignment.pl ├── random-validation-examples.py ├── rundir.py ├── w2w ├── __init__.py ├── build-example-cache.py ├── build-initial-embeddings.py ├── build-target-vocabulary.py ├── build-vocabulary.py ├── corpora.py ├── dump-example-cache.py ├── dump-target-vocabulary.py ├── dump-vocabulary.py ├── examples.py ├── state.py ├── targetvocabulary.py ├── train.py └── vocabulary.py └── weight-histogram.py /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | *~ 4 | \#*\# 5 | bak 6 | .coverage 7 | *.dag 8 | *.dag.* 9 | data 10 | data-sample-bilingual/*.pkl.gz 11 | data-sample-bilingual/*.png 12 | fmap*.pkl.gz 13 | html 14 | hyperparameters.language-model.yaml 15 | LOGS 16 | LOGS*/[A-Za-z]* 17 | *.o 18 | old 19 | *.orig 20 | out 21 | *.out 22 | *.out.gz 23 | pdf 24 | *.pyc 25 | results 26 | *.so 27 | *.sw? 28 | TMP_DBI 29 | wsj_10* 30 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | *~ 4 | \#*\# 5 | bak 6 | .coverage 7 | *.dag 8 | *.dag.* 9 | data 10 | data-sample-bilingual/*.pkl.gz 11 | data-sample-bilingual/*.png 12 | fmap*.pkl.gz 13 | html 14 | hyperparameters.language-model.yaml 15 | LOGS 16 | LOGS*/[A-Za-z]* 17 | *.o 18 | old 19 | *.orig 20 | out 21 | *.out 22 | *.out.gz 23 | pdf 24 | *.pyc 25 | results 26 | *.so 27 | *.sw? 28 | TMP_DBI 29 | wsj_10* 30 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Approach based upon language model in Bengio et al ICML 09 "Curriculum Learning". 2 | 3 | 4 | You will need my common python library: 5 | http://github.com/turian/common 6 | and my textSNE wrapper for t-SNE: 7 | http://github.com:turian/textSNE 8 | 9 | You will need Murmur for hashing. 
10 | easy_install Murmur
11 | 
12 | To train a monolingual language model, you should probably run:
13 | [edit hyperparameters.language-model.yaml]
14 | ./build-vocabulary.py
15 | ./train.py
16 | 
17 | To train a word-to-word multilingual model, you should probably run:
18 | cd scripts; ln -s hyperparameters.language-model.sample.yaml hyperparameters.language-model.yaml
19 | 
20 | # Create validation data:
21 | ./preprocess-validation.pl > ~/data/SemEval-2-2010/Task\ 3\ -\ Cross-Lingual\ Word\ Sense\ Disambiguation/validation.txt Tokenizer v3
22 | 
23 | # [optional: Lemmatize]
24 | Tadpole --skip=tmp -t ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.nl | perl -ne 's/\t/ /g; print lc($_);' | chop 3 | from-one-line-per-word-to-one-line-per-sentence.py > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.nl
25 | #
26 | 
27 | [TODO:
28 | * Initialize using the monolingual language model in the source language.
29 | * Loss = logistic, not margin.
30 | ]
31 | 
32 | # [optional: Run the following if your alignment for language pair l1-l2
33 | # is in the form l2-l1; a sketch of what this amounts to is at the end of this README]
34 | ./scripts/preprocess/reverse-alignment.pl
35 | 
36 | ./w2w/build-vocabulary.py
37 | # Then inspect the output with ./w2w/dump-vocabulary.py, to see if you want
38 | # to adjust the w2w minfreq hyperparameter.
39 | 
40 | ./w2w/build-target-vocabulary.py
41 | # Then see the output with ./w2w/dump-target-vocabulary.py
42 | 
43 | ./w2w/build-initial-embeddings.py
44 | 
45 | # [optional: Filter the corpora to include only sentences with certain
46 | # focus words.]
47 | # You want to make sure this happens AFTER
48 | # ./w2w/build-initial-embeddings.py, so you have good embeddings for words
49 | # that aren't as common in the filtered corpora.
50 | ./scripts/preprocess/filter-sentences-by-lemma.py
51 | # You should then move the filtered corpora to a new data directory.
52 | 
53 | # [optional: This will cache all the training examples onto disk. This will
54 | # happen automatically during training anyhow.]
55 | ./scripts/w2w/build-example-cache.py
56 | 
57 | ./w2w/train.py
58 | 
59 | TODO:
60 | * sqrt scaling of SGD updates
61 | * Use normalization of embeddings?
62 | * How do we initialize embeddings?
63 | * Use tanh, not softsign?
64 | * When doing SGD on embeddings, use sqrt scaling of embedding size?
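
A note on the alignment files referenced above (e.g.
data-sample-bilingual/*/training.align.*): each line corresponds to one
sentence pair and contains whitespace-separated "sourceindex-targetindex"
token pairs, as in the sample data below. As a rough sketch of what
./scripts/preprocess/reverse-alignment.pl amounts to (assuming only that pair
format; the Perl script is the authoritative implementation), reversing an
l2-l1 alignment into l1-l2 form is a per-pair swap:

    # Sketch only: swap every "i-j" pair to "j-i" on each line of an
    # alignment file given on the command line (or stdin), writing to stdout.
    import fileinput
    for line in fileinput.input():
        pairs = [p.split("-") for p in line.split()]
        print " ".join("%s-%s" % (j, i) for (i, j) in pairs)

For example, the alignment line "17-22 3-3 6-7" becomes "22-17 3-3 7-6".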
65 | -------------------------------------------------------------------------------- /data-sample-bilingual/en-fr/training2.align.en-fr: -------------------------------------------------------------------------------- 1 | 17-22 3-3 6-7 9-12 12-15 15-20 1-2 18-23 4-4 7-9 7-8 10-13 13-19 16-21 2-0 19-24 5-6 8-11 11-14 0-1 2 | 0-0 3-1 3 | 0-0 3-1 4 | 3-4 6-7 12-15 9-10 2-3 5-6 11-14 8-9 1-2 4-5 7-8 10-13 0-0 5 | 6-9 9-13 12-16 2-4 5-8 11-17 8-11 1-3 4-7 7-10 10-15 10-14 13-18 0-2 0-1 0-0 3-5 6 | 1-1 0-0 7 | 11-8 3-1 14-11 17-14 6-3 9-6 20-16 1-2 12-9 15-12 4-1 18-15 21-17 10-7 2-0 13-10 5-4 16-13 19-16 8-5 8 | 13-12 6-7 23-20 2-5 9-9 16-13 19-17 12-11 18-16 1-2 21-19 7-8 17-15 0-0 20-18 9 | 3-4 8-0 3-3 22-10 17-13 4-5 9-1 20-11 23-16 18-15 5-6 10-1 16-14 21-8 19-15 10 | 1-1 0-2 3-3 11 | 1-1 0-0 12 | 14-13 17-16 6-6 9-9 1-3 12-12 4-5 15-14 7-7 18-17 21-21 10-10 16-15 5-4 19-19 8-8 19-18 22-22 11-11 13 | 1-1 2-2 3-5 4-6 0-0 14 | 3-3 6-6 12-15 9-10 9-9 2-2 5-5 8-8 11-12 14-16 1-1 4-4 7-7 10-11 0-0 13-14 15 | 17-23 3-1 6-8 9-12 12-14 1-2 15-18 4-3 7-7 13-15 16-20 5-4 8-11 11-13 0-0 16 | 3-4 6-9 12-15 15-21 1-2 4-5 7-10 10-11 13-17 16-22 2-3 5-8 5-7 8-14 11-13 14-20 0-0 17 | 3-4 6-6 9-9 5-5 8-8 1-2 4-5 7-7 0-0 18 | 13-13 3-3 6-5 9-9 12-12 2-2 8-8 11-11 4-4 7-6 0-1 10-10 0-0 19 | 1-3 0-2 20 | 3-4 25-24 6-9 17-18 9-12 20-21 12-15 1-2 15-17 26-26 4-5 7-10 18-19 10-13 21-22 13-16 24-25 24-24 2-3 5-8 8-11 19-20 11-14 0-0 21 | 1-3 2-4 0-0 22 | 9-14 2-3 8-13 12-10 1-2 1-1 14-15 4-5 11-9 6-12 0-0 3-5 23 | 4-4 1-1 5-6 2-2 6-7 3-3 0-0 24 | 3-5 14-13 17-17 20-19 9-8 6-0 12-12 23-22 4-6 15-14 18-18 21-20 10-9 7-2 2-4 16-15 22-21 11-10 25 | 9-14 3-1 2-3 7-12 1-0 10-15 0-4 4-2 6-11 26 | 14-15 6-5 9-9 1-3 12-12 15-16 18-20 4-1 7-7 10-10 13-13 2-2 16-17 19-23 5-4 11-11 0-0 27 | 2-10 12-17 5-13 1-9 11-16 4-12 0-8 3-11 10-15 13-18 8-1 8-0 28 | 14-15 6-6 9-9 20-18 12-12 1-1 15-14 4-3 7-7 18-16 10-10 13-13 5-4 8-8 19-17 0-2 11-11 29 | 14-16 17-20 3-2 3-1 20-24 6-7 9-11 12-13 15-18 4-5 4-4 18-21 7-9 21-25 10-12 13-15 2-3 16-19 5-6 19-23 8-10 22-26 11-14 0-0 30 | 16-18 3-3 6-6 12-15 9-10 15-17 2-2 5-5 11-14 8-8 14-16 1-1 4-4 10-13 7-7 0-0 31 | 3-2 6-6 9-10 12-14 2-1 5-5 8-9 11-13 1-1 7-8 10-11 13-15 0-0 32 | 2-1 3-5 0-2 1-4 33 | 6-8 12-18 15-23 2-3 5-7 11-17 14-22 8-10 1-2 10-16 4-5 7-9 13-19 3-6 0-0 34 | 6-7 9-10 12-14 2-2 5-6 8-9 11-13 1-1 14-15 4-4 7-8 10-11 0-0 13-14 3-5 35 | 17-16 9-12 23-25 12-15 1-2 4-9 7-11 10-13 13-18 21-21 16-19 2-1 11-14 8-5 22-22 0-0 14-17 36 | 10-8 13-11 6-4 9-7 2-2 12-10 5-3 1-1 8-5 11-9 0-0 7-4 37 | 3-5 6-11 9-16 12-19 15-22 1-1 4-8 7-12 10-17 21-27 13-18 5-10 2-2 19-26 8-15 11-20 14-21 0-0 38 | 5-7 4-2 2-3 0-4 1-6 3-1 39 | 3-4 14-11 17-17 6-6 20-20 1-1 15-12 18-19 18-18 7-7 2-3 2-2 16-16 8-8 0-0 40 | 0-0 3-1 41 | 0-0 3-1 42 | 14-13 6-7 17-16 9-9 12-12 15-14 4-3 7-8 18-17 10-10 2-2 2-1 13-11 5-6 16-15 19-18 0-0 43 | 14-16 3-4 17-19 6-7 20-22 20-21 9-10 23-26 12-14 1-1 15-17 4-5 18-20 7-8 21-25 10-13 13-15 2-3 16-18 5-6 19-20 8-9 22-23 11-12 0-0 44 | 9-14 12-17 6-6 3-0 8-13 11-16 5-5 10-15 7-8 7-7 4-1 45 | 14-16 19-12 3-3 6-8 9-14 17-10 1-1 15-17 20-13 4-4 10-15 7-7 21-19 18-11 2-2 5-6 13-9 0-0 46 | 3-4 11-7 14-10 17-14 20-20 6-1 23-23 12-8 4-2 15-12 18-17 21-22 0-16 7-1 2-3 10-6 16-13 19-18 5-0 22-21 47 | 6-10 12-21 9-5 1-1 4-7 13-23 7-6 5-9 5-8 11-20 14-24 8-4 0-0 3-7 48 | 1-17 8-1 11-4 14-8 20-22 17-13 7-21 2-19 9-2 12-5 15-9 5-16 18-14 3-20 10-3 13-7 16-10 19-15 0-0 49 | 3-4 9-11 12-13 2-3 5-7 8-10 14-15 4-6 0-5 7-9 10-14 50 | 17-23 20-25 3-0 6-5 9-10 12-15 15-21 18-24 21-26 7-7 
10-12 10-11 13-17 16-22 5-1 8-8 11-14 14-18 51 | 5-8 1-1 2-2 3-4 4-7 4-6 0-0 52 | 6-9 9-13 12-17 15-22 18-27 1-1 4-8 7-10 10-15 13-18 16-25 19-28 23-6 2-2 8-14 5-5 11-16 14-20 17-26 21-4 25-29 0-0 53 | 13-12 3-1 16-13 9-8 2-0 8-7 14-15 17-16 7-6 10-11 7-5 54 | 6-9 9-12 2-6 12-15 15-18 5-8 11-14 1-4 14-17 4-7 7-10 0-5 10-13 13-16 55 | 11-9 25-24 14-13 3-0 17-17 20-20 9-6 23-23 12-10 15-15 26-25 4-1 18-14 21-21 7-2 10-9 13-12 16-16 19-19 5-1 19-18 22-22 8-3 56 | 10-8 3-4 9-7 2-3 5-6 1-2 4-5 0-0 57 | 6-7 9-11 2-3 5-6 8-8 1-2 1-1 4-4 7-8 10-12 0-0 3-5 58 | 9-14 3-4 5-9 2-1 1-2 7-11 10-15 0-0 6-10 59 | 3-6 4-24 9-18 6-9 15-21 1-2 18-26 7-14 10-19 13-23 16-25 2-5 19-28 5-10 8-15 11-22 17-27 0-1 0-0 60 | 1-2 0-1 61 | 22-19 3-4 25-23 14-11 28-26 17-15 20-21 1-1 12-9 4-5 23-18 26-24 15-8 2-2 24-22 13-10 18-6 27-25 16-7 0-0 62 | 1-1 0-0 63 | 14-16 3-2 3-1 6-5 9-10 12-13 15-15 26-24 4-3 18-18 7-7 21-22 10-11 13-14 2-1 16-17 27-25 5-4 8-9 19-19 11-13 0-0 64 | 14-16 25-24 3-2 17-19 11-4 9-10 23-25 1-0 15-17 7-11 4-2 10-12 21-22 24-26 16-20 2-1 27-27 8-13 13-8 19-21 22-23 0-0 65 | 3-2 6-5 9-9 12-14 2-1 5-4 11-13 8-8 4-3 7-6 17-15 0-0 13-14 66 | 12-19 8-15 5-9 11-18 1-5 4-8 7-12 10-17 0-2 0-1 3-6 9-16 0-0 6-10 67 | 17-21 25-23 14-12 28-28 11-3 6-5 15-19 12-10 9-2 26-24 21-22 7-4 16-20 13-11 27-27 5-6 10-2 8-8 0-0 68 | 3-4 6-7 9-10 12-13 2-3 5-6 8-9 14-17 1-2 10-16 7-8 0-1 13-15 69 | 3-4 6-8 12-13 1-2 15-18 4-5 7-9 10-11 13-15 2-3 16-19 5-6 8-10 11-12 14-18 0-0 70 | 14-5 16-15 5-9 12-11 15-14 15-13 4-8 1-2 3-7 0-1 71 | 10-8 3-1 6-4 9-7 2-0 5-2 8-6 4-3 7-5 72 | 4-4 1-1 2-2 3-3 0-0 73 | 1-1 5-4 6-6 7-9 6-5 3-2 0-0 74 | 1-1 0-0 75 | 16-18 15-22 12-16 5-9 18-23 2-3 14-21 4-11 11-15 1-2 7-12 17-19 10-14 3-8 13-17 3-7 0-0 76 | 3-4 6-6 9-11 2-3 8-10 5-5 11-13 1-2 7-9 10-12 0-0 77 | 11-9 3-2 17-16 6-5 20-21 9-7 1-1 12-10 15-15 4-3 7-6 21-22 10-8 13-11 5-4 19-20 0-0 78 | 11-7 14-10 15-24 6-2 9-5 12-8 10-14 16-25 7-3 19-29 13-9 5-1 17-28 8-4 79 | 22-19 11-8 3-2 25-22 17-14 9-6 20-15 1-1 23-20 26-23 18-16 7-5 7-4 10-7 2-2 13-11 5-3 16-12 19-17 0-0 80 | 13-13 5-9 2-3 12-12 8-10 4-8 1-2 3-6 0-0 81 | 6-9 12-18 8-16 1-10 5-8 14-20 11-14 4-7 7-12 13-19 9-17 3-6 0-0 2-11 82 | 9-12 12-16 2-4 5-9 8-12 11-15 4-7 1-1 7-11 10-14 0-0 6-10 83 | 9-14 12-17 2-7 5-11 11-20 8-13 1-8 4-9 10-18 13-21 0-5 6-12 3-6 84 | 14-16 20-21 6-4 9-9 1-2 12-12 4-7 18-18 21-24 10-10 2-6 13-15 24-25 5-8 16-13 19-19 0-1 85 | 12-19 8-15 2-3 11-18 14-21 7-14 1-2 10-17 1-1 13-20 9-16 0-0 3-5 86 | 3-2 2-4 1-3 4-6 14-15 7-11 10-13 0-0 13-14 6-10 87 | 3-4 10-8 13-13 16-15 19-18 2-3 9-7 12-9 4-10 18-17 15-11 1-2 1-1 14-14 17-16 0-0 88 | 14-14 3-1 17-16 9-10 1-6 20-19 12-13 4-3 4-2 18-17 21-20 13-15 5-4 19-18 11-11 22-21 89 | 3-3 6-6 14-8 12-15 1-1 23-19 4-4 7-7 10-13 21-22 13-17 24-23 5-5 8-11 11-14 0-0 90 | 13-12 3-2 16-15 6-5 9-8 12-11 15-14 8-7 5-1 11-10 14-13 0-3 7-6 4-0 10-9 91 | 3-4 6-7 9-11 2-3 5-6 8-10 11-13 1-2 4-5 7-9 7-8 10-12 0-0 92 | 14-16 19-12 3-3 6-7 12-18 17-11 1-0 15-17 20-13 4-4 13-16 7-2 18-10 2-1 21-14 5-5 8-8 11-15 22-21 22-20 93 | 3-4 9-12 6-6 12-13 2-3 5-7 17-21 14-15 7-9 0-1 16-20 10-10 0-0 13-14 94 | 4-4 1-1 2-2 3-3 0-0 95 | 3-6 25-26 8-1 14-14 17-17 20-21 9-9 23-24 1-3 12-13 4-7 26-27 15-15 18-19 18-18 29-28 21-22 10-10 2-5 24-25 2-4 7-0 5-8 16-16 19-20 22-23 0-2 11-12 11-11 96 | 1-3 8-8 4-2 3-6 7-7 0-0 97 | 6-9 2-4 5-7 1-3 4-6 11-10 14-13 7-8 10-11 0-0 3-5 98 | 19-12 3-0 12-20 6-6 15-24 9-9 17-11 1-3 20-15 13-22 7-7 2-4 18-10 21-14 5-5 11-19 11-18 14-23 8-8 22-25 0-2 99 | 6-8 2-6 9-10 3-0 12-13 1-5 11-12 5-1 7-9 0-3 10-11 100 | 
5-13 8-16 7-15 1-1 3-11 6-14 4-2 0-0 2-11 101 | 3-4 6-7 2-3 5-8 12-10 8-9 11-13 1-2 4-6 7-11 10-12 13-16 0-0 9-15 102 | 3-6 20-28 6-8 9-12 12-18 1-2 4-7 21-29 7-10 10-16 13-19 16-24 2-5 19-26 8-11 11-17 14-21 17-25 0-0 103 | 3-2 6-8 20-23 12-15 9-7 18-21 4-1 7-6 10-10 13-16 5-5 19-22 11-11 0-0 104 | 17-21 3-3 6-8 9-10 12-15 1-1 15-18 18-22 4-4 10-12 10-11 13-16 16-19 2-2 5-6 11-14 0-0 14-17 105 | 4-3 5-4 2-1 6-5 3-2 0-0 7-6 106 | 10-8 6-7 12-14 11-13 1-1 4-6 7-9 0-0 3-5 107 | 3-6 17-20 20-25 9-11 12-15 1-2 4-7 18-21 7-8 5-18 21-24 10-13 13-16 2-5 16-19 19-23 19-22 8-10 22-26 0-1 14-17 108 | 4-4 1-2 5-5 3-3 0-1 0-0 109 | 14-16 3-3 17-18 6-6 9-12 20-21 23-24 15-16 4-4 18-20 7-7 21-22 10-9 24-25 2-2 16-17 5-5 8-11 19-21 11-13 22-23 0-1 110 | 6-7 3-0 2-0 5-5 8-9 1-2 4-4 7-8 0-1 111 | 1-1 2-2 3-4 0-0 4-5 112 | 3-4 17-18 20-21 9-11 12-13 1-2 4-5 15-15 18-19 7-8 10-10 13-14 2-3 5-6 16-16 19-20 8-9 11-12 0-1 0-0 113 | 3-2 6-6 12-15 9-9 2-1 5-5 8-8 11-11 4-3 7-7 10-10 0-0 114 | 6-5 20-22 9-12 12-17 23-25 1-1 15-15 4-3 18-20 21-23 10-9 24-26 13-14 2-2 16-16 5-4 8-11 19-21 22-24 11-13 0-0 25-27 115 | 8-9 4-4 1-2 5-5 6-6 3-3 0-1 0-0 116 | 20-16 13-12 10-6 2-5 9-9 19-15 12-8 22-17 1-4 18-14 11-7 21-16 0-1 0-0 17-13 117 | 3-6 14-15 6-11 12-20 9-13 15-26 1-2 4-8 7-10 10-14 13-17 2-3 5-9 11-20 8-12 0-0 118 | 11-9 14-11 17-14 20-21 9-7 1-1 4-3 15-11 7-6 18-15 21-20 10-8 2-2 13-12 5-4 16-13 19-19 8-7 22-22 0-0 119 | 1-1 0-0 120 | 3-4 25-24 3-3 6-9 9-15 12-16 15-19 23-22 1-1 7-13 18-20 13-17 24-23 2-2 5-8 8-14 11-11 14-18 22-21 0-0 121 | 13-12 13-11 7-1 9-9 2-4 12-12 8-8 1-3 4-7 5-0 11-10 14-13 0-2 3-6 122 | 22-19 3-4 14-13 25-23 6-7 17-15 1-5 12-11 10-21 23-20 15-14 18-16 2-3 13-12 11-22 24-20 5-6 0-2 123 | 17-23 6-9 9-12 1-1 18-24 4-6 15-16 7-11 10-13 13-17 2-3 19-25 8-10 11-14 14-18 0-0 124 | 6-7 13-11 3-0 16-14 15-17 12-12 1-2 11-10 8-4 14-13 17-18 0-3 7-5 10-9 125 | 3-1 6-5 10-4 9-3 11-9 4-2 7-6 0-0 126 | 19-10 6-0 9-6 20-14 12-8 23-18 15-11 18-15 7-1 10-7 13-9 5-2 16-12 127 | 6-4 9-9 8-8 5-2 1-1 7-7 10-10 0-0 3-5 128 | 6-10 2-3 3-4 4-8 5-9 129 | 3-4 6-8 12-16 9-10 2-3 5-7 11-14 1-2 4-6 7-9 10-11 0-1 0-0 130 | 6-11 1-1 2-5 3-6 4-7 0-0 131 | 13-9 6-3 9-5 5-2 11-11 14-10 4-1 17-14 0-0 7-4 132 | 3-3 6-6 9-9 19-17 12-13 2-1 5-5 8-8 1-2 14-16 4-4 10-14 7-7 13-15 0-0 133 | 5-8 2-6 6-9 0-4 1-5 134 | 3-2 12-17 5-9 11-16 2-0 7-15 1-1 8-4 4-3 6-10 135 | 3-1 6-6 9-8 5-6 1-2 11-10 4-4 7-7 0-0 10-9 136 | 3-4 6-8 16-17 9-11 8-14 12-13 2-3 15-16 5-5 11-12 1-2 14-15 4-5 17-18 7-6 0-0 137 | 12-15 15-20 15-19 6-3 18-22 2-1 11-14 14-18 5-2 17-21 1-1 4-5 10-13 3-8 13-16 0-0 138 | 3-3 6-5 2-2 5-6 8-8 1-1 4-4 7-7 0-0 139 | 3-4 25-24 17-17 6-7 9-9 20-18 23-23 12-12 1-0 26-25 4-4 15-13 7-8 10-10 21-19 2-3 24-23 16-16 5-5 8-8 0-2 19-15 11-11 140 | 1-2 2-3 0-0 3-1 141 | 3-4 6-9 9-11 5-8 1-2 7-10 10-12 0-1 142 | 10-7 3-1 9-4 12-8 8-3 11-6 0-2 7-5 143 | 14-15 3-3 17-19 6-6 23-29 9-9 12-12 1-1 15-16 4-4 18-20 21-26 7-7 10-10 13-15 2-2 16-17 5-5 19-22 19-21 8-8 22-25 11-11 0-0 144 | 3-4 9-13 12-15 5-9 2-3 8-12 11-17 1-2 4-6 7-11 4-5 10-14 13-18 0-0 6-10 145 | 4-21 6-7 9-9 1-2 23-22 26-28 15-13 5-20 7-8 10-11 24-23 2-1 16-16 8-10 11-12 0-0 25-27 146 | 5-15 10-7 12-16 9-10 8-14 2-2 4-11 7-13 1-1 11-8 6-12 0-0 147 | 5-9 8-12 1-8 0-7 7-11 3-6 6-10 148 | 14-14 3-3 9-9 17-11 12-13 1-1 4-4 7-7 18-15 21-16 2-2 5-5 8-8 16-10 0-0 149 | 16-17 9-11 12-14 2-2 2-1 8-10 11-13 5-0 7-9 10-12 13-16 3-5 150 | 4-20 17-10 18-25 10-4 3-19 6-22 16-8 12-7 2-17 5-21 11-6 14-11 19-26 1-17 9-15 151 | 5-8 1-2 1-1 2-3 3-4 4-7 0-0 152 | 13-10 10-5 3-0 6-4 16-12 
9-6 12-9 5-2 8-7 17-22 11-8 14-11 4-1 153 | 2-4 6-7 0-2 7-8 3-3 0-1 154 | 3-3 6-5 9-9 12-12 2-0 5-4 8-8 11-11 1-1 4-4 7-6 10-10 155 | 1-1 0-2 3-3 156 | 3-4 3-3 6-8 12-14 5-7 2-1 8-10 11-13 1-0 7-9 13-15 0-0 157 | 14-14 17-19 3-0 12-21 6-6 20-22 9-10 1-5 15-16 4-2 7-7 21-23 10-12 10-11 16-18 16-17 5-3 8-9 0-4 11-13 158 | 6-7 12-11 9-5 8-9 1-1 11-10 7-8 0-0 159 | 14-15 14-14 17-20 20-23 9-5 12-12 15-16 18-21 4-2 7-4 24-28 13-13 16-19 19-22 5-3 8-8 22-24 11-10 160 | 3-4 3-3 14-11 6-6 17-15 9-8 20-17 1-1 4-5 15-12 7-6 10-9 2-2 13-10 16-13 8-7 19-16 0-0 161 | 3-4 6-10 17-18 9-12 20-21 23-25 1-1 4-7 15-16 7-11 18-19 10-13 21-23 2-6 24-26 5-8 16-17 19-20 11-14 22-24 0-0 162 | 1-3 2-6 3-7 0-2 163 | 17-19 3-1 14-8 20-21 23-26 26-28 15-17 4-4 7-11 12-6 18-20 10-14 21-22 24-27 16-18 2-0 13-7 8-10 19-20 11-16 22-24 0-3 25-27 164 | 17-23 3-2 20-26 6-6 9-9 12-15 15-22 18-23 4-3 21-27 10-10 13-17 2-1 2-0 19-24 5-5 8-8 14-19 11-11 165 | 4-4 1-1 5-5 2-2 6-6 3-3 7-7 0-0 166 | 3-6 6-12 17-22 16-2 9-15 12-18 4-10 14-1 10-17 7-8 0-16 2-5 15-3 5-11 8-9 13-0 3-7 167 | 6-13 17-23 22-17 20-18 4-11 7-15 18-24 9-0 12-3 2-9 2-8 5-12 10-1 19-21 13-4 3-10 168 | 10-8 6-4 9-7 12-11 2-1 5-3 1-1 8-5 11-10 4-2 7-6 0-0 169 | 14-16 17-17 6-7 9-11 12-15 1-1 15-18 18-22 4-3 7-8 10-12 13-14 2-2 16-17 5-6 19-23 8-10 11-13 0-0 170 | 5-6 1-0 2-3 3-4 0-0 4-5 171 | 6-9 15-22 5-8 14-21 17-24 11-14 20-27 4-7 1-1 10-16 13-20 16-23 9-17 19-25 0-0 172 | 10-8 3-3 9-9 2-2 5-4 8-7 1-1 11-10 4-5 7-6 0-0 173 | 3-4 6-9 9-11 12-15 2-3 5-7 8-10 11-14 11-13 1-2 4-6 7-8 10-12 13-16 0-0 174 | 7-4 8-5 5-2 2-0 6-3 3-1 175 | 1-1 5-6 2-2 6-7 3-3 0-0 4-5 176 | 10-7 6-5 9-10 12-13 2-3 5-8 2-2 8-9 1-1 4-4 11-7 0-0 177 | 3-4 9-9 12-14 15-18 8-13 2-2 14-17 14-16 1-1 11-10 4-5 10-12 13-15 0-0 16-19 178 | 1-1 2-4 3-4 3-3 0-0 4-5 179 | 7-5 4-1 5-2 6-3 0-0 1-4 180 | 1-2 8-8 5-5 2-3 6-6 3-4 7-7 0-0 181 | 3-4 6-8 9-12 2-4 5-7 8-11 11-14 1-1 4-6 7-10 10-13 182 | 3-3 6-7 9-12 2-2 8-11 18-17 11-13 1-1 4-4 10-13 17-16 13-15 0-0 183 | 7-3 6-6 8-15 1-10 5-5 0-9 3-13 10-16 4-4 2-12 9-15 184 | 14-16 3-4 17-19 6-6 9-11 12-14 1-2 15-18 7-8 10-12 13-15 2-3 16-18 5-5 8-10 11-13 0-0 185 | 6-8 12-18 9-12 11-17 5-6 8-11 1-5 14-19 4-7 10-16 7-10 0-3 13-15 186 | 4-4 1-1 2-2 3-3 0-0 187 | 3-4 9-10 2-3 5-6 8-10 1-2 4-7 7-8 10-11 0-0 188 | 17-23 14-15 20-27 3-2 20-26 9-10 12-13 15-20 1-1 18-24 4-4 21-28 7-7 10-11 13-14 16-21 2-1 19-25 5-5 8-9 11-12 0-0 189 | 3-3 6-6 9-9 2-2 5-5 8-8 11-11 1-1 4-4 7-7 10-10 0-0 190 | 6-6 13-10 16-15 3-0 2-1 12-9 15-14 1-4 5-3 8-8 14-13 4-2 7-7 17-16 11-5 191 | 13-13 3-3 16-16 12-15 2-3 5-8 11-14 18-17 4-4 14-13 20-19 6-10 192 | 3-5 6-11 17-18 20-23 12-15 1-4 15-20 4-6 18-21 7-9 10-12 13-16 2-5 5-10 16-19 19-22 11-14 0-3 14-17 193 | 9-13 3-2 12-14 5-9 2-2 14-16 1-1 4-6 7-11 10-12 13-15 0-0 6-10 194 | 2-1 0-3 2-0 3-2 1-4 195 | 10-8 1-11 6-4 9-7 11-13 5-3 5-2 8-6 4-1 2-12 7-5 0-0 196 | 7-0 2-5 5-7 1-3 11-8 4-4 0-2 0-1 3-6 197 | 3-2 17-18 20-20 9-9 23-23 1-0 15-16 4-3 18-15 21-22 7-4 2-1 16-17 5-6 19-19 8-5 22-21 0-0 198 | 7-3 14-6 3-1 10-4 6-2 2-0 16-8 15-7 11-5 199 | 12-16 6-6 9-11 2-2 8-10 11-15 5-4 1-1 4-3 10-13 13-17 7-7 0-0 200 | 13-13 7-3 10-6 1-12 3-0 6-2 9-5 12-8 5-1 8-4 11-7 201 | 10-8 3-4 9-7 2-3 5-6 1-2 4-5 0-0 202 | 1-1 5-4 2-3 6-7 4-6 0-0 203 | 3-2 12-21 17-14 9-7 1-1 15-18 4-5 18-15 10-11 16-17 19-23 11-13 11-12 14-19 8-4 0-0 204 | 9-14 6-9 8-13 5-8 8-12 2-1 1-1 4-6 7-10 0-0 3-5 205 | 7-2 10-7 2-4 9-3 1-1 8-5 11-8 3-6 0-0 206 | 7-3 10-6 3-1 1-10 6-2 9-5 2-0 11-11 8-4 207 | 3-2 6-6 2-2 12-11 5-4 8-8 1-1 11-10 4-3 7-7 0-0 10-9 208 | 1-2 0-1 
0-0 209 | 13-13 3-4 6-9 9-12 2-3 8-11 5-5 1-2 7-10 0-0 210 | 14-12 3-1 6-5 20-22 9-9 12-11 15-16 4-2 7-7 18-17 7-6 21-23 13-13 2-0 5-4 19-21 8-8 22-24 11-10 211 | 10-7 7-2 3-0 16-13 6-1 5-5 5-4 15-10 8-6 4-4 11-8 17-14 212 | 25-26 14-14 3-3 17-19 6-6 20-22 9-9 23-24 1-1 15-15 4-4 18-20 7-7 21-23 10-11 13-16 24-25 2-2 5-5 19-21 8-8 22-23 0-0 213 | 4-3 1-1 3-2 0-0 214 | 9-14 12-17 2-6 5-9 8-13 11-16 1-5 1-4 4-8 7-12 10-15 13-18 0-2 3-7 0-1 0-0 6-10 215 | 3-4 13-13 16-16 19-19 2-3 5-7 15-15 8-9 18-18 11-11 1-1 4-5 14-14 7-8 17-17 10-10 0-0 216 | 3-4 6-8 8-14 2-3 8-13 11-17 5-6 1-2 7-12 4-5 0-0 9-15 12-20 217 | 11-9 20-20 2-25 6-2 17-12 15-10 21-21 24-28 18-13 7-3 3-23 13-8 1-24 5-1 16-11 19-16 25-29 8-4 218 | 2-5 3-7 0-3 4-8 1-4 219 | 6-11 3-4 12-21 20-24 9-14 15-20 1-2 4-9 7-12 26-25 21-24 2-3 5-10 8-13 11-19 14-22 0-1 0-0 220 | 3-3 6-5 9-8 2-1 5-4 8-7 1-2 4-3 7-6 10-10 0-0 221 | 14-16 3-3 17-19 9-10 12-12 1-1 15-17 4-4 18-20 7-5 10-11 13-13 2-2 16-18 8-9 0-0 222 | 14-15 3-1 20-24 6-6 23-28 9-10 12-13 15-17 18-21 4-2 21-25 7-7 24-29 10-11 13-18 16-20 16-19 2-0 19-23 8-8 11-12 223 | 3-4 15-18 5-7 2-1 14-19 7-12 17-21 4-6 10-14 13-16 16-20 6-11 0-0 224 | 12-19 18-25 2-5 5-9 11-18 1-4 9-2 4-8 15-10 13-20 0-3 8-1 19-26 3-6 225 | 11-9 14-12 3-1 6-3 9-7 23-23 12-10 15-13 4-2 18-18 7-4 21-21 10-8 13-11 8-5 22-22 0-0 226 | 14-14 17-21 3-3 6-8 23-26 12-13 26-29 15-19 1-0 18-23 4-4 7-9 24-28 13-15 16-20 2-1 5-5 19-22 8-10 11-16 22-25 25-27 227 | 2-7 5-11 8-14 11-19 8-13 1-4 4-9 7-12 10-15 9-18 0-0 3-5 6-10 228 | 25-25 20-26 6-6 17-14 9-9 1-1 23-20 4-3 21-27 26-22 7-7 18-16 10-10 13-12 24-21 27-28 5-5 16-13 8-8 19-18 19-17 22-24 0-0 229 | 6-8 13-12 9-11 16-13 12-14 2-4 5-7 8-10 1-1 4-6 14-15 7-9 0-3 3-5 230 | 14-14 17-16 6-5 20-19 9-9 12-13 1-1 15-17 7-6 24-25 10-7 2-2 16-15 5-4 5-3 8-8 19-18 22-22 25-29 0-0 11-10 231 | 14-14 17-17 6-6 20-19 9-8 1-1 12-11 1-0 4-4 15-14 18-18 7-7 21-21 10-9 2-3 13-12 5-5 16-15 19-20 8-8 0-2 232 | 14-15 3-3 17-20 20-23 6-6 9-9 1-1 15-16 4-4 18-21 7-7 21-24 10-10 13-14 2-2 16-17 5-5 19-22 8-8 11-13 0-0 233 | 3-3 11-22 7-20 2-2 1-1 4-6 6-14 5-18 0-0 8-21 234 | 3-1 3-0 16-13 12-10 5-4 15-12 1-2 8-5 11-9 7-7 235 | 3-5 19-10 22-15 6-8 25-19 11-2 15-21 4-6 20-11 23-14 2-4 5-7 21-12 10-1 24-16 27-22 0-3 14-20 236 | 16-17 3-2 6-5 9-9 15-16 8-8 11-11 1-1 14-15 4-4 7-7 10-10 0-0 237 | 13-13 3-1 6-6 9-8 12-12 2-0 15-14 1-5 8-8 11-10 14-14 0-3 4-2 7-7 17-15 238 | 1-13 3-1 6-3 9-6 12-11 15-15 5-2 8-5 14-14 4-1 10-10 0-0 7-4 239 | 3-4 6-6 2-3 5-5 1-1 7-7 4-2 0-0 240 | 11-7 3-2 17-17 1-1 9-4 23-21 18-18 15-9 10-6 13-10 5-3 16-13 19-19 8-4 0-0 22-20 241 | 6-8 9-11 12-13 2-3 15-16 5-7 8-10 1-2 14-15 4-6 7-9 10-12 0-0 13-14 3-5 242 | 1-2 2-3 3-6 3-5 4-7 0-1 5-10 6-14 243 | 3-4 9-12 5-8 8-11 4-7 1-1 0-0 244 | 23-28 9-10 15-22 12-14 1-2 18-24 7-8 10-13 16-23 13-15 2-3 19-26 5-7 22-27 8-9 0-1 11-11 14-18 17-25 245 | 3-5 14-13 6-7 20-23 9-10 15-21 1-1 18-20 7-8 10-11 13-16 16-22 2-3 5-6 8-9 19-17 11-12 0-0 246 | 5-6 1-0 2-3 3-5 3-4 0-0 4-5 247 | 2-4 4-14 8-11 5-5 3-13 1-1 0-3 7-7 9-15 248 | 10-3 5-8 12-11 9-2 4-7 8-1 7-6 11-5 249 | 19-11 22-13 14-8 12-19 12-18 1-1 20-10 4-3 23-15 13-20 15-7 24-25 18-9 2-2 5-4 11-17 8-5 0-0 250 | 16-4 6-10 9-17 12-20 14-5 4-9 23-22 1-1 22-2 7-12 10-18 20-3 24-23 5-8 8-13 11-19 13-7 0-0 251 | 3-3 6-6 2-2 5-5 7-12 1-2 4-4 0-0 252 | 3-0 0-2 4-6 1-5 253 | 14-16 22-19 3-3 25-22 6-7 9-10 12-13 1-1 15-17 4-5 23-18 26-25 7-8 10-11 21-20 13-15 2-2 24-21 5-6 8-9 11-11 0-0 254 | 19-22 12-16 3-0 9-10 15-18 15-17 18-21 14-20 1-5 8-9 5-3 11-12 7-8 4-1 10-11 16-19 255 | 3-2 
14-9 17-15 20-19 12-12 23-20 4-3 10-10 21-18 13-13 24-21 2-1 16-17 8-11 19-16 0-0 11-10 256 | 3-4 6-7 9-10 2-2 5-6 8-9 1-1 4-5 7-8 0-0 257 | 4-3 1-2 5-4 3-3 0-1 0-0 258 | 11-23 2-7 5-12 8-17 5-11 1-6 4-10 10-20 7-14 0-4 3-9 9-19 12-24 259 | 6-8 9-12 12-15 2-2 5-7 8-11 8-10 11-14 1-2 14-16 4-6 7-9 10-13 0-1 13-15 3-5 260 | 17-15 17-14 1-7 20-19 6-0 4-9 9-5 9-4 12-11 15-13 18-17 21-20 7-1 10-6 13-12 16-13 19-18 3-8 8-3 11-10 261 | 17-20 11-4 7-22 4-14 1-1 9-3 15-17 12-5 13-21 5-16 5-15 2-2 16-19 19-23 0-0 262 | 3-4 6-8 9-13 12-17 15-20 1-2 1-1 18-24 18-23 4-5 7-10 13-18 16-22 2-3 19-25 5-7 8-11 11-16 14-19 0-0 263 | 17-21 3-3 20-24 6-7 6-6 9-12 12-16 18-25 15-17 4-4 21-26 7-8 10-13 16-20 2-0 19-23 5-5 8-9 11-15 11-14 0-2 264 | 14-14 3-3 17-17 6-6 9-9 12-12 1-1 15-15 4-4 18-18 7-7 10-10 13-13 2-2 16-16 5-5 19-19 8-8 11-11 0-0 265 | 13-12 16-17 9-11 2-5 5-8 15-15 8-10 1-4 4-7 14-16 7-9 3-6 0-0 266 | 6-5 2-4 8-9 4-8 1-3 1-2 10-10 0-0 267 | 14-14 17-18 20-24 6-5 9-8 1-2 12-11 15-15 4-3 18-19 7-6 10-10 21-20 10-9 2-4 13-12 16-17 19-23 22-25 8-7 0-0 268 | 9-10 2-4 2-3 8-9 4-7 1-2 7-8 10-11 0-1 269 | 17-20 3-2 6-6 20-23 9-9 12-16 15-18 18-21 4-3 21-26 7-7 10-13 16-19 13-11 2-0 22-29 19-22 5-4 0-5 8-8 11-14 14-17 270 | 22-17 3-3 9-10 28-21 20-16 1-1 4-5 23-18 4-4 26-19 18-14 2-2 5-8 8-9 27-20 19-15 0-0 271 | 14-14 3-3 17-16 6-6 9-8 12-13 1-1 15-15 18-22 4-4 7-7 10-10 2-2 13-12 16-16 5-5 8-9 11-11 0-0 272 | 3-6 9-17 17-19 14-10 12-18 4-11 1-2 7-13 18-20 10-16 10-15 15-9 2-5 5-12 16-19 0-0 273 | 3-6 17-22 6-8 9-11 12-14 15-19 18-23 4-3 7-9 10-12 13-15 16-20 2-1 5-7 8-10 11-13 14-18 0-0 274 | 20-26 6-8 9-11 12-16 15-19 1-1 4-6 18-23 18-22 21-27 7-9 10-15 13-20 2-4 5-7 19-24 16-17 22-28 11-14 14-18 0-0 275 | 7-2 2-8 5-11 10-4 12-12 4-10 9-0 8-3 3-9 276 | 6-13 3-5 25-21 17-16 9-10 20-19 1-2 4-6 15-15 12-7 26-23 26-22 18-17 10-9 5-11 2-2 24-20 27-24 16-14 19-18 277 | 4-4 1-2 5-5 2-3 3-4 0-0 278 | 14-15 17-18 20-23 6-4 23-26 9-9 1-5 12-14 15-16 4-3 18-19 7-7 21-24 10-10 13-14 16-17 5-6 19-20 8-8 22-25 0-2 279 | 5-7 2-2 3-3 4-6 1-5 0-0 280 | 16-18 3-3 13-11 12-10 15-13 1-1 4-5 14-14 11-9 4-4 0-0 281 | 13-11 10-6 3-1 16-13 9-4 15-12 18-14 8-3 4-2 11-5 0-0 17-13 282 | 11-8 22-17 14-11 25-19 6-3 9-6 20-15 12-9 23-17 15-12 4-1 26-20 7-4 10-7 21-16 13-10 24-18 16-14 5-2 8-4 0-0 283 | 16-17 2-5 5-10 8-14 15-16 11-15 1-4 4-9 7-12 0-6 10-15 3-7 284 | 3-3 5-12 12-16 9-7 18-22 2-2 11-16 4-10 1-4 0-9 17-21 7-11 10-15 13-17 16-19 285 | 16-5 3-3 12-15 4-11 9-7 23-23 1-1 17-6 7-10 29-25 13-16 5-13 10-8 8-9 19-17 22-24 0-0 6-14 286 | 9-14 12-18 15-22 2-7 8-15 11-18 6-1 14-20 1-5 17-23 10-17 5-0 13-21 4-4 3-8 16-22 287 | 6-8 9-11 12-14 2-3 5-7 8-10 11-13 1-0 7-9 0-4 10-12 3-6 288 | 3-2 6-8 9-11 15-17 4-3 7-9 10-11 21-21 13-15 5-4 8-10 11-12 14-18 0-0 289 | 5-9 5-8 2-1 1-4 4-7 4-6 7-10 0-3 3-5 290 | 1-1 7-11 2-2 4-7 0-0 291 | 16-17 3-1 6-6 12-15 13-9 15-16 5-5 9-4 9-3 8-7 14-16 292 | 3-4 6-6 9-9 2-3 5-6 8-8 1-1 4-5 7-7 0-0 293 | 8-8 4-2 5-5 9-9 6-6 3-1 294 | 7-5 1-1 4-2 8-6 9-7 3-2 6-3 0-0 295 | 5-8 1-1 6-9 2-3 3-7 4-6 0-0 4-5 296 | 2-5 9-9 5-8 8-11 1-6 11-15 13-22 7-10 0-4 10-12 3-7 12-21 297 | 9-11 2-4 6-2 8-10 1-5 11-14 5-1 10-13 0-3 298 | 10-7 13-11 5-10 16-13 5-9 8-8 15-12 18-16 1-2 4-4 21-17 14-12 17-15 0-0 299 | 6-11 17-21 20-24 15-22 12-14 1-1 4-7 7-12 10-17 13-18 16-23 2-5 5-6 8-13 3-10 0-0 300 | 4-4 0-7 6-8 2-2 1-6 3-3 301 | 7-3 3-2 9-6 2-1 5-5 0-10 11-13 1-0 8-4 10-12 302 | 6-9 12-15 2-3 5-8 11-14 4-7 7-11 1-0 0-2 13-16 0-1 3-6 10-10 303 | 4-4 1-1 2-2 3-3 0-0 304 | 4-4 1-1 2-2 3-3 305 | 13-11 6-6 9-9 2-2 5-5 
15-14 8-8 1-1 14-13 4-4 7-7 10-10 0-0 306 | 3-4 9-14 6-7 12-15 5-6 8-9 1-2 1-1 11-10 4-5 7-8 10-13 0-0 307 | 7-3 4-2 1-0 8-5 9-6 6-4 0-1 308 | 9-14 16-16 6-2 8-13 12-12 15-16 18-19 5-0 14-15 7-11 0-4 3-5 309 | 3-3 10-5 16-14 13-8 6-4 2-2 9-5 15-12 12-7 8-6 1-1 14-11 0-0 7-4 310 | 14-16 17-21 3-4 6-7 9-10 12-14 1-2 15-17 4-5 7-8 10-11 13-15 16-20 2-3 5-6 8-9 11-13 0-1 311 | 14-14 17-18 3-0 1-5 12-12 15-15 4-1 10-11 2-6 13-13 5-7 16-16 19-23 8-9 0-3 312 | 13-12 10-6 12-11 6-1 8-9 5-2 11-10 14-13 4-0 7-4 313 | 3-2 6-5 16-13 9-8 16-12 19-17 5-4 15-11 8-6 1-1 11-10 4-3 17-16 14-11 0-0 10-9 314 | 1-1 5-6 2-2 6-7 3-4 3-3 0-0 4-5 315 | 3-4 6-7 20-20 17-11 1-1 4-5 12-8 15-9 18-12 2-2 5-6 11-19 16-10 19-16 0-0 316 | 14-14 17-18 20-22 9-6 12-12 26-28 15-15 18-20 4-1 21-23 7-5 24-27 10-9 13-13 27-29 16-16 19-21 22-24 8-4 11-11 25-28 0-0 317 | 3-5 14-15 17-18 6-8 9-11 1-1 23-21 15-16 26-22 18-17 10-10 13-14 2-4 16-19 5-7 8-9 0-0 22-20 318 | 17-21 3-3 6-8 20-23 12-16 15-19 1-1 4-7 7-12 18-22 10-13 13-17 24-24 5-11 5-10 16-20 2-2 8-15 22-25 11-14 25-29 14-18 0-0 319 | 6-9 9-13 2-6 3-0 5-8 8-11 1-4 7-10 10-14 4-1 320 | 9-12 15-21 4-9 7-14 14-18 1-1 16-26 10-15 3-8 6-13 13-17 0-0 321 | 17-12 3-3 10-7 13-10 9-6 5-5 12-8 1-1 18-13 11-9 4-4 14-11 0-0 322 | 3-4 12-19 9-9 9-8 1-2 4-5 13-25 7-7 16-26 10-11 2-3 5-6 14-22 17-27 11-12 11-11 0-0 323 | 2-10 9-13 6-3 11-17 5-6 14-20 8-9 3-11 4-5 13-19 10-14 7-8 0-2 324 | 1-1 5-6 2-3 6-8 3-5 7-9 0-0 4-5 325 | 6-7 9-9 2-2 5-6 8-8 1-1 0-0 326 | 17-23 3-1 14-11 6-5 20-22 18-24 4-3 15-12 21-26 7-6 8-17 2-2 19-25 5-4 16-8 0-0 327 | 4-4 1-2 5-7 2-3 3-6 0-0 328 | 3-2 9-8 2-1 5-5 8-7 1-0 0-4 4-3 7-6 10-9 329 | 20-17 3-3 10-6 9-10 19-13 8-8 12-7 15-12 18-16 14-11 17-14 0-0 7-4 330 | 3-2 16-15 6-6 9-9 12-14 2-1 5-5 11-13 8-8 1-0 7-7 10-11 10-10 331 | 3-1 10-4 13-7 5-9 8-13 2-0 1-3 4-8 14-14 7-10 0-2 11-5 6-11 332 | 4-3 8-6 5-4 9-8 2-0 10-9 3-2 333 | 1-17 17-20 11-5 14-11 6-1 2-18 12-6 15-13 7-2 3-19 10-4 13-7 16-14 0-0 8-3 334 | 14-15 22-18 17-21 6-6 9-12 12-13 20-16 1-2 15-19 23-22 4-4 7-8 13-14 21-17 2-3 16-20 5-5 8-11 0-1 3-7 335 | 17-23 6-9 20-24 9-11 15-19 1-1 4-6 18-22 10-13 13-17 2-5 16-20 5-8 19-22 8-10 11-14 0-3 14-18 3-7 336 | 7-4 1-1 8-5 2-3 6-4 3-2 0-0 337 | 3-4 14-12 17-16 6-6 1-2 12-11 4-3 15-13 7-9 18-17 10-7 2-1 5-5 16-15 19-18 0-0 11-10 338 | 5-7 1-1 6-8 3-5 4-6 0-0 339 | 1-3 5-8 2-6 4-7 0-1 340 | 1-3 0-0 341 | 6-4 17-14 9-10 12-17 1-5 15-21 18-23 21-28 7-6 16-24 19-25 11-20 5-3 8-7 342 | 1-1 8-5 2-3 9-6 3-2 0-0 10-7 343 | 17-21 3-2 12-20 1-10 9-12 6-4 15-18 4-3 10-15 7-6 16-18 8-13 5-5 11-19 0-1 14-17 344 | 7-2 13-7 6-3 6-2 9-5 2-1 12-6 1-0 8-4 14-8 345 | 22-27 17-10 7-0 12-16 6-5 15-20 16-10 21-26 11-15 14-19 19-11 4-2 8-1 13-17 346 | 22-18 3-3 6-7 14-9 28-25 1-1 12-11 23-20 4-4 15-13 18-14 21-17 2-2 13-12 24-21 5-6 5-5 27-24 19-15 0-0 11-10 347 | 7-5 4-2 1-1 2-1 6-4 6-3 0-0 348 | 12-17 6-6 9-11 2-2 5-5 8-10 14-19 11-13 17-21 1-1 4-4 7-9 13-18 10-12 16-20 0-0 349 | 3-2 13-10 16-12 9-6 2-1 12-10 5-3 1-1 8-5 14-11 0-0 17-13 10-9 7-4 350 | 4-4 1-1 5-5 2-3 6-6 7-7 0-0 351 | 6-10 3-1 9-11 12-14 15-20 1-0 4-2 7-7 10-12 13-18 2-1 5-6 8-8 11-13 14-19 0-0 352 | 13-12 2-5 12-14 1-4 8-7 14-15 11-10 0-3 7-6 4-0 10-9 353 | 6-9 9-14 6-8 15-18 2-2 5-7 8-11 14-17 1-1 4-5 7-10 10-15 13-16 3-6 0-0 16-19 354 | 11-8 4-15 9-11 9-10 15-23 15-22 7-18 18-28 2-14 12-5 5-16 10-12 16-25 8-19 0-9 14-26 3-13 6-17 17-27 355 | 19-9 3-3 6-5 17-12 9-7 1-1 20-11 4-4 20-10 15-13 7-6 10-8 21-15 2-2 16-14 8-7 0-0 356 | 6-4 9-8 12-12 5-3 8-7 11-10 11-9 0-5 13-16 7-6 10-11 357 | 3-3 20-27 9-12 
15-25 6-4 12-15 21-28 7-9 10-14 10-13 16-24 13-16 2-2 19-26 5-5 8-10 11-14 0-0 14-17 358 | 2-9 10-2 10-1 1-8 13-5 4-12 9-0 0-7 12-4 7-11 14-13 11-3 359 | 3-2 6-4 9-7 2-1 5-3 8-6 1-0 4-3 7-5 0-0 360 | 6-8 9-11 5-7 2-1 8-10 4-6 1-0 7-9 0-3 3-5 361 | 3-0 6-5 9-10 6-4 6-3 12-13 8-9 11-12 5-1 7-8 4-2 10-11 13-14 362 | 4-4 1-1 5-5 2-3 6-6 7-7 0-0 363 | 7-14 3-8 4-9 0-3 5-12 1-4 2-8 6-13 364 | 17-15 12-16 15-20 1-2 4-4 18-21 7-5 10-12 13-18 2-3 11-17 8-8 14-19 0-1 0-0 365 | 6-8 6-7 3-1 2-3 5-6 1-2 8-5 7-9 4-4 10-10 0-0 366 | 1-3 2-1 3-5 8-12 0-0 367 | 1-1 0-0 368 | 1-1 6-8 2-2 7-9 3-4 0-0 4-5 369 | 3-2 9-11 12-14 15-16 2-1 8-11 14-19 11-13 4-7 1-1 6-17 13-21 16-22 10-12 0-0 370 | 3-6 3-5 25-25 6-10 17-19 9-12 12-16 23-23 15-20 1-2 4-7 26-26 7-11 10-15 10-14 13-17 24-24 2-3 5-9 27-27 22-21 0-0 371 | 3-6 9-17 1-4 7-18 15-14 10-12 13-16 8-19 16-21 2-4 11-20 0-1 372 | 3-4 9-12 12-16 8-15 2-3 5-6 7-14 1-1 4-5 11-8 0-0 6-10 10-9 373 | 4-2 6-7 7-8 1-5 0-0 1-4 374 | 5-7 2-4 3-2 0-0 4-5 375 | 3-3 14-13 17-18 6-4 20-19 9-8 9-7 23-23 1-1 12-11 15-14 4-2 18-16 21-20 10-9 13-12 16-15 22-22 0-0 11-10 376 | 18-4 2-7 9-11 5-10 14-1 11-18 17-3 22-19 20-6 13-2 4-9 20-5 3-8 12-0 377 | 3-4 7-2 9-13 12-15 6-1 4-7 5-0 10-14 378 | 6-5 3-0 2-2 1-3 9-1 10-14 379 | 3-4 6-7 9-12 12-15 2-3 8-11 5-6 11-14 1-2 4-5 0-1 0-0 380 | 8-9 6-0 2-5 7-2 3-6 5-3 4-8 1-4 381 | 1-1 2-2 3-5 4-6 0-0 382 | 3-4 25-24 14-11 6-6 17-14 9-9 1-2 23-20 26-25 15-12 7-7 18-16 18-15 10-8 2-3 24-21 13-10 27-26 5-5 16-13 19-17 0-1 383 | 3-5 20-27 9-13 6-2 15-21 15-20 18-26 1-1 4-6 7-11 10-14 13-18 16-22 2-4 5-7 19-24 8-12 11-15 0-3 14-19 384 | 6-7 12-15 15-20 9-9 2-3 14-20 5-4 11-14 8-8 1-1 10-13 7-6 16-21 13-16 0-0 385 | 3-2 17-18 6-8 9-11 12-16 1-1 4-3 7-9 18-19 10-12 13-15 2-4 5-5 8-10 19-20 11-13 0-0 386 | 6-13 3-5 15-25 1-2 7-14 12-10 4-6 10-15 16-26 19-28 2-3 11-22 17-27 0-0 14-17 387 | 3-3 6-7 9-10 2-3 5-7 11-14 8-9 1-2 4-6 7-8 13-15 0-0 388 | 3-3 6-7 9-12 2-2 8-11 5-5 11-14 1-1 7-10 10-13 13-15 0-0 16-19 389 | 10-6 1-12 6-4 11-20 7-19 4-14 5-7 9-5 3-13 2-13 0-1 390 | 22-18 3-4 25-19 6-6 28-23 20-16 1-2 23-20 15-14 7-7 10-12 29-24 21-17 2-3 24-21 5-5 27-22 8-8 19-15 0-1 391 | 6-9 9-13 12-14 2-4 5-8 17-28 8-12 15-16 16-27 1-1 4-6 14-15 7-10 13-17 0-0 3-5 392 | 4-3 5-4 9-7 6-6 2-0 3-2 10-8 393 | 9-13 3-2 6-5 12-14 8-8 11-10 4-1 7-6 10-11 13-15 0-0 394 | 5-9 2-1 1-5 4-7 3-6 0-0 6-10 395 | 14-16 25-25 17-20 28-28 1-10 20-23 9-8 12-13 15-16 26-26 18-21 7-7 21-24 10-12 13-15 16-18 27-27 19-22 11-14 8-5 0-0 396 | 6-11 2-4 3-8 7-12 4-9 1-6 5-10 0-0 397 | 17-23 3-2 20-25 6-5 9-10 12-12 18-24 15-14 4-3 7-6 10-11 13-15 2-1 16-18 19-22 5-4 8-7 11-13 398 | 6-9 2-4 5-7 11-13 4-8 7-11 10-12 0-0 3-5 399 | 3-4 6-7 9-12 12-15 2-2 5-6 8-10 11-14 8-9 1-1 4-5 7-8 10-13 13-16 0-0 400 | 3-2 6-7 2-1 5-6 4-5 1-0 0-4 7-8 401 | 3-1 6-8 9-11 12-16 1-3 15-18 4-6 7-9 10-13 13-17 16-21 2-4 5-7 8-10 11-15 14-20 0-2 402 | 2-2 3-6 0-1 1-5 403 | 1-1 5-2 6-5 0-0 7-6 404 | 3-4 13-9 16-12 2-3 19-15 12-8 15-11 1-1 18-14 11-7 6-13 14-10 0-0 3-5 405 | 3-4 14-13 17-17 9-8 1-2 12-11 4-5 15-14 18-18 10-9 21-19 2-3 13-12 5-6 16-15 8-7 0-1 0-0 11-10 22-20 406 | 17-23 23-28 9-11 6-3 9-10 12-13 15-20 18-27 1-1 4-4 7-5 10-12 2-8 2-7 13-16 16-22 16-21 19-24 5-2 11-15 14-19 0-0 407 | 8-16 14-24 1-7 1-6 2-22 7-15 11-12 5-23 3-9 6-14 0-2 4-1 15-25 408 | 3-5 14-14 17-18 20-22 9-9 6-1 23-23 12-12 4-7 15-15 18-19 21-22 10-10 13-13 5-8 16-17 19-22 22-22 11-11 409 | 5-9 9-5 1-4 11-13 4-7 3-12 0-3 2-11 410 | 1-14 4-16 7-21 12-10 15-9 15-8 8-22 16-23 0-13 2-2 11-12 11-11 3-7 17-24 411 | 3-3 6-7 16-14 9-10 
12-13 2-2 2-1 5-5 8-9 11-11 4-4 7-8 0-0 412 | 9-17 20-26 6-9 15-24 4-10 12-13 26-29 7-8 16-25 13-14 5-7 8-11 11-12 413 | 3-4 14-11 20-23 20-22 12-15 9-7 15-19 1-1 18-20 21-24 10-8 2-4 19-21 22-25 8-6 0-2 11-12 414 | 14-14 3-3 6-4 9-11 20-20 12-13 1-1 15-17 15-16 7-5 10-12 13-15 2-2 16-18 5-6 19-19 0-0 415 | 6-9 9-13 3-2 2-5 8-12 11-15 5-4 1-1 7-11 10-14 4-3 0-0 416 | 20-27 3-2 17-19 6-7 9-9 12-14 15-21 1-1 18-24 4-5 7-12 21-28 13-15 16-22 10-8 2-3 19-26 5-6 22-29 8-9 11-13 14-18 0-0 417 | 13-12 10-7 3-2 9-6 2-1 9-5 12-10 5-4 1-0 14-13 11-8 4-3 0-0 418 | 3-4 6-9 9-12 12-16 2-3 5-6 8-11 11-15 1-2 4-7 7-10 10-14 10-13 0-0 3-5 419 | 5-12 2-3 7-14 1-2 3-11 4-4 6-13 0-0 420 | 3-6 17-19 9-11 23-27 9-10 12-16 1-4 26-29 4-8 15-18 18-20 10-12 24-28 13-16 2-5 5-9 19-24 22-26 0-2 25-28 6-14 14-17 421 | 3-3 14-12 17-18 20-23 1-1 12-10 4-6 18-22 15-13 5-16 2-2 13-11 16-14 19-21 8-9 6-15 0-0 422 | 11-8 3-4 3-3 14-13 17-19 1-1 15-18 12-10 4-5 18-20 10-7 2-2 5-6 13-9 19-21 0-0 423 | 4-4 1-2 1-1 5-6 3-5 424 | 7-2 10-6 0-18 3-23 3-22 6-1 9-5 9-4 12-8 15-11 5-0 8-3 16-24 11-7 14-10 425 | 3-3 9-12 12-16 6-6 5-10 2-2 8-11 11-15 1-1 4-4 7-9 10-14 426 | 3-3 6-8 2-7 16-15 12-11 5-5 15-13 1-2 14-14 7-9 4-4 10-10 0-0 427 | 9-11 5-3 8-8 5-2 0-7 4-6 4-5 7-9 428 | 4-3 1-1 2-1 3-2 0-0 429 | 12-17 6-5 9-10 2-2 8-9 11-14 1-1 4-4 13-18 7-6 10-11 0-0 16-19 430 | 14-16 3-3 6-9 20-23 9-12 23-26 12-14 1-1 15-18 4-6 18-21 7-10 21-24 10-13 13-15 2-2 5-8 16-18 19-22 8-11 22-25 431 | 6-10 17-20 20-24 9-13 12-16 1-2 4-8 18-22 7-11 18-21 21-25 10-14 2-6 16-19 5-9 8-13 19-23 11-15 0-1 3-7 432 | 13-11 16-16 9-10 2-5 6-4 12-12 8-9 5-3 1-2 4-7 14-15 11-8 3-6 0-0 433 | 3-4 16-17 6-7 9-10 2-3 15-16 5-6 8-9 11-12 1-2 14-15 4-5 17-18 7-8 10-11 0-0 13-14 434 | 13-13 6-4 9-7 12-12 2-2 11-11 8-6 1-0 4-3 0-1 7-5 435 | 9-13 3-2 15-18 5-8 2-1 8-11 1-6 11-15 14-17 7-10 10-14 4-3 13-16 0-0 436 | 5-13 13-10 10-5 2-3 9-6 4-11 12-8 26-15 1-2 3-11 8-4 18-12 6-14 11-7 14-9 0-0 437 | 3-3 16-15 6-5 9-10 12-13 2-3 18-19 5-4 8-9 1-2 11-11 17-18 7-7 0-0 13-14 438 | 10-7 5-10 2-3 9-6 15-15 4-12 12-8 1-2 14-14 7-5 0-0 439 | 25-25 3-3 14-13 28-28 17-17 6-5 9-8 23-23 1-0 15-16 26-26 4-4 18-20 7-6 21-21 10-9 24-24 2-2 13-12 27-27 16-15 19-19 8-7 22-22 0-0 11-10 440 | 11-9 25-24 3-3 14-12 6-5 20-18 9-7 23-23 1-1 12-10 4-4 15-13 18-16 21-19 10-8 24-23 2-2 5-6 16-14 19-17 0-0 22-20 441 | 6-9 2-3 8-13 5-8 5-7 7-14 1-1 3-11 4-5 0-0 9-15 442 | 1-2 5-5 2-3 6-8 3-4 4-7 0-0 443 | -------------------------------------------------------------------------------- /data.txt: -------------------------------------------------------------------------------- 1 | 2 | # We take the text, tokenize it, maxlength 40 it, 3 | # lemmatize it, and align it to get: 4 | 5 | cd data/maxlength40-lemmas 6 | 7 | mkdir en-fr ; cp ~/data/multitext/align-lemmas/aligner.fr-en/training.?? en-fr/ ; cp ~/data/multitext/align-lemmas/aligner.fr-en/training.align en-fr/training.align.fr-en 8 | mkdir en-nl ; cp ~/data/multitext/align-lemmas/aligner.nl-en/training.?? en-nl/ ; cp ~/data/multitext/align-lemmas/aligner.nl-en/training.align en-nl/training.align.nl-en 9 | mkdir en-de ; cp ~/data/multitext/align-lemmas/aligner.de-en/training.?? en-de/ ; cp ~/data/multitext/align-lemmas/aligner.de-en/training.align en-de/training.align.de-en 10 | mkdir en-it ; cp ~/data/multitext/align-lemmas/aligner.it-en/training.?? en-it/ ; cp ~/data/multitext/align-lemmas/aligner.it-en/training.align en-it/training.align.it-en 11 | mkdir en-es ; cp ~/data/multitext/align-lemmas/aligner.es-en/training.?? 
en-es/ ; cp ~/data/multitext/align-lemmas/aligner.es-en/training.align en-es/training.align.es-en 12 | 13 | ../../scripts/preprocess/reverse-alignment.pl en-de/training.align.de-en 14 | ../../scripts/preprocess/reverse-alignment.pl en-nl/training.align.nl-en 15 | ../../scripts/preprocess/reverse-alignment.pl en-it/training.align.it-en 16 | ../../scripts/preprocess/reverse-alignment.pl en-es/training.align.es-en 17 | ../../scripts/preprocess/reverse-alignment.pl en-fr/training.align.fr-en 18 | 19 | # At this point, we run the preprocessing, because we want to build the 20 | # initial embeddings BEFORE filtering 21 | 22 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-de/training.en en-de/training.de en-de/training.align.en-de & 23 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-it/training.en en-it/training.it en-it/training.align.en-it & 24 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-fr/training.en en-fr/training.fr en-fr/training.align.en-fr & 25 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-nl/training.en en-nl/training.nl en-nl/training.align.en-nl & 26 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-es/training.en en-es/training.es en-es/training.align.en-es & 27 | 28 | ################################################################## 29 | ### Below is deprecated 30 | ################################################################## 31 | 32 | ##### Wait, using freeling for es will changes the number of tokens. grr.... 33 | ##### Let's use treetagger for es instead 34 | 35 | 36 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.en data/filtered-full-bilingual/ 37 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.fr data/filtered-full-bilingual/ 38 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.align data/filtered-full-bilingual/training.align.fr-en 39 | 40 | ../../scripts/preprocess/reverse-alignment.pl en-fr/training.align.fr-en 41 | 42 | Tadpole --skip=tmp -t ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.nl | perl -ne 's/\t/ /g; print lc($_);' | chop 3 | from-one-line-per-word-to-one-line-per-sentence.py > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.nl 43 | 44 | ~/utils/src/treetagger-3.2/lemmatizer.py french-utf8 ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-fr/training.fr > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.fr 45 | ~/utils/src/treetagger-3.2/lemmatizer.py german ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.de 46 | ~/utils/src/treetagger-3.2/lemmatizer.py spanish ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.es> ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.es 47 | 48 | ~/utils/src/libiconv-1.13.1/src/iconv --byte-subst="<0x%x>" --unicode-subst="<0x%x>" ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de.latin1 49 | 
~/utils/src/treetagger-3.2/lemmatizer.py german ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de.latin1 > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.de 50 | 51 | cd /u/turian/utils/src/FreeLing-2.1/src/main/simple_examples 52 | ./justmorph.py it ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-it/filtered-training.it > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.it 53 | #./justmorph.py es ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.es > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.es 54 | 55 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.en 56 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-fr/training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.en 57 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.en 58 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.en 59 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-it/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.en 60 | 61 | cp ../filtered-full-bilingual/en-fr/filtered-training.align.en-fr ../filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.align.en-fr 62 | cp ../filtered-full-bilingual/en-nl/filtered-training.align.en-nl ../filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.align.en-nl 63 | cp ../filtered-full-bilingual/en-de/filtered-training.align.en-de ../filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.align.en-de 64 | cp ../filtered-full-bilingual/en-es/filtered-training.align.en-es ../filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.align.en-es 65 | cp ../filtered-full-bilingual/en-it/filtered-training.align.en-it ../filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.align.en-it 66 | -------------------------------------------------------------------------------- /data/README.txt: -------------------------------------------------------------------------------- 1 | allwords.gz is from Childes corpus, Eng-USA/allwords.gz 2 | 3 | Create vocabulary: 4 | zcat allwords.gz | sort | uniq -c | sort -rn > allwords.vocabulary.txt 5 | 6 | test is the first 10K words. 7 | validation is the next 10K words. 8 | train is the rest. 9 | 10 | ============= 11 | 12 | wikitext.txt.gz is preprocessed English wikipedia, broken into sentences and 13 | tokenized and shuffled. 
14 | 15 | ls | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | ../../scripts/preprocess.pl | grep . | ~/common/scripts/shuffle.sh | gzip -c > ../wikitext.txt.gz 16 | 17 | zcat wikitext.txt.gz | head -10000 | gzip -c > wikitext.test.txt.gz 18 | zcat wikitext.txt.gz | head -20000 | tail -10000 | gzip -c > wikitext.validation.txt.gz 19 | zcat wikitext.txt.gz | tail -66151742 | gzip -c > wikitext.train.txt.gz 20 | 21 | ============= 22 | 23 | italian-wikitext.txt.gz is preprocessed Italian wikipedia: 24 | 25 | bzcat ~/data/italian_SemaWiki_attardi/3_tokenized.txt.bz2 | ~/data/italian_SemaWiki_attardi/one-sentence-per-line.pl | ../scripts/preprocess.pl | grep . | ~/common/scripts/shuffle.sh | gzip -c > italian-wikitext.txt.gz 26 | 27 | zcat italian-wikitext.txt.gz | head -10000 | gzip -c > italian-wikitext.test.txt.gz 28 | zcat italian-wikitext.txt.gz | head -20000 | tail -10000 | gzip -c > italian-wikitext.validation.txt.gz 29 | zcat italian-wikitext.txt.gz | tail -5672365 | gzip -c > italian-wikitext.train.txt.gz 30 | 31 | # Sanity check 32 | zcat italian-wikitext.test.txt.gz italian-wikitext.validation.txt.gz italian-wikitext.train.txt.gz | md5sum 33 | zcat italian-wikitext.txt.gz | md5sum 34 | 35 | ../scripts/examples.py italian-wikitext.validation.txt.gz | ~/common/scripts/shuffle.sh | head -1000 | gzip -c > italian-wikitext.validation-1000.txt.gz 36 | 37 | 38 | # Vocabulary 39 | zcat italian-wikitext.train.txt.gz | perl -ne 's/ /\n/g; print' | grep . | sort | uniq -c | sort -rn | gzip -c > vocabulary-italian-wikitext.txt.gz 40 | zcat vocabulary-italian-wikitext.txt.gz | head -20000 | gzip -c > vocabulary-italian-wikitext-20000.txt.gz 41 | 42 | ============= 43 | 44 | For case sensitive embeddings: 45 | 46 | find wikitext/ | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | grep . | ~/common/scripts/shuffle.sh | gzip -c > english-wikitext.case-intact.txt.gz 47 | 48 | zcat english-wikitext.case-intact.txt.gz | head -10000 | gzip -c > english-wikitext.case-intact.test.txt.gz 49 | zcat english-wikitext.case-intact.txt.gz | head -20000 | tail -10000 | gzip -c > english-wikitext.case-intact.validation.txt.gz 50 | zcat english-wikitext.case-intact.txt.gz | tail -66151742 | gzip -c > english-wikitext.case-intact.train.txt.gz 51 | 52 | # Sanity check 53 | zcat english-wikitext.case-intact.test.txt.gz english-wikitext.case-intact.validation.txt.gz english-wikitext.case-intact.train2.txt.gz | md5sum 54 | zcat english-wikitext.case-intact.txt.gz | md5sum 55 | 56 | # Vocabulary 57 | zcat english-wikitext.case-intact.train2.txt.gz | perl -ne 's/ /\n/g; print' | grep . 
| sort -T /cluster/paralisi3/turian/tmp | uniq -c | sort -rn | gzip -c > vocabulary-english-wikitext.case-intact.txt.gz 58 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -20000 | gzip -c > vocabulary-english-wikitext.case-intact-20000.txt.gz 59 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -50000 | gzip -c > vocabulary-english-wikitext.case-intact-50000.txt.gz 60 | 61 | # Enter scripts directory 62 | ./build-vocabulary.py 63 | ./random-validation-examples.py 64 | 65 | ============= 66 | -------------------------------------------------------------------------------- /data/allwords.gz: -------------------------------------------------------------------------------- 1 | /u/turian/data/childes/childes-original/Eng-USA/allwords.gz -------------------------------------------------------------------------------- /data/allwords.vocabulary-200.txt: -------------------------------------------------------------------------------- 1 | 1554809 . 2 | 514944 ? 3 | 321320 you 4 | 265588 the 5 | 237190 # 6 | 204922 I 7 | 196906 a 8 | 174804 it 9 | 142331 to 10 | 128046 and 11 | 128028 that 12 | 122771 what 13 | 121857 [: 14 | 107318 ! 15 | 97090 is 16 | 92242 this 17 | 89117 in 18 | 85166 xxx 19 | 84145 yeah 20 | 81372 xx 21 | 80737 do 22 | 78573 no 23 | 78149 on 24 | 74463 yy 25 | 65570 oh 26 | 61896 one 27 | 60334 okay 28 | 60044 your 29 | 59387 have 30 | 59367 that's 31 | 58699 don't 32 | 58381 want 33 | 56844 go 34 | 56517 he 35 | 53963 there 36 | 52772 here 37 | 51600 we 38 | 51501 +... 39 | 50561 can 40 | 49654 like 41 | 49376 see 42 | 49317 of 43 | 48794 me 44 | 47579 are 45 | 46246 it's 46 | 46012 right 47 | 43048 my 48 | 42616 put 49 | 42462 0 50 | 42362 get 51 | 40781 know 52 | 40443 up 53 | 39846 [>] 54 | 39621 [<] 55 | 37921 for 56 | 36659 going 57 | 35350 with 58 | 35312 all 59 | 35148 not 60 | 34426 I'm 61 | 34344 did 62 | 33208 what's 63 | 32993 gonna 64 | 32619 to] 65 | 32560 look 66 | 32547 was 67 | 32520 +" 68 | 31515 now 69 | 30824 they 70 | 30376 [?] 71 | 29451 [= 72 | 29099 some 73 | 28792 [//] 74 | 28388 at 75 | 27291 out 76 | 27082 how 77 | 27001 little 78 | 26538 come 79 | 25990 good 80 | 25596 where 81 | 25374 so 82 | 25270 she 83 | 25181 got 84 | 25028 you're 85 | 24777 be 86 | 24557 think 87 | 24014 just 88 | 23786 [=! 89 | 23468 [/] 90 | 23364 down 91 | 22599 but 92 | 21649 why 93 | 21044 wanna 94 | 20782 yes] 95 | 20669 huh 96 | 20167 [!] 97 | 20056 let's 98 | 19925 Mommy 99 | 19648 yes 100 | 19283 too 101 | 19173 well 102 | 18847 more 103 | 18583 then 104 | 18577 say 105 | 17953 make 106 | 17737 if 107 | 17710 when 108 | 17546 uh 109 | 17253 back 110 | 17113 two 111 | 17086 take 112 | 16944 over 113 | 16849 her 114 | 16774 big 115 | 16518 his 116 | 15957 eat 117 | 15849 um 118 | 15844 play 119 | 15799 them 120 | 15759 he's 121 | 15426 I'll 122 | 15289 there's 123 | 15185 him 124 | 14829 off 125 | 14396 does 126 | 14141 who 127 | 13941 baby 128 | 13715 about 129 | 13600 where's 130 | 13441 can't 131 | 12845 these 132 | 12638 let 133 | 12532 said 134 | 12431 ["] 135 | 12321 would 136 | 12162 hmm 137 | 11747 or 138 | 11726 didn't 139 | 11574 +/. 
140 | 11524 very 141 | 11497 book 142 | 11291 those 143 | 11167 doing 144 | 10967 other 145 | 10553 need 146 | 10435 could 147 | 10360 will 148 | 10295 tell 149 | 10093 way 150 | 9974 has 151 | 9932 mmhm 152 | 9844 hey 153 | 9792 because 154 | 9630 something 155 | 9623 time 156 | 9620 read 157 | 9614 give 158 | 9606 had 159 | 9550 &=laughs 160 | 9549 another 161 | 9474 uhhuh 162 | 9464 sit 163 | 8999 car 164 | 8923 they're 165 | 8852 here's 166 | 8845 were 167 | 8809 turn 168 | 8733 house 169 | 8727 an 170 | 8723 mommy 171 | 8570 , 172 | 8488 oh, 173 | 8485 ## 174 | 8382 goes 175 | 8368 www 176 | 8331 an(d) 177 | 8124 three 178 | 8124 really 179 | 8118 Daddy 180 | 8062 we're 181 | 8044 again 182 | 8013 nice 183 | 7985 boy 184 | 7789 doesn't 185 | 7730 else 186 | 7692 (be)cause 187 | 7690 ball 188 | 7586 please 189 | 7572 +//. 190 | 7536 &=noise 191 | 7530 hi 192 | 7466 this] 193 | 7421 yyy 194 | 7357 water 195 | 7305 from 196 | 7300 dis 197 | 7264 wait 198 | 7224 went 199 | 7152 [+ 200 | 7128 ya 201 | -------------------------------------------------------------------------------- /data/allwords.vocabulary.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/data/allwords.vocabulary.txt.gz -------------------------------------------------------------------------------- /data/batch: -------------------------------------------------------------------------------- 1 | # Generate data from the English wikipedia, case information intact 2 | 3 | find wikitext/ | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | grep . | ~/common/scripts/shuffle.sh | gzip -c > english-wikitext.case-intact.txt.gz 4 | 5 | zcat english-wikitext.case-intact.txt.gz | head -10000 | gzip -c > english-wikitext.case-intact.test.txt.gz 6 | zcat english-wikitext.case-intact.txt.gz | head -20000 | tail -10000 | gzip -c > english-wikitext.case-intact.validation.txt.gz 7 | zcat english-wikitext.case-intact.txt.gz | tail -66151742 | gzip -c > english-wikitext.case-intact.train.txt.gz 8 | 9 | # Sanity check 10 | zcat english-wikitext.case-intact.test.txt.gz english-wikitext.case-intact.validation.txt.gz english-wikitext.case-intact.train.txt.gz | md5sum 11 | zcat english-wikitext.case-intact.txt.gz | md5sum 12 | 13 | # Vocabulary 14 | zcat english-wikitext.case-intact.train.txt.gz | perl -ne 's/ /\n/g; print' | grep . 
| sort | uniq -c | sort -rn | gzip -c > vocabulary-english-wikitext.case-intact.txt.gz 15 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -20000 | gzip -c > vocabulary-english-wikitext.case-intact-20000.txt.gz 16 | -------------------------------------------------------------------------------- /scripts/LOGS: -------------------------------------------------------------------------------- 1 | LOGS.NOBACKUP/ -------------------------------------------------------------------------------- /scripts/LOGS.NOBACKUP/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/LOGS.NOBACKUP/.keep -------------------------------------------------------------------------------- /scripts/batch: -------------------------------------------------------------------------------- 1 | #./train.py 2>&1 | tee train-0.1.out 2 | #./train.py 2>&1 | tee train-0.01.out 3 | #./train.py 2>&1 | tee train-0.001.out 4 | 5 | # From 20K italian and 20K english, we want the following hyperparams 6 | # 7 | # 2 EMBEDDING_LEARNING_RATE=0_000001.dat 8 | # 9 EMBEDDING_LEARNING_RATE=0_0000032.dat 9 | # 6 EMBEDDING_LEARNING_RATE=0_00001.dat 10 | # 6 EMBEDDING_LEARNING_RATE=0_000032.dat 11 | # 3 EMBEDDING_LEARNING_RATE=0_0001.dat 12 | # 13 | # 10 LEARNING_RATE=0_00000001 14 | # 8 LEARNING_RATE=0_000000032 15 | # 7 LEARNING_RATE=0_0000001 16 | # 1 LEARNING_RATE=0_00000032 17 | # 18 | # From 100K english lowercase, we want the following hyperparams 19 | # 20 | # 3 EMBEDDING_LEARNING_RATE=0_000001.dat 21 | # 3 EMBEDDING_LEARNING_RATE=0_0000032.dat 22 | # 23 | # 3 LEARNING_RATE=0_0000000032 24 | # 2 LEARNING_RATE=0_000000001 25 | # 2 LEARNING_RATE=0_000000032 26 | # 27 | # From 280K RCV1 english with case, hyperparams: 28 | # EMBEDDING_LEARNING_RATE=0_00001.dat and LEARNING_RATE=0_000000001 29 | # seemed to be the only version that worked well, when experimenting with values x10 and /10. 
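#
# (How to read the notes in this file: counts like "3 EMBEDDING_LEARNING_RATE=..."
# appear to tally how many of the best runs on that corpus used each value, and
# the {{a,b,...}} groups in the dbidispatch call at the bottom of this file are
# candidate values that get expanded into one job per combination.)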
30 | # 31 | # 32 | # From 20090916-rcv1.case-intact, the best hyperparams are: 33 | # LEARNING_RATE\=0_000000001 34 | # EMBEDDING_LEARNING_RATE\=0_0000032 35 | # 36 | # From 20090923-rcv1.unclean.CoNLL03-tokenize.case-intact, the best hyperparams are: 37 | # 3 EMBEDDING_LEARNING_RATE\=0_0000032 38 | # 39 | # 1 LEARNING_RATE\=0_0000000001 40 | # 1 LEARNING_RATE\=0_00000000032 41 | # 1 LEARNING_RATE\=0_000000001 42 | # 43 | # From 20091011-corpus-conll2003-ner and 20091011-corpus-ptb2-entirewsj, the best hyperparams are: 44 | # 2 EMBEDDING_LEARNING_RATE=0_00001 45 | # 3 EMBEDDING_LEARNING_RATE=0_000032 46 | # 47 | # 1 LEARNING_RATE=0_00000000032 48 | # 2 LEARNING_RATE=0_000000001 49 | # 2 LEARNING_RATE=0_0000000032 50 | # 51 | 52 | 53 | dbidispatch --exp_dir="T" ./train.py \ 54 | '--no_LOG_BILINEAR_MODEL' \ 55 | '--no_NORMALIZE_EMBEDDINGS' \ 56 | '--EMBEDDING_SIZE=500' \ 57 | '--HIDDEN_SIZE=100' \ 58 | '--CW_EMBEDDING_L1_PENALTY={{0,0.001,0.0001,0.00001}}' \ 59 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 60 | '--LEARNING_RATE={{0.0000001,0.00000001,0.000000001,0.0000000001,0.00000000001,0.000000000001}}' \ 61 | '--EMBEDDING_LEARNING_RATE={{0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001}}' 62 | 63 | #'--LEARNING_RATE={{0.0000001,0.00000001,0.0000000032,0.000000001,0.00000000032,0.0000000001,0.000000000032,0.00000000001,0.0000000000032,0.000000000001}}' \ 64 | #'--EMBEDDING_LEARNING_RATE={{0.0001,0.000032,0.00001,0.0000032,0.000001,0.0000001,0.00000001,0.000000001}}' 65 | 66 | #dbidispatch --exp_dir="T" ./train.py \ 67 | # '--no_NORMALIZE_EMBEDDINGS' \ 68 | # '--NGRAM_FOR_TRAINING_NOISE=0' \ 69 | # '--MONOLINGUAL_VOCABULARY_SIZE=50000' \ 70 | # '--LEARNING_RATE={{0.00000032,0.0000001,0.000000032,0.00000001,0.0000000032,0.000000001}}' \ 71 | # '--EMBEDDING_LEARNING_RATE={{0.00032,0.0001,0.000032,0.00001,0.0000032,0.000001}}' 72 | -------------------------------------------------------------------------------- /scripts/batch-build-examples: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | dbidispatch --no_machine=maggie42.iro.umontreal.ca --mem=1900 --exp_dir="T" ./w2w/build-example-cache.py \ 3 | '--no_console' \ 4 | '--no_LOG_BILINEAR_MODEL' \ 5 | '--no_NORMALIZE_EMBEDDINGS' \ 6 | '--EMBEDDING_SIZE=50' \ 7 | '--HIDDEN_SIZE=100' \ 8 | '--WINDOW_SIZE={{11,9,7,5}}' \ 9 | '--CW_EMBEDDING_L1_PENALTY=0' \ 10 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 11 | '--EMBEDDING_LEARNING_RATE=0' 12 | # '--LEARNING_RATE={{0.001,0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001}}' \ 13 | # '--EMBEDDING_LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001}}' 14 | -------------------------------------------------------------------------------- /scripts/batch-short: -------------------------------------------------------------------------------- 1 | dbidispatch --exp_dir="T-short" ./train.py \ 2 | '--no_NORMALIZE_EMBEDDINGS' \ 3 | '--NGRAM_FOR_TRAINING_NOISE=1' \ 4 | '--VALIDATE_EVERY=2500000' \ 5 | '--MONOLINGUAL_VOCABULARY_SIZE={{5000,20000}}' \ 6 | '--TRAINING_NOISE_SMOOTHING_ADDITION={{0,10000,10000000}}' \ 7 | '--LEARNING_RATE={{0.0001,0.00001,0.000001,0.0000001}}' \ 8 | '--EMBEDDING_LEARNING_RATE={{0.01,0.001,0.0001,0.00001}}' 9 | -------------------------------------------------------------------------------- /scripts/batch-w2w: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | #dbidispatch --no_machine=maggie42.iro.umontreal.ca --mem=1900 
--exp_dir="T" ./w2w/train.py \ 3 | dbidispatch --no_machine=brams01.iro.umontreal.ca --mem=1900 --exp_dir="T" ./w2w/train.py \ 4 | '--no_console' \ 5 | '--no_LOG_BILINEAR_MODEL' \ 6 | '--no_NORMALIZE_EMBEDDINGS' \ 7 | '{{--TWO_HIDDEN_LAYERS,--no_TWO_HIDDEN_LAYERS}}' \ 8 | '--EMBEDDING_SIZE=50' \ 9 | '--HIDDEN_SIZE={{50,100,200}}' \ 10 | '--WINDOW_SIZE={{9,7,5}}' \ 11 | '--CW_EMBEDDING_L1_PENALTY=0' \ 12 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 13 | '--LEARNING_RATE={{0.01,0.001,0.0001}}' \ 14 | '--EMBEDDING_LEARNING_RATE=0' 15 | -------------------------------------------------------------------------------- /scripts/batch-w2w2: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | dbidispatch --no_machine=maggie42.iro.umontreal.ca --exp_dir="T" ./w2w/train.py \ 3 | '--no_console' \ 4 | '--no_LOG_BILINEAR_MODEL' \ 5 | '--no_NORMALIZE_EMBEDDINGS' \ 6 | '--TWO_HIDDEN_LAYERS' \ 7 | '--EMBEDDING_SIZE=50' \ 8 | '--HIDDEN_SIZE={{100,200}}' \ 9 | '--WINDOW_SIZE={{7,5}}' \ 10 | '--CW_EMBEDDING_L1_PENALTY=0' \ 11 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 12 | '--LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001,0.0000001}}' \ 13 | '--EMBEDDING_LEARNING_RATE=0' 14 | # '--EMBEDDING_LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001}}' 15 | -------------------------------------------------------------------------------- /scripts/batch_ngrams: -------------------------------------------------------------------------------- 1 | dbidispatch ./ngrams.py \ 2 | '--MONOLINGUAL_VOCABULARY_SIZE={{5000,10000,20000}}' \ 3 | '--WINDOW_SIZE={{1,2,3}}' 4 | -------------------------------------------------------------------------------- /scripts/diagnostics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verbose debug output for the model. 
3 | """ 4 | 5 | import logging 6 | from common.stats import stats 7 | from common.str import percent 8 | 9 | import examples 10 | 11 | import numpy 12 | import random 13 | 14 | def diagnostics(cnt, model): 15 | logging.info(stats()) 16 | idxs = range(model.parameters.vocab_size) 17 | random.shuffle(idxs) 18 | idxs = idxs[:100] 19 | 20 | embeddings_debug(model.parameters.embeddings[idxs], cnt, "rand 100 words, model %s" % model.modelname) 21 | embeddings_debug(model.parameters.embeddings[:100], cnt, "top 100 words, model %s" % model.modelname) 22 | embeddings_debug(model.parameters.embeddings[model.parameters.vocab_size/2-50:model.parameters.vocab_size/2+50], cnt, "mid 100 words, model %s" % model.modelname) 23 | embeddings_debug(model.parameters.embeddings[-100:], cnt, "last 100 words, model %s" % model.modelname) 24 | weights_debug(model.parameters.hidden_weights.value, cnt, "hidden weights, model %s" % model.modelname) 25 | weights_debug(model.parameters.output_weights.value, cnt, "output weights, model %s" % model.modelname) 26 | logging.info(stats()) 27 | 28 | def visualizedebug(cnt, model, rundir, newkeystr, WORDCNT=500): 29 | idxs = range(model.parameters.vocab_size) 30 | random.shuffle(idxs) 31 | idxs = idxs[:WORDCNT] 32 | 33 | visualize(cnt, model, rundir, idxs, "randomized%s" % newkeystr) 34 | visualize(cnt, model, rundir, range(WORDCNT), "mostcommon%s" % newkeystr) 35 | visualize(cnt, model, rundir, range(-1, -WORDCNT*50, -1*50), "leastcommon%s" % newkeystr) 36 | visualize(cnt, model, rundir, range(model.parameters.vocab_size/2-WORDCNT*20/2,model.parameters.vocab_size/2+WORDCNT*20/2, 20), "midcommon%s" % newkeystr) 37 | 38 | def visualize(cnt, model, rundir, idxs, str): 39 | """ 40 | Visualize a set of examples using t-SNE. 41 | """ 42 | from vocabulary import wordmap, wordform 43 | PERPLEXITY=30 44 | 45 | idxs = [id % model.parameters.embeddings.shape[0] for id in idxs] 46 | x = model.parameters.embeddings[idxs] 47 | print x.shape 48 | #titles = [`wordmap().str(id)` for id in idxs] 49 | titles = [wordform(id) for id in idxs] 50 | 51 | import os.path 52 | filename = os.path.join(rundir, "embeddings.model-%s.-%s-%d.png" % (model.modelname, str, cnt)) 53 | try: 54 | from textSNE.calc_tsne import tsne 55 | # from textSNE.tsne import tsne 56 | out = tsne(x, perplexity=PERPLEXITY) 57 | from textSNE.render import render 58 | render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename) 59 | except IOError: 60 | logging.info("ERROR visualizing", filename, ". 
Continuing...") 61 | 62 | def embeddings_debug(w, cnt, str): 63 | """ 64 | Output the l2norm mean and max of the embeddings, including in debug out the str and training cnt 65 | """ 66 | totalcnt = numpy.sum(numpy.abs(w) >= 0) 67 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1) 68 | logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str)) 69 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01) 70 | logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str)) 71 | 72 | l2norm = numpy.sqrt(numpy.square(w).sum(axis=1)) 73 | median = numpy.median(l2norm) 74 | mean = numpy.mean(l2norm) 75 | std = numpy.std(l2norm) 76 | # print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),)) 77 | l2norm = l2norm.tolist() 78 | l2norm.sort() 79 | l2norm.reverse() 80 | logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`)) 81 | # print("top 5 = %s" % `l2norm[:5]`) 82 | 83 | def weights_debug(w, cnt, str): 84 | """ 85 | Output the abs median, mean, and max of the weights w, including in debug out the str and training cnt 86 | """ 87 | w = numpy.abs(w) 88 | logging.info("%d abs of %s: median=%f mean=%f stddev=%f" % (cnt, str, numpy.median(w), numpy.mean(w), numpy.std(w),)) 89 | # print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),)) 90 | # w = w.tolist() 91 | # w.sort() 92 | # w.reverse() 93 | # logging.info("\ttop 5 = %s" % `w[:5]`) 94 | # print("top 5 = %s" % `l2norm[:5]`) 95 | -------------------------------------------------------------------------------- /scripts/dump-embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from optparse import OptionParser 4 | parser = OptionParser() 5 | parser.add_option("-m", "--modelfile", dest="modelfile") 6 | (options, args) = parser.parse_args() 7 | assert options.modelfile is not None 8 | 9 | import cPickle 10 | m = cPickle.load(open(options.modelfile)) 11 | #print m.parameters.embeddings.shape 12 | 13 | from vocabulary import wordmap 14 | for i in range(m.parameters.vocab_size): 15 | print wordmap.str(i), 16 | for v in m.parameters.embeddings[i]: 17 | print v, 18 | print 19 | -------------------------------------------------------------------------------- /scripts/eda/badrun.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # For every filename in sys.stdin, add a file BAD to that run directory. 4 | # Read stdin until there is a blank line. 5 | # 6 | # BUG: If the filename has a space in it, sorry you're out of luck. 7 | # BUG: We don't unescape quotes, we just strip them. 
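A quoting-safe variant of the same idea (an illustration, not a drop-in replacement for this script): read one path per line and delete the run's *.dat files with glob/os.remove instead of building an rm command for the shell, so the two BUGs above go away.

    import glob, os, sys

    # Sketch: mark each run directory BAD and remove its *.dat files without
    # shelling out, so spaces or quotes in paths cannot break anything.
    for line in sys.stdin:
        f = line.strip().strip('"').strip("'")
        if not f:
            break                  # stop at the first blank line, as described above
        if not os.path.exists(f):
            continue
        d = os.path.dirname(os.path.realpath(f))
        open(os.path.join(d, "BAD"), "a").close()
        for dat in glob.glob(os.path.join(d, "*.dat")):
            os.remove(dat)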
8 | # 9 | 10 | import sys, os.path, string 11 | #assert len(sys.argv)>2 12 | 13 | while 1: 14 | # for l in sys.stdin: 15 | l = sys.stdin.readline() 16 | # for l in sys.stdin: 17 | if string.strip(l) == "": break 18 | for f in string.split(l): 19 | f = f.replace('\"','').replace("\'",'') 20 | if not os.path.exists(f): continue 21 | d = os.path.dirname(os.path.realpath(f)) 22 | newf = os.path.join(d, "BAD") 23 | print newf 24 | if os.path.exists(newf): continue 25 | cmd = "rm %s" % os.path.join(d, "*.dat") 26 | print >> sys.stderr, "Creating %s, %s" % (newf, cmd) 27 | open(newf, "wt").close() 28 | os.system("rm %s" % os.path.join(d, "*.dat")) 29 | -------------------------------------------------------------------------------- /scripts/eda/batch-make-curves.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm *trainerror.dat 4 | #rm ../run*/*trainerror.dat 5 | 6 | # Make all dat files 7 | ../../eda/make-graphs-trainerror.pl 8 | 9 | ln -s ../*/*trainerror.dat . 10 | 11 | # Sort all dat files 12 | # First perl recipe adds gnuplot codes 13 | # Second perl recipe strips final ', \' to prevent gnuplot error 14 | echo > graphs-trainerror.gp 15 | echo "set terminal postscript color 12" >> graphs-trainerror.gp 16 | echo "set output 'graphs-trainerror.ps'" >> graphs-trainerror.gp 17 | echo "set logscale y" >> graphs-trainerror.gp 18 | echo "plot [] [] \\" >> graphs-trainerror.gp 19 | ~/dev/common-scripts/sort-curves.py *trainerror.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainerror.gp 20 | 21 | gnuplot graphs-trainerror.gp 22 | ps2pdf graphs-trainerror.ps 23 | cp *pdf ~/public_html/priv ; chmod a+r ~/public_html/priv/*pdf 24 | #scp *pdf turian@joyeux.iro.umontreal.ca:public_html/priv/ 25 | -------------------------------------------------------------------------------- /scripts/eda/make-graphs-trainerror.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if not $f =~ m/f1426d05c578bfd029875b646b66195044/; 11 | next if $f =~ m/\.dat$/; 12 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 13 | next if -e $badf; 14 | ($fnew = $f) =~ s/$/-trainerror.dat/; 15 | die $! 
if $fnew eq $f; 16 | print STDERR "$f => $fnew\n"; 17 | # We can allow e to be grepped, because of numbers like 5e-8 18 | $cmd = "cat $f | grep --text 'pre-update train err' | perl -ne 's/=/ /g; print' | cut -d ' ' -f 2,10 | grep -v '[a-df-zA-DF-Z]' | grep '0000 ' > $fnew"; 19 | print STDERR "$cmd\n"; 20 | system($cmd); 21 | $gnuplot .= "," unless $first; 22 | $first = 0; 23 | $gnuplot .= " \\\n\t'$fnew' with lp" 24 | } 25 | print "$gnuplot\n"; 26 | -------------------------------------------------------------------------------- /scripts/eda/old/batch-make-curves.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm *trainerror.dat *trainloss.dat *validationlogrankloss.dat 4 | #rm ../run*/*trainerror.dat ../run*/*trainloss.dat ../run*/*validationlogrankloss.dat 5 | 6 | # Make all dat files 7 | ../../eda/make-graphs-trainerror.pl 8 | ../../eda/make-graphs-trainloss.pl 9 | ../../eda/make-graphs-validationlogrankloss.pl 10 | 11 | ln -s ../*/*trainerror.dat . 12 | ln -s ../*/*trainloss.dat . 13 | ln -s ../*/*validationlogrankloss.dat . 14 | 15 | # Sort all dat files 16 | # First perl recipe adds gnuplot codes 17 | # Second perl recipe strips final ', \' to prevent gnuplot error 18 | echo > graphs-trainerror.gp 19 | echo "set terminal postscript color 12" >> graphs-trainerror.gp 20 | echo "set output 'graphs-trainerror.ps'" >> graphs-trainerror.gp 21 | echo "set logscale y" >> graphs-trainerror.gp 22 | echo "plot [] [0.006:0.025] \\" >> graphs-trainerror.gp 23 | ~/dev/common-scripts/sort-curves.py *trainerror.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainerror.gp 24 | 25 | echo > graphs-trainloss.gp 26 | echo "set terminal postscript color 12" >> graphs-trainloss.gp 27 | echo "set output 'graphs-trainloss.ps'" >> graphs-trainloss.gp 28 | echo "set logscale y" >> graphs-trainloss.gp 29 | echo "plot [] [] \\" >> graphs-trainloss.gp 30 | ~/dev/common-scripts/sort-curves.py *trainloss.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainloss.gp 31 | 32 | echo > graphs-validationlogrankloss.gp 33 | echo "set terminal postscript color 12" >> graphs-validationlogrankloss.gp 34 | echo "set output 'graphs-validationlogrankloss.ps'" >> graphs-validationlogrankloss.gp 35 | #echo "set logscale y" >> graphs-validationlogrankloss.gp 36 | echo "plot [] [] \\" >> graphs-validationlogrankloss.gp 37 | ~/dev/common-scripts/sort-curves.py *validationlogrankloss.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-validationlogrankloss.gp 38 | 39 | gnuplot graphs-trainerror.gp 40 | gnuplot graphs-trainloss.gp 41 | gnuplot graphs-validationlogrankloss.gp 42 | ps2pdf graphs-trainerror.ps 43 | ps2pdf graphs-trainloss.ps 44 | ps2pdf graphs-validationlogrankloss.ps 45 | cp *pdf ~/public_html/priv ; chmod a+r ~/public_html/priv/*pdf 46 | #scp *pdf turian@joyeux.iro.umontreal.ca:public_html/priv/ 47 | -------------------------------------------------------------------------------- /scripts/eda/old/make-graphs-trainloss.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 
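These .pl scripts all follow the same pattern: grep one family of log lines out of ../run*/log.*, pull out the update count and the tracked value, and write a two-column .dat file for gnuplot. A hedged Python sketch of the same extraction for the train-error curves (the regular expression is approximate; the exact field layout depends on the logging prefix in each run's log file):

    import re, sys

    # Sketch: turn "After N updates, pre-update train error <value> ..." log
    # lines into "N value" rows suitable for gnuplot.
    pattern = re.compile(r"After (\d+) updates, pre-update train error"
                         r"\D*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
    for line in open(sys.argv[1]):
        m = pattern.search(line)
        if m:
            print("%s %s" % (m.group(1), m.group(2)))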
4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if $f =~ m/\.dat$/; 11 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 12 | next if -e $badf; 13 | ($fnew = $f) =~ s/$/-trainloss.dat/; 14 | die $! if $fnew eq $f; 15 | print STDERR "$f => $fnew\n"; 16 | $cmd = "cat $f | grep --text 'pre-update train unpenalized loss' | perl -ne 's/=/ /g; print' | cut -d ' ' -f 2,11 | grep -v '[a-zA-Z]' | grep '0000 ' > $fnew"; 17 | print STDERR "$cmd\n"; 18 | system($cmd); 19 | $gnuplot .= "," unless $first; 20 | $first = 0; 21 | $gnuplot .= " \\\n\t'$fnew' with lp" 22 | } 23 | print "$gnuplot\n"; 24 | -------------------------------------------------------------------------------- /scripts/eda/old/make-graphs-validationlogrankloss.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if $f =~ m/\.dat$/; 11 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 12 | next if -e $badf; 13 | ($fnew = $f) =~ s/$/-validationlogrankloss.dat/; 14 | die $! if $fnew eq $f; 15 | print STDERR "$f => $fnew\n"; 16 | $cmd = "cat $f | grep --text FINAL | cut -d ' ' -f 6,9 | perl -ne 's/[:,]//g; print' > $fnew"; 17 | print STDERR "$cmd\n"; 18 | system($cmd); 19 | $gnuplot .= "," unless $first; 20 | $first = 0; 21 | $gnuplot .= " \\\n\t'$fnew' with lp" 22 | } 23 | print "$gnuplot\n"; 24 | -------------------------------------------------------------------------------- /scripts/eda/remove-nonfinal-models.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # For each directory in @ARGV, go in that directory and remove every 4 | # model file except for the last one. 5 | # 6 | 7 | @torm = (); 8 | foreach $d (@ARGV) { 9 | $last = -1; 10 | # Find the last model 11 | foreach $f (split(/[\r\n]+/, `ls $d`)) { 12 | if ($f =~ m/model-(\d+).pkl/) { 13 | $last = $1 if $1 > $last; 14 | } 15 | } 16 | # All non-last models are added to torm 17 | foreach $f (split(/[\r\n]+/, `ls $d`)) { 18 | if ($f =~ m/model-(\d+).pkl/) { 19 | if ($1 < $last) { 20 | $torm[++$#torm] = "$d/$f"; 21 | } else { 22 | print "KEEPING $d/$f\n"; 23 | } 24 | } 25 | } 26 | } 27 | foreach $f (@torm) { 28 | $cmd = "rm $f"; 29 | print "$cmd\n"; 30 | system("$cmd"); 31 | } 32 | -------------------------------------------------------------------------------- /scripts/hyperparameters.language-model.full.yaml: -------------------------------------------------------------------------------- 1 | #: Not actually used directly, just for convenience 2 | # 3 | #locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"} 4 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"} 5 | #locations: {"DATA_DIR": "../data-sample-bilingual/"} 6 | #locations: {"DATA_DIR": "../data/full-bilingual/"} 7 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas/"} 8 | locations: {"DATA_DIR": "../data/maxlength40-lemmas-filtered/"} 9 | #locations: {"DATA_DIR": "../data/small-sample.maxlength40-lemmas-filtered/"} 10 | 11 | # Are we running this automatically from a console, or is this job part of some larger batch? 12 | # If True, we log output to stdout, not to a log file on disk. 
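A minimal sketch of what that flag controls, assuming a run directory chosen elsewhere (e.g. by rundir.py); the log file name below is made up:

    import logging, os, sys

    # Sketch only: send log records to stdout when console is True, otherwise
    # to a (hypothetical) log file inside the run directory.
    def setup_logging(console, rundir):
        if console:
            logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        else:
            logging.basicConfig(filename=os.path.join(rundir, "log"),
                                level=logging.INFO)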
13 | #console: False 14 | console: True 15 | 16 | # A list of the validation examples 17 | # Currently unused 18 | VALIDATION_INPUT: /u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation/validation.txt 19 | 20 | PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION: 0.01 21 | 22 | ## 32-bit for the GPU 23 | ##import theano.config as config 24 | ##floatX: config.floatX 25 | #floatX: 'float32' 26 | 27 | # Should we induce an embedding for OOV words? 28 | INCLUDE_UNKNOWN_WORD: True 29 | 30 | RUN_NAME: "rcv1.case-intact" 31 | MONOLINGUAL_VOCABULARY_SIZE: 268810 32 | 33 | # Make all example weights are uniform 34 | # Note: When a word has more target words, we might want to give it higher weight. (Or, when we use ngram noise.) 35 | UNIFORM EXAMPLE WEIGHTS: True 36 | 37 | # Bilingual corpora language pairs 38 | #W2W BICORPORA: [["en", "fr"], ["en", "nl"]] 39 | #W2W BICORPORA: [["en", "fr"]] 40 | W2W BICORPORA: [["en", "nl"], ["en", "de"], ["en", "it"], ["en", "fr"], ["en", "es"]] 41 | # Monolingual corpora language singletons 42 | W2W MONOCORPORA: [] 43 | #W2W MONOCORPORA: ["en"] 44 | 45 | # Only train on examples in which the focus word lemmatizes to one of these words. 46 | # If an empty list, we use ALL examples, and don't do any filtering. 47 | W2W FOCUS LEMMAS: [ 48 | "bank", "movement", "occupation", "passage", "plant", 49 | "coach", "education", "execution", "figure", "job", "post", "pot", "range", "rest", "ring", "mood", "soil", "strain", "match", "scene", "test", "mission", "letter", "paper", "side" 50 | ] 51 | 52 | # Delexicalize all words that occur fewer than this number of times. 53 | W2W MINIMUM WORD FREQUENCY: 3 54 | #W2W MINIMUM WORD FREQUENCY: 10 55 | 56 | # Skip translations to unknown word. 57 | # This makes coding easy, because we treat the Unknown word as having its own language. 58 | # However, this means that we always will try to translate words to 59 | # something in the target vocab, whereas in practice we might want to 60 | # translate to *UNKNOWN* and in the target language just keep the word 61 | # form as is. 62 | W2W SKIP TRANSLATIONS TO UNKNOWN WORD: True 63 | 64 | # Use these embeddings to initialize the model 65 | W2W INITIAL EMBEDDINGS: /u/turian/data/share/embeddings/model-2520000000.LEARNING_RATE=1e-09.EMBEDDING_LEARNING_RATE=1e-06.HIDDEN_SIZE=800.txt.gz 66 | # Language of the initial embeddings 67 | W2W INITIAL EMBEDDINGS LANGUAGE: en 68 | # Were the initial embeddings induced case-sensitive, but now we want to lowercase them? 69 | W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION: True 70 | 71 | # Use the log-bilinear model or not? 72 | # If True, we predict the Mnih log-bilinear model 73 | # If False, we predict the C&W language model. 74 | #LOG BILINEAR MODEL: True 75 | LOG BILINEAR MODEL: False 76 | 77 | # Number of examples per minibach 78 | MINIBATCH SIZE: 100 79 | 80 | # Randomly initialize embeddings uniformly in the range [-this value, +this value] 81 | INITIAL_EMBEDDING_RANGE: 0.01 82 | 83 | # l1 penalty appliedto C&W embeddings 84 | CW_EMBEDDING_L1_PENALTY: 0. 85 | 86 | NORMALIZE_EMBEDDINGS: False 87 | #NORMALIZE_EMBEDDINGS: True 88 | #UPDATES_PER_NORMALIZE_EMBEDDINGS: 1000 89 | 90 | # Number of validation examples 91 | #VALIDATION EXAMPLES: 10000 92 | #VALIDATION EXAMPLES: 2500 93 | VALIDATION EXAMPLES: 1000 94 | 95 | # What percent of noise examples should we use for computing the logrank 96 | # during validation? 97 | # This is a speed optimization. 
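Concretely, Model.validate() further down keeps each corrupt vocabulary candidate with this probability and counts how many of the kept candidates score at least as well as the true completion. A toy numpy sketch of that subsampled rank (all scores here are made up):

    import numpy

    # Toy sketch of the subsampled validation rank: keep ~1% of the corrupt
    # candidates and count how many outscore the correct n-gram.
    rng = numpy.random.RandomState(0)
    correct_score = 1.7                                  # made-up model score
    corrupt_scores = rng.normal(0.0, 1.0, size=268810)   # one per vocab word
    keep = rng.uniform(size=corrupt_scores.size) <= 0.01
    rank = 1 + int(numpy.sum(corrupt_scores[keep] >= correct_score))
    print("estimated (subsampled) rank: %d" % rank)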
98 | PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK: 0.01 99 | 100 | NGRAM_FOR_TRAINING_NOISE: 0 101 | 102 | #NGRAMS: {(1, 5000): join(DATA_DIR, "1grams-wikitext-5000.json.gz"), 103 | #(1, 10000): join(DATA_DIR, "1grams-wikitext-10000.json.gz"), 104 | #(1, 20000): join(DATA_DIR, "1grams-wikitext-20000.json.gz")} 105 | 106 | # Number of instances of each ngram to add, for smoothing. 107 | TRAINING_NOISE_SMOOTHING_ADDITION: 0 108 | 109 | # Each embedded word representation has this width 110 | EMBEDDING_SIZE: 50 111 | #EMBEDDING_SIZE: 20 112 | #EMBEDDING_SIZE: 5 113 | 114 | # Predict with a window of five words at a time 115 | WINDOW_SIZE: 11 116 | 117 | HIDDEN_SIZE: 100 118 | #HIDDEN_SIZE: 40 119 | #HIDDEN_SIZE: 10 120 | 121 | # Two hidden layers, or only one? 122 | TWO_HIDDEN_LAYERS: False 123 | 124 | #: Scaling value to control range for weight initialization 125 | #SCALE_INITIAL_WEIGHTS_BY: math.sqrt(3) 126 | SCALE_INITIAL_WEIGHTS_BY: 1 127 | 128 | # Which activation function to use? 129 | #ACTIVATION_FUNCTION="sigmoid" 130 | #ACTIVATION_FUNCTION="tanh" 131 | ACTIVATION_FUNCTION: "softsign" 132 | 133 | LEARNING_RATE: 0.000000011 134 | #LEARNING_RATE: 0.000000000001 135 | 136 | # The learning rate for the embeddings 137 | #EMBEDDING_LEARNING_RATE: 0.00000000034 138 | EMBEDDING_LEARNING_RATE: 0 139 | 140 | ## number of (higher-order) quadratic filters for James's neuron 141 | #NUMBER_OF_QUADRATIC_FILTERS=0 142 | ## We use this scaling factor for initial weights of quadratic filters, 143 | ## instead of SCALE_INITIAL_WEIGHTS_BY 144 | ## @note: Try between 10 and 0.01 145 | #SCALE_QUADRATIC_INITIAL_WEIGHTS_BY: 1 146 | 147 | # Validate after this many examples 148 | #VALIDATE_EVERY: 10000000 149 | VALIDATE_EVERY: 100000 150 | #VALIDATE_EVERY: 10000 151 | -------------------------------------------------------------------------------- /scripts/hyperparameters.language-model.sample.yaml: -------------------------------------------------------------------------------- 1 | #: Not actually used directly, just for convenience 2 | # 3 | #locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"} 4 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"} 5 | #locations: {"DATA_DIR": "../data-sample-bilingual/"} 6 | #locations: {"DATA_DIR": "../data/full-bilingual/"} 7 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas/"} 8 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas-filtered/"} 9 | locations: {"DATA_DIR": "../data/small-sample.maxlength40-lemmas-filtered/"} 10 | 11 | # Are we running this automatically from a console, or is this job part of some larger batch? 12 | # If True, we log output to stdout, not to a log file on disk. 13 | console: False 14 | #console: True 15 | 16 | # A list of the validation examples 17 | # Currently unused 18 | VALIDATION_INPUT: /u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation/validation.txt 19 | 20 | PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION: 0.01 21 | 22 | ## 32-bit for the GPU 23 | ##import theano.config as config 24 | ##floatX: config.floatX 25 | #floatX: 'float32' 26 | 27 | # Should we induce an embedding for OOV words? 28 | INCLUDE_UNKNOWN_WORD: True 29 | 30 | RUN_NAME: "rcv1.case-intact" 31 | MONOLINGUAL_VOCABULARY_SIZE: 268810 32 | 33 | # Make all example weights are uniform 34 | # Note: When a word has more target words, we might want to give it higher weight. (Or, when we use ngram noise.) 
35 | UNIFORM EXAMPLE WEIGHTS: True 36 | 37 | # Bilingual corpora language pairs 38 | #W2W BICORPORA: [["en", "fr"], ["en", "nl"]] 39 | #W2W BICORPORA: [["en", "fr"]] 40 | W2W BICORPORA: [["en", "nl"], ["en", "de"], ["en", "it"], ["en", "fr"], ["en", "es"]] 41 | # Monolingual corpora language singletons 42 | W2W MONOCORPORA: [] 43 | #W2W MONOCORPORA: ["en"] 44 | 45 | # Only train on examples in which the focus word lemmatizes to one of these words. 46 | # If an empty list, we use ALL examples, and don't do any filtering. 47 | W2W FOCUS LEMMAS: [ 48 | "bank", "movement", "occupation", "passage", "plant", 49 | "coach", "education", "execution", "figure", "job", "post", "pot", "range", "rest", "ring", "mood", "soil", "strain", "match", "scene", "test", "mission", "letter", "paper", "side" 50 | ] 51 | 52 | # Delexicalize all words that occur fewer than this number of times. 53 | W2W MINIMUM WORD FREQUENCY: 3 54 | #W2W MINIMUM WORD FREQUENCY: 10 55 | 56 | # Skip translations to unknown word. 57 | # This makes coding easy, because we treat the Unknown word as having its own language. 58 | # However, this means that we always will try to translate words to 59 | # something in the target vocab, whereas in practice we might want to 60 | # translate to *UNKNOWN* and in the target language just keep the word 61 | # form as is. 62 | W2W SKIP TRANSLATIONS TO UNKNOWN WORD: True 63 | 64 | # Use these embeddings to initialize the model 65 | W2W INITIAL EMBEDDINGS: /u/turian/data/share/embeddings/model-2520000000.LEARNING_RATE=1e-09.EMBEDDING_LEARNING_RATE=1e-06.HIDDEN_SIZE=800.txt.gz 66 | # Language of the initial embeddings 67 | W2W INITIAL EMBEDDINGS LANGUAGE: en 68 | # Were the initial embeddings induced case-sensitive, but now we want to lowercase them? 69 | W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION: True 70 | 71 | # Use the log-bilinear model or not? 72 | # If True, we predict the Mnih log-bilinear model 73 | # If False, we predict the C&W language model. 74 | #LOG BILINEAR MODEL: True 75 | LOG BILINEAR MODEL: False 76 | 77 | # Number of examples per minibach 78 | MINIBATCH SIZE: 100 79 | 80 | # Randomly initialize embeddings uniformly in the range [-this value, +this value] 81 | INITIAL_EMBEDDING_RANGE: 0.01 82 | 83 | # l1 penalty appliedto C&W embeddings 84 | CW_EMBEDDING_L1_PENALTY: 0. 85 | 86 | NORMALIZE_EMBEDDINGS: False 87 | #NORMALIZE_EMBEDDINGS: True 88 | #UPDATES_PER_NORMALIZE_EMBEDDINGS: 1000 89 | 90 | # Number of validation examples 91 | #VALIDATION EXAMPLES: 10000 92 | #VALIDATION EXAMPLES: 2500 93 | VALIDATION EXAMPLES: 1000 94 | 95 | # What percent of noise examples should we use for computing the logrank 96 | # during validation? 97 | # This is a speed optimization. 98 | PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK: 0.01 99 | 100 | NGRAM_FOR_TRAINING_NOISE: 0 101 | 102 | #NGRAMS: {(1, 5000): join(DATA_DIR, "1grams-wikitext-5000.json.gz"), 103 | #(1, 10000): join(DATA_DIR, "1grams-wikitext-10000.json.gz"), 104 | #(1, 20000): join(DATA_DIR, "1grams-wikitext-20000.json.gz")} 105 | 106 | # Number of instances of each ngram to add, for smoothing. 107 | TRAINING_NOISE_SMOOTHING_ADDITION: 0 108 | 109 | # Each embedded word representation has this width 110 | EMBEDDING_SIZE: 50 111 | #EMBEDDING_SIZE: 20 112 | #EMBEDDING_SIZE: 5 113 | 114 | # Predict with a window of five words at a time 115 | WINDOW_SIZE: 5 116 | 117 | HIDDEN_SIZE: 100 118 | #HIDDEN_SIZE: 40 119 | #HIDDEN_SIZE: 10 120 | 121 | # Two hidden layers, or only one? 
122 | TWO_HIDDEN_LAYERS: False 123 | 124 | #: Scaling value to control range for weight initialization 125 | #SCALE_INITIAL_WEIGHTS_BY: math.sqrt(3) 126 | SCALE_INITIAL_WEIGHTS_BY: 1 127 | 128 | # Which activation function to use? 129 | #ACTIVATION_FUNCTION="sigmoid" 130 | #ACTIVATION_FUNCTION="tanh" 131 | ACTIVATION_FUNCTION: "softsign" 132 | 133 | LEARNING_RATE: 0.000000011 134 | #LEARNING_RATE: 0.000000000001 135 | 136 | # The learning rate for the embeddings 137 | #EMBEDDING_LEARNING_RATE: 0.00000000034 138 | EMBEDDING_LEARNING_RATE: 0 139 | 140 | ## number of (higher-order) quadratic filters for James's neuron 141 | #NUMBER_OF_QUADRATIC_FILTERS=0 142 | ## We use this scaling factor for initial weights of quadratic filters, 143 | ## instead of SCALE_INITIAL_WEIGHTS_BY 144 | ## @note: Try between 10 and 0.01 145 | #SCALE_QUADRATIC_INITIAL_WEIGHTS_BY: 1 146 | 147 | # Validate after this many examples 148 | #VALIDATE_EVERY: 10000000 149 | VALIDATE_EVERY: 10000 150 | -------------------------------------------------------------------------------- /scripts/hyperparameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to update hyperparameters automatically. 3 | """ 4 | 5 | from os.path import join 6 | import common.hyperparameters 7 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 8 | HYPERPARAMETERS["DATA_DIR"] = HYPERPARAMETERS["locations"]["DATA_DIR"] 9 | RUN_NAME = HYPERPARAMETERS["RUN_NAME"] 10 | MONOLINGUAL_VOCABULARY_SIZE = HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 11 | INCLUDE_UNKNOWN_WORD = HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] 12 | HYPERPARAMETERS["TRAIN_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.train.txt.gz" % RUN_NAME) 13 | HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.validation.txt.gz" % RUN_NAME) 14 | HYPERPARAMETERS["VALIDATION_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.validation-%d.txt.gz" % (RUN_NAME, HYPERPARAMETERS["VALIDATION EXAMPLES"])) 15 | HYPERPARAMETERS["MONOLINGUAL_VOCABULARY"] = join(HYPERPARAMETERS["DATA_DIR"], "vocabulary-%s-%d.txt.gz" % (RUN_NAME, MONOLINGUAL_VOCABULARY_SIZE)) 16 | HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] = join(HYPERPARAMETERS["DATA_DIR"], "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, MONOLINGUAL_VOCABULARY_SIZE, INCLUDE_UNKNOWN_WORD)) 17 | HYPERPARAMETERS["INITIAL_EMBEDDINGS"] = join(HYPERPARAMETERS["DATA_DIR"], "initial-embeddings.minfreq=%d.include_unknown=%s.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 18 | -------------------------------------------------------------------------------- /scripts/lemmatizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lemmatize English using the NLTK WordNetLemmatizer. 3 | """ 4 | 5 | from nltk.stem.wordnet import WordNetLemmatizer 6 | 7 | _lmtzr = None 8 | def lmtzr(): 9 | global _lmtzr 10 | if _lmtzr is None: _lmtzr = WordNetLemmatizer() 11 | return _lmtzr 12 | 13 | def lemmatize(language, wordform): 14 | assert language == "en" 15 | return lmtzr().lemmatize(wordform) 16 | -------------------------------------------------------------------------------- /scripts/miscglobals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous globals. 3 | 4 | @todo: Most of these should be moved somewhere more specific. 
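For concreteness, hyperparameters.py above just does string arithmetic on the YAML values; with the sample settings (DATA_DIR ../data/small-sample.maxlength40-lemmas-filtered/, RUN_NAME rcv1.case-intact, MONOLINGUAL_VOCABULARY_SIZE 268810, INCLUDE_UNKNOWN_WORD True), the derived file names come out as below. This is only an illustration; the real values depend on whatever YAML common.hyperparameters.read() loads.

    from os.path import join

    # Worked example of the path derivation in scripts/hyperparameters.py.
    DATA_DIR = "../data/small-sample.maxlength40-lemmas-filtered/"
    RUN_NAME = "rcv1.case-intact"
    VOCAB_SIZE = 268810

    print(join(DATA_DIR, "%s.train.txt.gz" % RUN_NAME))
    # -> .../rcv1.case-intact.train.txt.gz
    print(join(DATA_DIR, "vocabulary-%s-%d.txt.gz" % (RUN_NAME, VOCAB_SIZE)))
    # -> .../vocabulary-rcv1.case-intact-268810.txt.gz
    print(join(DATA_DIR, "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, VOCAB_SIZE, True)))
    # -> .../idmap.rcv1.case-intact-268810.include_unknown=True.pkl.gz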
5 | """ 6 | 7 | #: RNG seed 8 | RANDOMSEED = 0 9 | 10 | #LINKER = 'c|py' 11 | ##LINKER = 'py' 12 | #OPTIMIZER = 'merge' # 'math' optimizer is broken with 'c|py' linker 13 | -------------------------------------------------------------------------------- /scripts/model/__init__.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | -------------------------------------------------------------------------------- /scripts/model/graphcw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Theano graph of Collobert & Weston language model. 3 | """ 4 | 5 | import theano 6 | #import theano.sandbox.cuda 7 | #theano.sandbox.cuda.use() 8 | 9 | from theano.compile import pfunc, shared 10 | from theano import config 11 | floatX = config.floatX 12 | 13 | 14 | from theano import tensor as t 15 | from theano import scalar as s 16 | 17 | from theano.tensor.basic import horizontal_stack 18 | from theano.tensor import dot 19 | 20 | from theano import gradient 21 | 22 | import theano.compile 23 | #from miscglobals import LINKER, OPTIMIZER 24 | #mode = theano.compile.Mode(LINKER, OPTIMIZER) 25 | #import theano.compile.debugmode 26 | #COMPILE_MODE = theano.compile.debugmode.DebugMode(optimizer='fast_run', check_isfinite=False) 27 | #import theano.compile.profilemode 28 | #COMPILE_MODE = theano.compile.profilemode.ProfileMode() 29 | COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run') 30 | #COMPILE_MODE = theano.compile.Mode('py', 'fast_compile') 31 | 32 | import numpy 33 | 34 | #hidden_weights = t.matrix() 35 | #hidden_biases = t.matrix() 36 | 37 | #if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"] == True: 38 | # hidden2_weights = t.matrix() 39 | # hidden2_biases = t.matrix() 40 | 41 | #output_weights = t.matrix() 42 | #output_biases = t.matrix() 43 | 44 | # TODO: Include gradient steps in actual function, don't do them manually 45 | 46 | def activation_function(r): 47 | from hyperparameters import HYPERPARAMETERS 48 | if HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "sigmoid": 49 | return sigmoid(r) 50 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "tanh": 51 | return t.tanh(r) 52 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "softsign": 53 | from theano.sandbox.softsign import softsign 54 | return softsign(r) 55 | else: 56 | assert 0 57 | 58 | def stack(x): 59 | """ 60 | Horizontally stack a list of representations, and then compress them to 61 | one representation. 62 | """ 63 | assert len(x) >= 2 64 | return horizontal_stack(*x) 65 | 66 | def score(x): 67 | from hyperparameters import HYPERPARAMETERS 68 | prehidden = dot(x, hidden_weights) + hidden_biases 69 | hidden = activation_function(prehidden) 70 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 71 | prehidden2 = dot(hidden, hidden2_weights) + hidden2_biases 72 | hidden2 = activation_function(prehidden2) 73 | score = dot(hidden2, output_weights) + output_biases 74 | else: 75 | score = dot(hidden, output_weights) + output_biases 76 | return score, prehidden 77 | 78 | cached_functions = {} 79 | def functions(sequence_length): 80 | """ 81 | Return two functions 82 | * The first function does prediction. 83 | * The second function does learning. 84 | """ 85 | global cached_functions 86 | cachekey = (sequence_length) 87 | if len(cached_functions.keys()) > 1: 88 | # This is problematic because we use global variables for the model parameters. 89 | # Hence, we might be unsafe, if we are using the wrong model parameters globally. 
90 | assert 0 91 | if cachekey not in cached_functions: 92 | print "Need to construct graph for sequence_length=%d..." % (sequence_length) 93 | # Create the sequence_length inputs. 94 | # Each is a t.matrix(), initial word embeddings (provided by 95 | # Jason + Ronan) to be transformed into an initial representation. 96 | # We could use a vector, but instead we use a matrix with one row. 97 | correct_inputs = [t.matrix() for i in range(sequence_length)] 98 | noise_inputs = [t.matrix() for i in range(sequence_length)] 99 | learning_rate = t.scalar() 100 | 101 | stacked_correct_inputs = stack(correct_inputs) 102 | stacked_noise_inputs = stack(noise_inputs) 103 | 104 | correct_score, correct_prehidden = score(stacked_correct_inputs) 105 | noise_score, noise_prehidden = score(stacked_noise_inputs) 106 | unpenalized_loss = t.clip(1 - correct_score + noise_score, 0, 1e999) 107 | 108 | from hyperparameters import HYPERPARAMETERS 109 | if HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"] != 0: 110 | l1penalty = t.sum(t.abs_(stacked_correct_inputs) + t.abs_(stacked_noise_inputs), axis=1).T * HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"] 111 | else: 112 | l1penalty = t.as_tensor_variable(numpy.asarray(0, dtype=floatX)) 113 | # l1penalty = t.as_tensor_variable(numpy.asarray((0,), dtype=floatX)) 114 | loss = (unpenalized_loss.T + l1penalty).T 115 | 116 | # import sys 117 | # print >> sys.stderr, "FIXME: MODEL_LEARNING_RATE = fixed at 0.001" 118 | # MODEL_LEARNING_RATE = t.as_tensor_variable(numpy.asarray(0.001, dtype=floatX)) 119 | 120 | total_loss = t.sum(loss) 121 | 122 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 123 | (dhidden_weights, dhidden_biases, dhidden2_weights, dhidden2_biases, doutput_weights, doutput_biases) = t.grad(total_loss, [hidden_weights, hidden_biases, hidden2_weights, hidden2_biases, output_weights, output_biases]) 124 | else: 125 | (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases) = t.grad(total_loss, [hidden_weights, hidden_biases, output_weights, output_biases]) 126 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 127 | dcorrect_inputs = t.grad(total_loss, correct_inputs) 128 | dnoise_inputs = t.grad(total_loss, noise_inputs) 129 | #print "REMOVEME", len(dcorrect_inputs) 130 | predict_inputs = correct_inputs 131 | train_inputs = correct_inputs + noise_inputs + [learning_rate] 132 | verbose_predict_inputs = predict_inputs 133 | predict_outputs = [correct_score] 134 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 135 | train_outputs = dcorrect_inputs + dnoise_inputs + [loss, unpenalized_loss, l1penalty, correct_score, noise_score] 136 | else: 137 | train_outputs = [loss, unpenalized_loss, l1penalty, correct_score, noise_score] 138 | verbose_predict_outputs = [correct_score, correct_prehidden] 139 | 140 | import theano.gof.graph 141 | 142 | nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs)) 143 | print "About to compile predict function over %d ops [nodes]..." % nnodes 144 | predict_function = pfunc(predict_inputs, predict_outputs, mode=COMPILE_MODE) 145 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 146 | 147 | nnodes = len(theano.gof.graph.ops(verbose_predict_inputs, verbose_predict_outputs)) 148 | print "About to compile predict function over %d ops [nodes]..." 
% nnodes 149 | verbose_predict_function = pfunc(verbose_predict_inputs, verbose_predict_outputs, mode=COMPILE_MODE) 150 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 151 | 152 | nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs)) 153 | print "About to compile train function over %d ops [nodes]..." % nnodes 154 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 155 | train_function = pfunc(train_inputs, train_outputs, mode=COMPILE_MODE, updates=[(p, p-learning_rate*gp) for p, gp in zip((hidden_weights, hidden_biases, hidden2_weights, hidden2_biases, output_weights, output_biases), (dhidden_weights, dhidden_biases, dhidden2_weights, dhidden2_biases, doutput_weights, doutput_biases))]) 156 | else: 157 | train_function = pfunc(train_inputs, train_outputs, mode=COMPILE_MODE, updates=[(p, p-learning_rate*gp) for p, gp in zip((hidden_weights, hidden_biases, output_weights, output_biases), (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases))]) 158 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 159 | 160 | cached_functions[cachekey] = (predict_function, train_function, verbose_predict_function) 161 | return cached_functions[cachekey] 162 | 163 | #def apply_function(fn, sequence, target_output, parameters): 164 | # assert len(sequence) == parameters.hidden_width 165 | # inputs = [numpy.asarray([token]) for token in sequence] 166 | # if target_output != None: 167 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 168 | ## return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 169 | ## else: 170 | # return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 171 | # else: 172 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 173 | ## return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 174 | ## else: 175 | # return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 176 | # 177 | def predict(correct_sequence): 178 | fn = functions(sequence_length=len(correct_sequence))[0] 179 | # print "REMOVEME", correct_sequence 180 | r = fn(*(correct_sequence)) 181 | assert len(r) == 1 182 | r = r[0] 183 | assert r.shape == (1, 1) 184 | return r[0,0] 185 | def verbose_predict(correct_sequence): 186 | fn = functions(sequence_length=len(correct_sequence))[2] 187 | r = fn(*(correct_sequence)) 188 | assert len(r) == 2 189 | (score, prehidden) = r 190 | assert score.shape == (1, 1) 191 | return score[0,0], prehidden 192 | def train(correct_sequence, noise_sequence, learning_rate): 193 | assert len(correct_sequence) == len(noise_sequence) 194 | fn = functions(sequence_length=len(correct_sequence))[1] 195 | r = fn(*(correct_sequence + noise_sequence + [learning_rate])) 196 | from hyperparameters import HYPERPARAMETERS 197 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 198 | dcorrect_inputs = r[:len(correct_sequence)] 199 | r = r[len(correct_sequence):] 200 | dnoise_inputs = r[:len(noise_sequence)] 201 | r = r[len(correct_sequence):] 202 | # print "REMOVEME", len(dcorrect_inputs), len(dnoise_inputs) 203 | (loss, unpenalized_loss, l1penalty, correct_score, 
noise_score) = r 204 | # if loss == 0: 205 | # for di in [dhidden_weights, dhidden_biases, doutput_weights, doutput_biases]: 206 | # assert (di == 0).all() 207 | 208 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 209 | return (dcorrect_inputs, dnoise_inputs, loss, unpenalized_loss, l1penalty, correct_score, noise_score) 210 | else: 211 | return (loss, unpenalized_loss, l1penalty, correct_score, noise_score) 212 | -------------------------------------------------------------------------------- /scripts/model/graphlbl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Theano graph of Mnih log bi-linear model. 3 | """ 4 | 5 | import theano 6 | import theano.sandbox.cuda 7 | theano.sandbox.cuda.use() 8 | 9 | from theano import tensor as t 10 | from theano import scalar as s 11 | 12 | from theano.tensor.basic import horizontal_stack 13 | from theano.tensor import dot 14 | 15 | from theano import gradient 16 | 17 | import theano.compile 18 | #from miscglobals import LINKER, OPTIMIZER 19 | #mode = theano.compile.Mode(LINKER, OPTIMIZER) 20 | COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run') 21 | #COMPILE_MODE = theano.compile.Mode('py', 'fast_compile') 22 | 23 | import numpy 24 | 25 | from common.chopargs import chopargs 26 | 27 | #output_weights = t.xmatrix() 28 | #output_biases = t.xmatrix() 29 | 30 | # TODO: Include gradient steps in actual function, don't do them manually 31 | 32 | def activation_function(r): 33 | from hyperparameters import HYPERPARAMETERS 34 | if HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "sigmoid": 35 | return sigmoid(r) 36 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "tanh": 37 | return t.tanh(r) 38 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "softsign": 39 | from theano.sandbox.softsign import softsign 40 | return softsign(r) 41 | else: 42 | assert 0 43 | 44 | def stack(x): 45 | """ 46 | Horizontally stack a list of representations, and then compress them to 47 | one representation. 48 | """ 49 | assert len(x) >= 2 50 | return horizontal_stack(*x) 51 | 52 | def score(targetrepr, predictrepr): 53 | # TODO: Is this the right scoring function? 54 | score = dot(targetrepr, predictrepr.T) 55 | return score 56 | 57 | cached_functions = {} 58 | def functions(sequence_length): 59 | """ 60 | Return two functions 61 | * The first function does prediction. 62 | * The second function does learning. 63 | """ 64 | global cached_functions 65 | p = (sequence_length) 66 | if len(cached_functions.keys()) > 1: 67 | # This is problematic because we use global variables for the model parameters. 68 | # Hence, we might be unsafe, if we are using the wrong model parameters globally. 69 | assert 0 70 | if p not in cached_functions: 71 | print "Need to construct graph for sequence_length=%d..." % (sequence_length) 72 | # Create the sequence_length inputs. 73 | # Each is a t.xmatrix(), initial word embeddings (provided by 74 | # Jason + Ronan) to be transformed into an initial representation. 75 | # We could use a vector, but instead we use a matrix with one row. 
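Both graphs train against the same pairwise ranking hinge used in graphcw.py above: loss = max(0, 1 - score(correct window) + score(corrupted window)). A plain numpy sketch of the one-hidden-layer C&W scoring and that margin (shapes and parameter names here are illustrative, not the actual Parameters class):

    import numpy

    # Minimal numpy sketch of the C&W margin loss from graphcw.py: stack the
    # window embeddings, pass them through one softsign hidden layer, score,
    # and penalize when the corrupted window comes within a margin of 1.
    def softsign(x):
        return x / (1.0 + numpy.abs(x))

    def score(window_embeddings, hidden_w, hidden_b, output_w, output_b):
        # window_embeddings: list of 1-D embedding vectors;
        # hidden_w: (window*emb, hidden); output_w: (hidden,)
        x = numpy.concatenate(window_embeddings)        # stack() in the graph
        hidden = softsign(numpy.dot(x, hidden_w) + hidden_b)
        return float(numpy.dot(hidden, output_w) + output_b)

    def hinge_loss(correct_score, noise_score):
        return max(0.0, 1.0 - correct_score + noise_score)

    # e.g. hinge_loss(1.2, 0.9) == 0.7, while hinge_loss(2.5, 0.3) == 0.0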
76 | sequence = [t.xmatrix() for i in range(sequence_length)] 77 | correct_repr = t.xmatrix() 78 | noise_repr = t.xmatrix() 79 | # correct_scorebias = t.xscalar() 80 | # noise_scorebias = t.xscalar() 81 | correct_scorebias = t.xvector() 82 | noise_scorebias = t.xvector() 83 | 84 | stackedsequence = stack(sequence) 85 | predictrepr = dot(stackedsequence, output_weights) + output_biases 86 | 87 | correct_score = score(correct_repr, predictrepr) + correct_scorebias 88 | noise_score = score(noise_repr, predictrepr) + noise_scorebias 89 | loss = t.clip(1 - correct_score + noise_score, 0, 1e999) 90 | 91 | (doutput_weights, doutput_biases) = t.grad(loss, [output_weights, output_biases]) 92 | dsequence = t.grad(loss, sequence) 93 | (dcorrect_repr, dnoise_repr) = t.grad(loss, [correct_repr, noise_repr]) 94 | (dcorrect_scorebias, dnoise_scorebias) = t.grad(loss, [correct_scorebias, noise_scorebias]) 95 | #print "REMOVEME", len(dcorrect_inputs) 96 | predict_inputs = sequence + [correct_repr, correct_scorebias, output_weights, output_biases] 97 | train_inputs = sequence + [correct_repr, noise_repr, correct_scorebias, noise_scorebias, output_weights, output_biases] 98 | predict_outputs = [predictrepr, correct_score] 99 | train_outputs = [loss, predictrepr, correct_score, noise_score] + dsequence + [dcorrect_repr, dnoise_repr, doutput_weights, doutput_biases, dcorrect_scorebias, dnoise_scorebias] 100 | # train_outputs = [loss, correct_repr, correct_score, noise_repr, noise_score] 101 | 102 | import theano.gof.graph 103 | 104 | nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs)) 105 | print "About to compile predict function over %d ops [nodes]..." % nnodes 106 | predict_function = theano.function(predict_inputs, predict_outputs, mode=COMPILE_MODE) 107 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 108 | 109 | nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs)) 110 | print "About to compile train function over %d ops [nodes]..." 
% nnodes 111 | train_function = theano.function(train_inputs, train_outputs, mode=COMPILE_MODE) 112 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 113 | 114 | cached_functions[p] = (predict_function, train_function) 115 | return cached_functions[p] 116 | 117 | #def apply_function(fn, sequence, target_output, parameters): 118 | # assert len(sequence) == parameters.hidden_width 119 | # inputs = [numpy.asarray([token]) for token in sequence] 120 | # if target_output != None: 121 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 122 | ## return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 123 | ## else: 124 | # return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 125 | # else: 126 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 127 | ## return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 128 | ## else: 129 | # return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 130 | # 131 | 132 | def predict(sequence, targetrepr, target_scorebias): 133 | fn = functions(sequence_length=len(sequence))[0] 134 | (predictrepr, score) = fn(*(sequence + [targetrepr, target_scorebias])) 135 | return predictrepr, score 136 | 137 | def train(sequence, correct_repr, noise_repr, correct_scorebias, noise_scorebias, learning_rate): 138 | fn = functions(sequence_length=len(sequence))[1] 139 | # print "REMOVEME", correct_scorebias, noise_scorebias 140 | # print "REMOVEME", correct_scorebias[0], noise_scorebias[0] 141 | r = fn(*(sequence + [correct_repr, noise_repr, correct_scorebias, noise_scorebias])) 142 | 143 | (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, doutput_weights, doutput_biases, dcorrect_scorebias, dnoise_scorebias) = chopargs(r, (0,0,0,0,len(sequence),0,0,0,0,0,0)) 144 | if loss == 0: 145 | for di in [doutput_weights, doutput_biases]: 146 | # This tends to trigger if training diverges (NaN) 147 | assert (di == 0).all() 148 | 149 | parameters.output_weights -= 1.0 * learning_rate * doutput_weights 150 | parameters.output_biases -= 1.0 * learning_rate * doutput_biases 151 | 152 | # You also need to update score_biases here 153 | assert 0 154 | 155 | dsequence = list(dsequence) 156 | return (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, dcorrect_scorebias, dnoise_scorebias) 157 | -------------------------------------------------------------------------------- /scripts/model/model.py: -------------------------------------------------------------------------------- 1 | from parameters import Parameters 2 | 3 | from hyperparameters import HYPERPARAMETERS 4 | LBL = HYPERPARAMETERS["LOG BILINEAR MODEL"] 5 | 6 | if LBL: 7 | import graphlbl as graph 8 | else: 9 | import graphcw as graph 10 | 11 | import sys, pickle 12 | import math 13 | import logging 14 | 15 | from common.file import myopen 16 | from common.movingaverage import MovingAverage 17 | 18 | from vocabulary import * 19 | 20 | class Model: 21 | """ 22 | A Model can: 23 | 24 | @type parameters: L{Parameters} 25 | @todo: Document 26 | """ 27 | 28 | import 
hyperparameters 29 | import miscglobals 30 | import vocabulary 31 | def __init__(self, modelname="", window_size=HYPERPARAMETERS["WINDOW_SIZE"], vocab_size=vocabulary.wordmap().len, embedding_size=HYPERPARAMETERS["EMBEDDING_SIZE"], hidden_size=HYPERPARAMETERS["HIDDEN_SIZE"], seed=miscglobals.RANDOMSEED, initial_embeddings=None, two_hidden_layers=HYPERPARAMETERS["TWO_HIDDEN_LAYERS"]): 32 | self.modelname = modelname 33 | self.parameters = Parameters(window_size, vocab_size, embedding_size, hidden_size, seed, initial_embeddings, two_hidden_layers) 34 | if LBL: 35 | graph.output_weights = self.parameters.output_weights 36 | graph.output_biases = self.parameters.output_biases 37 | graph.score_biases = self.parameters.score_biases 38 | else: 39 | graph.hidden_weights = self.parameters.hidden_weights 40 | graph.hidden_biases = self.parameters.hidden_biases 41 | if self.parameters.two_hidden_layers: 42 | graph.hidden2_weights = self.parameters.hidden2_weights 43 | graph.hidden2_biases = self.parameters.hidden2_biases 44 | graph.output_weights = self.parameters.output_weights 45 | graph.output_biases = self.parameters.output_biases 46 | 47 | # (self.graph_train, self.graph_predict, self.graph_verbose_predict) = graph.functions(self.parameters) 48 | import sets 49 | self.train_loss = MovingAverage() 50 | self.train_err = MovingAverage() 51 | self.train_lossnonzero = MovingAverage() 52 | self.train_squashloss = MovingAverage() 53 | self.train_unpenalized_loss = MovingAverage() 54 | self.train_l1penalty = MovingAverage() 55 | self.train_unpenalized_lossnonzero = MovingAverage() 56 | self.train_correct_score = MovingAverage() 57 | self.train_noise_score = MovingAverage() 58 | self.train_cnt = 0 59 | 60 | def __getstate__(self): 61 | return (self.modelname, self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) 62 | 63 | def __setstate__(self, state): 64 | (self.modelname, self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) = state 65 | if LBL: 66 | graph.output_weights = self.parameters.output_weights 67 | graph.output_biases = self.parameters.output_biases 68 | graph.score_biases = self.parameters.score_biases 69 | else: 70 | graph.hidden_weights = self.parameters.hidden_weights 71 | graph.hidden_biases = self.parameters.hidden_biases 72 | if self.parameters.two_hidden_layers: 73 | graph.hidden2_weights = self.parameters.hidden2_weights 74 | graph.hidden2_biases = self.parameters.hidden2_biases 75 | graph.output_weights = self.parameters.output_weights 76 | graph.output_biases = self.parameters.output_biases 77 | 78 | # def load(self, filename): 79 | # sys.stderr.write("Loading model from: %s\n" % filename) 80 | # f = myopen(filename, "rb") 81 | # (self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) = pickle.load(f) 82 | # if LBL: 83 | # graph.output_weights = self.parameters.output_weights 84 | # graph.output_biases = self.parameters.output_biases 85 | # graph.score_biases = self.parameters.score_biases 86 | # else: 87 
| # graph.hidden_weights = self.parameters.hidden_weights 88 | # graph.hidden_biases = self.parameters.hidden_biases 89 | # graph.output_weights = self.parameters.output_weights 90 | # graph.output_biases = self.parameters.output_biases 91 | # 92 | # def save(self, filename): 93 | # sys.stderr.write("Saving model to: %s\n" % filename) 94 | # f = myopen(filename, "wb") 95 | # pickle.dump((self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt), f) 96 | 97 | def embed(self, sequence): 98 | """ 99 | Embed a sequence of vocabulary IDs 100 | """ 101 | seq = [self.parameters.embeddings[s] for s in sequence] 102 | import numpy 103 | return [numpy.resize(s, (1, s.size)) for s in seq] 104 | # return [self.parameters.embeddings[s] for s in sequence] 105 | 106 | def embeds(self, sequences): 107 | """ 108 | Embed sequences of vocabulary IDs. 109 | If we are given a list of MINIBATCH lists of SEQLEN items, return a list of SEQLEN matrices of shape (MINIBATCH, EMBSIZE) 110 | """ 111 | embs = [] 112 | for sequence in sequences: 113 | embs.append(self.embed(sequence)) 114 | 115 | for emb in embs: assert len(emb) == len(embs[0]) 116 | 117 | new_embs = [] 118 | for i in range(len(embs[0])): 119 | colembs = [embs[j][i] for j in range(len(embs))] 120 | import numpy 121 | new_embs.append(numpy.vstack(colembs)) 122 | assert new_embs[-1].shape == (len(sequences), self.parameters.embedding_size) 123 | assert len(new_embs) == len(sequences[0]) 124 | return new_embs 125 | 126 | def train(self, correct_sequences, noise_sequences, weights): 127 | from hyperparameters import HYPERPARAMETERS 128 | learning_rate = HYPERPARAMETERS["LEARNING_RATE"] 129 | 130 | # All weights must be the same, because of how we use a scalar learning rate 131 | assert HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"] 132 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 133 | for w in weights: assert w == weights[0] 134 | 135 | if LBL: 136 | # REWRITE FOR MINIBATCH 137 | assert 0 138 | 139 | # noise_repr = noise_sequence[-1] 140 | # correct_repr = correct_sequence[-1] 141 | noise_repr = noise_sequence[-1:] 142 | correct_repr = correct_sequence[-1:] 143 | assert noise_repr != correct_repr 144 | assert noise_sequence[:-1] == correct_sequence[:-1] 145 | sequence = correct_sequence[:-1] 146 | # r = graph.train(self.embed(sequence), self.embed([correct_repr])[0], self.embed([noise_repr])[0], self.parameters.score_biases[correct_repr], self.parameters.score_biases[noise_repr]) 147 | r = graph.train(self.embed(sequence), self.embed(correct_repr)[0], self.embed(noise_repr)[0], self.parameters.score_biases[correct_repr], self.parameters.score_biases[noise_repr], learning_rate * weight) 148 | assert len(noise_repr) == 1 149 | assert len(correct_repr) == 1 150 | noise_repr = noise_repr[0] 151 | correct_repr = correct_repr[0] 152 | (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, dcorrect_scorebias, dnoise_scorebias) = r 153 | # print 154 | # print "loss = ", loss 155 | # print "predictrepr = ", predictrepr 156 | # print "correct_repr = ", correct_repr, self.embed(correct_repr)[0] 157 | # print "noise_repr = ", noise_repr, self.embed(noise_repr)[0] 158 | # print "correct_score = ", correct_score 159 | # print "noise_score = ", noise_score 160 | else: 161 | r = graph.train(self.embeds(correct_sequences), self.embeds(noise_sequences), 
learning_rate * weights[0]) 162 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 163 | (dcorrect_inputss, dnoise_inputss, losss, unpenalized_losss, l1penaltys, correct_scores, noise_scores) = r 164 | else: 165 | (losss, unpenalized_losss, l1penaltys, correct_scores, noise_scores) = r 166 | # print [d.shape for d in dcorrect_inputss] 167 | # print [d.shape for d in dnoise_inputss] 168 | # print "losss", losss.shape, losss 169 | # print "unpenalized_losss", unpenalized_losss.shape, unpenalized_losss 170 | # print "l1penaltys", l1penaltys.shape, l1penaltys 171 | # print "correct_scores", correct_scores.shape, correct_scores 172 | # print "noise_scores", noise_scores.shape, noise_scores 173 | 174 | import sets 175 | to_normalize = sets.Set() 176 | for ecnt in range(len(correct_sequences)): 177 | (loss, unpenalized_loss, correct_score, noise_score) = \ 178 | (losss[ecnt], unpenalized_losss[ecnt], correct_scores[ecnt], noise_scores[ecnt]) 179 | if l1penaltys.shape == (): 180 | assert l1penaltys == 0 181 | l1penalty = 0 182 | else: 183 | l1penalty = l1penaltys[ecnt] 184 | correct_sequence = correct_sequences[ecnt] 185 | noise_sequence = noise_sequences[ecnt] 186 | 187 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 188 | dcorrect_inputs = [d[ecnt] for d in dcorrect_inputss] 189 | dnoise_inputs = [d[ecnt] for d in dnoise_inputss] 190 | 191 | # print [d.shape for d in dcorrect_inputs] 192 | # print [d.shape for d in dnoise_inputs] 193 | # print "loss", loss.shape, loss 194 | # print "unpenalized_loss", unpenalized_loss.shape, unpenalized_loss 195 | # print "l1penalty", l1penalty.shape, l1penalty 196 | # print "correct_score", correct_score.shape, correct_score 197 | # print "noise_score", noise_score.shape, noise_score 198 | 199 | 200 | self.train_loss.add(loss) 201 | self.train_err.add(correct_score <= noise_score) 202 | self.train_lossnonzero.add(loss > 0) 203 | squashloss = 1./(1.+math.exp(-loss)) 204 | self.train_squashloss.add(squashloss) 205 | if not LBL: 206 | self.train_unpenalized_loss.add(unpenalized_loss) 207 | self.train_l1penalty.add(l1penalty) 208 | self.train_unpenalized_lossnonzero.add(unpenalized_loss > 0) 209 | self.train_correct_score.add(correct_score) 210 | self.train_noise_score.add(noise_score) 211 | 212 | self.train_cnt += 1 213 | if self.train_cnt % 10000 == 0: 214 | # if self.train_cnt % 1000 == 0: 215 | # print self.train_cnt 216 | # graph.COMPILE_MODE.print_summary() 217 | logging.info(("After %d updates, pre-update train loss %s" % (self.train_cnt, self.train_loss.verbose_string()))) 218 | logging.info(("After %d updates, pre-update train error %s" % (self.train_cnt, self.train_err.verbose_string()))) 219 | logging.info(("After %d updates, pre-update train Pr(loss != 0) %s" % (self.train_cnt, self.train_lossnonzero.verbose_string()))) 220 | logging.info(("After %d updates, pre-update train squash(loss) %s" % (self.train_cnt, self.train_squashloss.verbose_string()))) 221 | if not LBL: 222 | logging.info(("After %d updates, pre-update train unpenalized loss %s" % (self.train_cnt, self.train_unpenalized_loss.verbose_string()))) 223 | logging.info(("After %d updates, pre-update train l1penalty %s" % (self.train_cnt, self.train_l1penalty.verbose_string()))) 224 | logging.info(("After %d updates, pre-update train Pr(unpenalized loss != 0) %s" % (self.train_cnt, self.train_unpenalized_lossnonzero.verbose_string()))) 225 | logging.info(("After %d updates, pre-update train correct score %s" % (self.train_cnt, self.train_correct_score.verbose_string()))) 226 | 
logging.info(("After %d updates, pre-update train noise score %s" % (self.train_cnt, self.train_noise_score.verbose_string()))) 227 | 228 | self.debug_prehidden_values(correct_sequences) 229 | 230 | if LBL: 231 | i = 1. 232 | while i < wordmap.len: 233 | inti = int(i) 234 | str = "word %s, rank %d, score %f" % (wordmap.str(inti), inti, self.parameters.score_biases[inti]) 235 | logging.info("After %d updates, score biases: %s" % (self.train_cnt, str)) 236 | i *= 3.2 237 | 238 | # print(("After %d updates, pre-update train loss %s" % (self.train_cnt, self.train_loss.verbose_string()))) 239 | # print(("After %d updates, pre-update train error %s" % (self.train_cnt, self.train_err.verbose_string()))) 240 | 241 | 242 | # All weights must be the same, because of how we use a scalar learning rate 243 | assert HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"] 244 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 245 | for w in weights: assert w == weights[0] 246 | embedding_learning_rate = HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] * weights[0] 247 | if loss == 0: 248 | if LBL: 249 | for di in dsequence + [dcorrect_repr, dnoise_repr]: 250 | # This tends to trigger if training diverges (NaN) 251 | assert (di == 0).all() 252 | # if not (di == 0).all(): 253 | # print "WARNING:", di 254 | # print "WARNING in ", dsequence + [dcorrect_repr, dnoise_repr] 255 | # print "loss = ", loss 256 | # print "predictrepr = ", predictrepr 257 | # print "correct_repr = ", correct_repr, self.embed(correct_repr)[0] 258 | # print "noise_repr = ", noise_repr, self.embed(noise_repr)[0] 259 | # print "correct_score = ", correct_score 260 | # print "noise_score = ", noise_score 261 | else: 262 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 263 | for di in dcorrect_inputs + dnoise_inputs: 264 | assert (di == 0).all() 265 | 266 | if loss != 0: 267 | if LBL: 268 | val = sequence + [correct_repr, noise_repr] 269 | dval = dsequence + [dcorrect_repr, dnoise_repr] 270 | # print val 271 | for (i, di) in zip(val, dval): 272 | # for (i, di) in zip(tuple(sequence + [correct_repr, noise_repr]), tuple(dsequence + [dcorrect_repr, dnoise_repr])): 273 | assert di.shape[0] == 1 274 | di.resize(di.size) 275 | # print i, di 276 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 277 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 278 | to_normalize.add(i) 279 | 280 | for (i, di) in zip([correct_repr, noise_repr], [dcorrect_scorebias, dnoise_scorebias]): 281 | self.parameters.score_biases[i] -= 1.0 * embedding_learning_rate * di 282 | # print "REMOVEME", i, self.parameters.score_biases[i] 283 | else: 284 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 285 | for (i, di) in zip(correct_sequence, dcorrect_inputs): 286 | # assert di.shape[0] == 1 287 | # di.resize(di.size) 288 | # print i, di 289 | assert di.shape == (self.parameters.embedding_size,) 290 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 291 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 292 | to_normalize.add(i) 293 | for (i, di) in zip(noise_sequence, dnoise_inputs): 294 | # assert di.shape[0] == 1 295 | # di.resize(di.size) 296 | # print i, di 297 | assert di.shape == (self.parameters.embedding_size,) 298 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 299 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 300 | to_normalize.add(i) 301 | # print to_normalize 302 | 303 | if len(to_normalize) > 0: 304 | to_normalize = [i for i in to_normalize] 305 | # print "NORMALIZING", to_normalize 306 | 
self.parameters.normalize(to_normalize) 307 | 308 | 309 | 310 | def predict(self, sequence): 311 | if LBL: 312 | targetrepr = sequence[-1:] 313 | sequence = sequence[:-1] 314 | (predictrepr, score) = graph.predict(self.embed(sequence), self.embed(targetrepr)[0], self.parameters.score_biases[targetrepr], self.parameters) 315 | return score 316 | else: 317 | (score) = graph.predict(self.embed(sequence), self.parameters) 318 | return score 319 | 320 | def verbose_predict(self, sequence): 321 | if LBL: 322 | assert 0 323 | else: 324 | (score, prehidden) = graph.verbose_predict(self.embed(sequence)) 325 | return score, prehidden 326 | 327 | def debug_prehidden_values(self, sequences): 328 | """ 329 | Give debug output on pre-squash hidden values. 330 | """ 331 | import numpy 332 | for (i, ve) in enumerate(sequences): 333 | (score, prehidden) = self.verbose_predict(ve) 334 | abs_prehidden = numpy.abs(prehidden) 335 | med = numpy.median(abs_prehidden) 336 | abs_prehidden = abs_prehidden.tolist() 337 | assert len(abs_prehidden) == 1 338 | abs_prehidden = abs_prehidden[0] 339 | abs_prehidden.sort() 340 | abs_prehidden.reverse() 341 | 342 | logging.info("model %s, %s %s %s %s %s" % (self.modelname, self.train_cnt, "abs(pre-squash hidden) median =", med, "max =", abs_prehidden[:3])) 343 | if i+1 >= 3: break 344 | 345 | def validate(self, sequence): 346 | """ 347 | Get the rank of this final word, as opposed to all other words in the vocabulary. 348 | """ 349 | import random 350 | r = random.Random() 351 | r.seed(0) 352 | from hyperparameters import HYPERPARAMETERS 353 | 354 | import copy 355 | corrupt_sequence = copy.copy(sequence) 356 | rank = 1 357 | correct_score = self.predict(sequence) 358 | # print "CORRECT", correct_score, [wordmap.str(id) for id in sequence] 359 | for i in range(self.parameters.vocab_size): 360 | if r.random() > HYPERPARAMETERS["PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK"]: continue 361 | if i == sequence[-1]: continue 362 | corrupt_sequence[-1] = i 363 | corrupt_score = self.predict(corrupt_sequence) 364 | if correct_score <= corrupt_score: 365 | # print " CORRUPT", corrupt_score, [wordmap.str(id) for id in corrupt_sequence] 366 | rank += 1 367 | return rank 368 | 369 | def validate_errors(self, correct_sequences, noise_sequences): 370 | """ 371 | Count the errors in this validation batch. 
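        The returned value is a boolean array with one entry per minibatch
        example, True where the correct sequence outscores its noise
        counterpart, so the validation error rate is the fraction of False
        entries. A hypothetical usage sketch (names are illustrative):
            oks = m.validate_errors(correct_sequences, noise_sequences)
            error_rate = 1.0 - numpy.mean(oks)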
372 | """ 373 | 374 | # r = graph.train(self.embeds(correct_sequences), self.embeds(noise_sequences), learning_rate * weights[0]) 375 | correct_scores = graph.predict(self.embeds(correct_sequences)) 376 | noise_scores = graph.predict(self.embeds(noise_sequences)) 377 | 378 | # print correct_scores 379 | # print noise_scores 380 | return correct_scores > noise_scores 381 | ## print "CORRECT", correct_score, [wordmap.str(id) for id in sequence] 382 | # for i in range(self.parameters.vocab_size): 383 | # if r.random() > HYPERPARAMETERS["PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK"]: continue 384 | # if i == sequence[-1]: continue 385 | # corrupt_sequence[-1] = i 386 | # corrupt_score = self.predict(corrupt_sequence) 387 | # if correct_score <= corrupt_score: 388 | ## print " CORRUPT", corrupt_score, [wordmap.str(id) for id in corrupt_sequence] 389 | # rank += 1 390 | # return rank 391 | -------------------------------------------------------------------------------- /scripts/model/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | @todo: WRITEME 3 | """ 4 | 5 | from theano import config 6 | from theano.compile.sandbox import shared 7 | 8 | import copy 9 | 10 | floatX = config.floatX 11 | 12 | from hyperparameters import HYPERPARAMETERS 13 | LBL = HYPERPARAMETERS["LOG BILINEAR MODEL"] 14 | 15 | class Parameters: 16 | """ 17 | Parameters used by the L{Model}. 18 | @todo: Document these 19 | """ 20 | 21 | def __init__(self, window_size, vocab_size, embedding_size, hidden_size, seed, initial_embeddings, two_hidden_layers): 22 | """ 23 | Initialize L{Model} parameters. 24 | """ 25 | 26 | self.vocab_size = vocab_size 27 | self.window_size = window_size 28 | self.embedding_size = embedding_size 29 | self.two_hidden_layers = two_hidden_layers 30 | if LBL: 31 | self.hidden_size = hidden_size 32 | self.output_size = self.embedding_size 33 | else: 34 | self.hidden_size = hidden_size 35 | self.output_size = 1 36 | 37 | import numpy 38 | import hyperparameters 39 | 40 | from pylearn.algorithms.weights import random_weights 41 | numpy.random.seed(seed) 42 | if initial_embeddings is None: 43 | self.embeddings = numpy.asarray((numpy.random.rand(self.vocab_size, HYPERPARAMETERS["EMBEDDING_SIZE"]) - 0.5)*2 * HYPERPARAMETERS["INITIAL_EMBEDDING_RANGE"], dtype=floatX) 44 | else: 45 | assert initial_embeddings.shape == (self.vocab_size, HYPERPARAMETERS["EMBEDDING_SIZE"]) 46 | self.embeddings = copy.copy(initial_embeddings) 47 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: self.normalize(range(self.vocab_size)) 48 | if LBL: 49 | self.output_weights = shared(numpy.asarray(random_weights(self.input_size, self.output_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 50 | self.output_biases = shared(numpy.asarray(numpy.zeros((1, self.output_size)), dtype=floatX)) 51 | self.score_biases = shared(numpy.asarray(numpy.zeros(self.vocab_size), dtype=floatX)) 52 | assert not self.two_hidden_layers 53 | else: 54 | self.hidden_weights = shared(numpy.asarray(random_weights(self.input_size, self.hidden_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 55 | self.hidden_biases = shared(numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX)) 56 | if self.two_hidden_layers: 57 | self.hidden2_weights = shared(numpy.asarray(random_weights(self.hidden_size, self.hidden_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 58 | self.hidden2_biases = 
shared(numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX)) 59 | self.output_weights = shared(numpy.asarray(random_weights(self.hidden_size, self.output_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 60 | self.output_biases = shared(numpy.asarray(numpy.zeros((self.output_size,)), dtype=floatX)) 61 | 62 | input_size = property(lambda self: 63 | LBL*((self.window_size-1) * self.embedding_size) + (1-LBL)*(self.window_size * self.embedding_size)) 64 | 65 | def normalize(self, indices): 66 | """ 67 | Normalize such that the l2 norm of the embeddings indices passed in. 68 | @todo: l1 norm? 69 | @return: The normalized embeddings 70 | """ 71 | import numpy 72 | l2norm = numpy.square(self.embeddings[indices]).sum(axis=1) 73 | l2norm = numpy.sqrt(l2norm.reshape((len(indices), 1))) 74 | 75 | self.embeddings[indices] /= l2norm 76 | import math 77 | self.embeddings[indices] *= math.sqrt(self.embeddings.shape[1]) 78 | 79 | # TODO: Assert that norm is correct 80 | # l2norm = (embeddings * embeddings).sum(axis=1) 81 | # print l2norm.shape 82 | # print (l2norm == numpy.ones((vocabsize)) * HYPERPARAMETERS["EMBEDDING_SIZE"]) 83 | # print (l2norm == numpy.ones((vocabsize)) * HYPERPARAMETERS["EMBEDDING_SIZE"]).all() 84 | -------------------------------------------------------------------------------- /scripts/monolingual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/monolingual/__init__.py -------------------------------------------------------------------------------- /scripts/monolingual/build-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == "__main__": 4 | import common.hyperparameters, common.options 5 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 6 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 7 | import hyperparameters 8 | 9 | import vocabulary 10 | import common.idmap 11 | 12 | words = [] 13 | 14 | import string 15 | for i, l in enumerate(common.file.myopen(HYPERPARAMETERS["MONOLINGUAL_VOCABULARY"])): 16 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] and i+1 >= HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"]: 17 | break 18 | if not HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] and i >= HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"]: 19 | break 20 | (cnt, w) = string.split(l) 21 | words.append(w) 22 | 23 | v = common.idmap.IDmap(words, allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]) 24 | assert v.len == HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 25 | vocabulary.write(v) 26 | -------------------------------------------------------------------------------- /scripts/monolingual/corrupt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for corrupting examples. 3 | """ 4 | 5 | def corrupt_example(model, e): 6 | """ 7 | Return a corrupted version of example e, plus the weight of this example. 
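    The weight is an importance weight, 1/Pr(noise word) under the chosen
    noise distribution: uniform (0-gram) noise gives every example the same
    weight, while unigram noise up-weights examples whose noise word was
    unlikely to be sampled. A minimal usage sketch:
        noise_e, weight = corrupt_example(model, e)
        # noise_e differs from e only in its final (focus) position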
8 | """ 9 | from hyperparameters import HYPERPARAMETERS 10 | import random 11 | import copy 12 | e = copy.copy(e) 13 | last = e[-1] 14 | cnt = 0 15 | while e[-1] == last: 16 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 17 | e[-1] = random.randint(0, model.parameters.vocab_size-1) 18 | pr = 1./model.parameters.vocab_size 19 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 20 | import noise 21 | from common.myrandom import weighted_sample 22 | e[-1], pr = weighted_sample(noise.indexed_weights()) 23 | # from vocabulary import wordmap 24 | # print wordmap.str(e[-1]), pr 25 | else: 26 | assert 0 27 | cnt += 1 28 | # Backoff to 0gram smoothing if we fail 10 times to get noise. 29 | if cnt > 10: e[-1] = random.randint(0, model.parameters.vocab_size-1) 30 | weight = 1./pr 31 | return e, weight 32 | 33 | def corrupt_examples(model, correct_sequences): 34 | noise_sequences = [] 35 | weights = [] 36 | for e in correct_sequences: 37 | noise_sequence, weight = model.corrupt_example(e) 38 | noise_sequences.append(noise_sequence) 39 | weights.append(weight) 40 | return noise_sequences, weights 41 | -------------------------------------------------------------------------------- /scripts/monolingual/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for getting examples. 3 | """ 4 | 5 | from common.stats import stats 6 | from common.file import myopen 7 | import string 8 | 9 | import common.hyperparameters 10 | import sys 11 | 12 | class TrainingExampleStream(object): 13 | def __init__(self): 14 | self.count = 0 15 | pass 16 | 17 | def __iter__(self): 18 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 19 | from vocabulary import wordmap 20 | self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"] 21 | self.count = 0 22 | for l in myopen(self.filename): 23 | prevwords = [] 24 | for w in string.split(l): 25 | w = string.strip(w) 26 | id = None 27 | if wordmap.exists(w): 28 | prevwords.append(wordmap.id(w)) 29 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 30 | self.count += 1 31 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 32 | else: 33 | # If we can learn an unknown word token, we should 34 | # delexicalize the word, not discard the example! 35 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]: assert 0 36 | prevwords = [] 37 | 38 | def __getstate__(self): 39 | return self.filename, self.count 40 | 41 | def __setstate__(self, state): 42 | """ 43 | @warning: We ignore the filename. If we wanted 44 | to be really fastidious, we would assume that 45 | HYPERPARAMETERS["TRAIN_SENTENCES"] might change. The only 46 | problem is that if we change filesystems, the filename 47 | might change just because the base file is in a different 48 | path. So we issue a warning if the filename is different from 49 | """ 50 | filename, count = state 51 | print >> sys.stderr, ("__setstate__(%s)..." 
% `state`) 52 | print >> sys.stderr, (stats()) 53 | iter = self.__iter__() 54 | while count != self.count: 55 | # print count, self.count 56 | iter.next() 57 | if self.filename != filename: 58 | assert self.filename == HYPERPARAMETERS["TRAIN_SENTENCES"] 59 | print >> sys.stderr, ("self.filename %s != filename given to __setstate__ %s" % (self.filename, filename)) 60 | print >> sys.stderr, ("...__setstate__(%s)" % `state`) 61 | print >> sys.stderr, (stats()) 62 | 63 | class TrainingMinibatchStream(object): 64 | def __init__(self): 65 | pass 66 | 67 | def __iter__(self): 68 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 69 | minibatch = [] 70 | self.get_train_example = TrainingExampleStream() 71 | for e in self.get_train_example: 72 | # print self.get_train_example.__getstate__() 73 | minibatch.append(e) 74 | if len(minibatch) >= HYPERPARAMETERS["MINIBATCH SIZE"]: 75 | assert len(minibatch) == HYPERPARAMETERS["MINIBATCH SIZE"] 76 | yield minibatch 77 | minibatch = [] 78 | 79 | def __getstate__(self): 80 | return (self.get_train_example.__getstate__(),) 81 | 82 | def __setstate__(self, state): 83 | """ 84 | @warning: We ignore the filename. 85 | """ 86 | self.get_train_example = TrainingExampleStream() 87 | self.get_train_example.__setstate__(state[0]) 88 | 89 | def get_validation_example(): 90 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 91 | 92 | from vocabulary import wordmap 93 | for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]): 94 | prevwords = [] 95 | for w in string.split(l): 96 | w = string.strip(w) 97 | if wordmap.exists(w): 98 | prevwords.append(wordmap.id(w)) 99 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 100 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 101 | else: 102 | # If we can learn an unknown word token, we should 103 | # delexicalize the word, not discard the example! 104 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]: assert 0 105 | prevwords = [] 106 | -------------------------------------------------------------------------------- /scripts/monolingual/noise.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sophisticated training noise. 3 | """ 4 | 5 | from vocabulary import wordmap 6 | 7 | from common.myrandom import build 8 | import sys 9 | 10 | _indexed_weights = None 11 | def indexed_weights(): 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | global _indexed_weights 15 | if _indexed_weights is not None: 16 | return _indexed_weights 17 | print >> sys.stderr, wordmap.len, "=?=", HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 18 | assert wordmap.len == HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 19 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 20 | _indexed_weights = [1 for id in range(wordmap.len)] 21 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 22 | from common.json import load 23 | from common.file import myopen 24 | ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"], HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"])] 25 | print >> sys.stderr, "Reading ngrams from", ngrams_file, "..." 
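        # A sketch of the intent of the block below: read the unigram
        # (ngram, count) pairs from the JSON ngrams file, smooth each count by
        # TRAINING_NOISE_SMOOTHING_ADDITION, index the smoothed counts by word
        # id, and hand the list to common.myrandom.build so that
        # corrupt.corrupt_example can draw noise words via weighted_sample.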
26 | from collections import defaultdict 27 | ngramcnt = defaultdict(int) 28 | for (ngram, cnt) in load(myopen(ngrams_file)): 29 | assert len(ngram) == 1 30 | ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"] 31 | _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(wordmap.len)] 32 | _indexed_weights = build(_indexed_weights) 33 | else: assert 0 34 | return _indexed_weights 35 | -------------------------------------------------------------------------------- /scripts/monolingual/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save and load training state. 3 | @todo: Training state variables (cnt, epoch, trainstate) should all be combined into one object. 4 | """ 5 | 6 | import logging 7 | import os.path 8 | import cPickle 9 | 10 | from common.stats import stats 11 | from common.file import myopen 12 | import sys 13 | 14 | _lastfilename = None 15 | def save(model, cnt, epoch, trainstate, rundir, newkeystr): 16 | global _lastfilename 17 | 18 | filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr)) 19 | logging.info("Writing model to %s..." % filename) 20 | logging.info(stats()) 21 | cPickle.dump(model, myopen(filename, "wb"), protocol=-1) 22 | logging.info("...done writing model to %s" % filename) 23 | logging.info(stats()) 24 | 25 | if _lastfilename is not None: 26 | logging.info("Removing old model %s..." % _lastfilename) 27 | try: 28 | os.remove(_lastfilename) 29 | logging.info("...removed %s" % _lastfilename) 30 | except: 31 | logging.info("Could NOT remove %s" % _lastfilename) 32 | _lastfilename = filename 33 | 34 | filename = os.path.join(rundir, "trainstate.pkl") 35 | cPickle.dump((trainstate, cnt, epoch), myopen(filename, "wb"), protocol=-1) 36 | 37 | filename = os.path.join(rundir, "newkeystr.txt") 38 | myopen(filename, "wt").write(newkeystr) 39 | 40 | def load(rundir, newkeystr): 41 | """ 42 | Read the directory and load the model, the training count, the training epoch, and the training state. 43 | """ 44 | global _lastfilename 45 | 46 | filename = os.path.join(rundir, "newkeystr.txt") 47 | assert newkeystr == myopen(filename).read() 48 | 49 | filename = os.path.join(rundir, "trainstate.pkl") 50 | (trainstate, cnt, epoch) = cPickle.load(myopen(filename)) 51 | 52 | filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr)) 53 | print >> sys.stderr, ("Reading model from %s..." 
% filename) 54 | print >> sys.stderr, (stats()) 55 | model = cPickle.load(myopen(filename)) 56 | print >> sys.stderr, ("...done reading model from %s" % filename) 57 | print >> sys.stderr, (stats()) 58 | _lastfilename = filename 59 | 60 | return (model, cnt, epoch, trainstate) 61 | -------------------------------------------------------------------------------- /scripts/monolingual/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import string 5 | import common.dump 6 | from common.file import myopen 7 | from common.stats import stats 8 | 9 | import miscglobals 10 | import logging 11 | 12 | import examples 13 | import diagnostics 14 | import state 15 | 16 | def validate(cnt): 17 | import math 18 | logranks = [] 19 | logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt) 20 | logging.info(stats()) 21 | i = 0 22 | for (i, ve) in enumerate(examples.get_validation_example()): 23 | # logging.info([wordmap.str(id) for id in ve]) 24 | logranks.append(math.log(m.validate(ve))) 25 | if (i+1) % 10 == 0: 26 | logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)))) 27 | logging.info(stats()) 28 | logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1)) 29 | logging.info(stats()) 30 | # print "FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1) 31 | # print stats() 32 | 33 | if __name__ == "__main__": 34 | import common.hyperparameters, common.options 35 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 36 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 37 | import hyperparameters 38 | 39 | from common import myyaml 40 | import sys 41 | print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals])) 42 | 43 | import noise 44 | indexed_weights = noise.indexed_weights() 45 | 46 | from rundir import rundir 47 | rundir = rundir() 48 | 49 | import os.path, os 50 | logfile = os.path.join(rundir, "log") 51 | if newkeystr != "": 52 | verboselogfile = os.path.join(rundir, "log%s" % newkeystr) 53 | print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile) 54 | os.system("ln -s log %s " % (verboselogfile)) 55 | else: 56 | print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile 57 | 58 | import random, numpy 59 | random.seed(miscglobals.RANDOMSEED) 60 | numpy.random.seed(miscglobals.RANDOMSEED) 61 | 62 | import vocabulary 63 | # logging.info("Reading vocab") 64 | # vocabulary.read() 65 | 66 | import model 67 | try: 68 | print >> sys.stderr, ("Trying to read training state for %s %s..." 
% (newkeystr, rundir)) 69 | (m, cnt, epoch, get_train_minibatch) = state.load(rundir, newkeystr) 70 | print >> sys.stderr, ("...success reading training state for %s %s" % (newkeystr, rundir)) 71 | print >> sys.stderr, logfile 72 | logging.basicConfig(filename=logfile, level=logging.DEBUG) 73 | # logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG) 74 | logging.info("CONTINUING FROM TRAINING STATE") 75 | except IOError: 76 | print >> sys.stderr, ("...FAILURE reading training state for %s %s" % (newkeystr, rundir)) 77 | print >> sys.stderr, ("INITIALIZING") 78 | 79 | m = model.Model() 80 | cnt = 0 81 | epoch = 1 82 | get_train_minibatch = examples.TrainingMinibatchStream() 83 | logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG) 84 | logging.info("INITIALIZING TRAINING STATE") 85 | 86 | logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals]))) 87 | 88 | #validate(0) 89 | diagnostics.diagnostics(cnt, m) 90 | # diagnostics.visualizedebug(cnt, m, rundir) 91 | while 1: 92 | logging.info("STARTING EPOCH #%d" % epoch) 93 | for ebatch in get_train_minibatch: 94 | cnt += len(ebatch) 95 | # print [wordmap.str(id) for id in e] 96 | 97 | noise_sequences, weights = corrupt.corrupt_examples(m, ebatch) 98 | m.train(ebatch, noise_sequences, weights) 99 | 100 | #validate(cnt) 101 | if cnt % (int(1000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 102 | logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) 103 | # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) 104 | if cnt % (int(100000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 105 | diagnostics.diagnostics(cnt, m) 106 | if os.path.exists(os.path.join(rundir, "BAD")): 107 | logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) 108 | sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) 109 | sys.exit(0) 110 | if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"]*1./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 111 | state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr) 112 | diagnostics.visualizedebug(cnt, m, rundir, newkeystr) 113 | # validate(cnt) 114 | get_train_minibatch = examples.TrainingMinibatchStream() 115 | epoch += 1 116 | -------------------------------------------------------------------------------- /scripts/monolingual/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Automatically load the wordmap, if available. 3 | """ 4 | 5 | import cPickle 6 | from common.file import myopen 7 | import sys 8 | 9 | def _wordmap_filename(name): 10 | import common.hyperparameters, common.options 11 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 12 | return HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] 13 | 14 | wordmap = None 15 | try: 16 | wordmap = cPickle.load(myopen(_wordmap_filename())) 17 | wordmap.str = wordmap.key 18 | except: pass 19 | 20 | def write(wordmap, name=""): 21 | """ 22 | Write the word ID map, passed as a parameter. 23 | """ 24 | print >> sys.stderr, "Writing word map to %s..." 
% _wordmap_filename(name) 25 | cPickle.dump(wordmap, myopen(_wordmap_filename(name), "w")) 26 | -------------------------------------------------------------------------------- /scripts/ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump n-gram counts over entire training data as YAML. 4 | """ 5 | 6 | import sys 7 | from common.stats import stats 8 | 9 | from collections import defaultdict 10 | cnt = defaultdict(int) 11 | if __name__ == "__main__": 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | HYPERPARAMETERS, options, args = common.options.reparse(HYPERPARAMETERS) 15 | import hyperparameters 16 | 17 | import vocabulary 18 | print >> sys.stderr, "Reading vocab" 19 | vocabulary.read() 20 | from vocabulary import wordmap 21 | 22 | import train 23 | for (i, e) in enumerate(train.get_train_example()): 24 | cnt[tuple([wordmap.str(t) for t in e])] += 1 25 | if i % 10000 == 0: 26 | print >> sys.stderr, "Read %d examples" % i 27 | print >> sys.stderr, stats() 28 | if i > 100000000: 29 | break 30 | cnt = [(t, cnt[t]) for t in cnt] 31 | import common.json 32 | common.json.dump(cnt, sys.stdout) 33 | -------------------------------------------------------------------------------- /scripts/preprocess/filter-sentences-by-lemma.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | For the N files given as command line arguments, filter the sentences 4 | to be only those in which the first file contains a word that lemmatizes 5 | to one of the W2W FOCUS LEMMAS. 6 | We write files that are prefixed by "filtered-" 7 | """ 8 | 9 | from common.str import percent 10 | import string 11 | import sys 12 | 13 | import common.hyperparameters, common.options 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 16 | 17 | if HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0: 18 | print >> sys.stderr, "There are no focus lemmas, hence we have nothing to filter" 19 | sys.exit(0) 20 | 21 | assert len(args) >= 1 22 | 23 | from common.stats import stats 24 | from lemmatizer import lemmatize 25 | 26 | print >> sys.stderr, "Loaded Morphological analyizer" 27 | print >> sys.stderr, stats() 28 | 29 | from itertools import izip 30 | import os.path, os 31 | 32 | filenames = args 33 | outfilenames = [os.path.join(os.path.dirname(f), "filtered-%s" % os.path.basename(f)) for f in filenames] 34 | 35 | print >> sys.stderr, "Reading from %s" % `filenames` 36 | print >> sys.stderr, "Writing to %s" % `outfilenames` 37 | 38 | for f in filenames: assert os.path.exists(f) 39 | for f in outfilenames: 40 | if os.path.exists(f): 41 | print >> sys.stderr, "Warning, going to overwrite %s" % f 42 | 43 | #print "Sleeping for 10 seconds..." 
44 | #import time 45 | #time.sleep(10) 46 | 47 | inf = [open(f) for f in filenames] 48 | outf = [open(f, "wt") for f in outfilenames] 49 | 50 | tot = 0 51 | cnt = 0 52 | for lines in izip(*inf): 53 | tot += 1 54 | keep = False 55 | for w in string.split(lines[0]): 56 | if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 57 | keep = True 58 | break 59 | if keep: 60 | cnt += 1 61 | for l, f in izip(lines, outf): 62 | f.write(l) 63 | if tot % 10000 == 0: 64 | print >> sys.stderr, "%s lines kept" % percent(cnt, tot) 65 | print >> sys.stderr, stats() 66 | -------------------------------------------------------------------------------- /scripts/preprocess/lemmatizer.py: -------------------------------------------------------------------------------- 1 | ../lemmatizer.py -------------------------------------------------------------------------------- /scripts/preprocess/lowercase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Lowercase <> 4 | # 5 | 6 | while(<>) { 7 | print lc $_; 8 | } 9 | -------------------------------------------------------------------------------- /scripts/preprocess/preprocess-validation.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Transform the validation data into a form that it can be used by the system. 4 | # 5 | 6 | $VDIR = "/u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation"; 7 | 8 | foreach $f (`find '$VDIR' -name \*.data`) { 9 | open(F, "<$f") or die $!; 10 | while () { 11 | $lemma = $1 if //; 12 | if (/(.*)<\/context>/) { 13 | $l = $1; 14 | $l =~ s/[^<>]*<\/head>/$lemma/g; 15 | open(O, "| ~/data/europarl-v5/europarl/tools/tokenizer.perl -l en | ~/data/europarl-v5/preprocessed/lowercase.perl | ~/utils/src/treetagger-3.2/l.py en > /tmp/removeme.txt"); 16 | print O $l; 17 | $l = `cat /tmp/removeme.txt`; 18 | print $l; 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scripts/preprocess/reverse-alignment.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # USAGE: 4 | # ./reverse-alignment.pl filename.align.l1-l2 [...] 5 | # 6 | # Create a file filename.align.l2-l1 with the alignments reversed. 7 | # 8 | 9 | die $! unless scalar @ARGV >= 1; 10 | 11 | foreach $f (@ARGV) { 12 | if ($f =~ m/(.*\.align\.)(..)-(..)$/) { 13 | $fnew = "$1$3-$2"; 14 | } else { 15 | die $!; 16 | } 17 | 18 | if (-e $fnew) { 19 | print "$fnew already exists"; 20 | next; 21 | } 22 | 23 | $cmd = "cat $f | perl -ne 's/(\\d+)-(\\d+)/\$2-\$1/g; print' > $fnew"; 24 | print "$cmd\n"; 25 | system("$cmd"); 26 | 27 | print "SANITY CHECK... (shouldn't see any output after this command)\n"; 28 | $cmd = "cat $fnew | perl -ne 's/(\\d+)-(\\d+)/\$2-\$1/g; print' | diff - $f"; 29 | print "$cmd\n"; 30 | system("$cmd"); 31 | } 32 | -------------------------------------------------------------------------------- /scripts/random-validation-examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Print out validation examples, disregarding vocabulary. 
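# Summary of the code below: read ORIGINAL VALIDATION_SENTENCES, slide a
# WINDOW_SIZE window over every sentence without any vocabulary filtering,
# shuffle the resulting examples, and write the first VALIDATION EXAMPLES of
# them, one per line, to VALIDATION_SENTENCES.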
4 | # 5 | # @TODO: Don't duplicate get_example code here and twice in train.py 6 | # 7 | 8 | from common.file import myopen 9 | import string 10 | import sys 11 | 12 | def get_example(f): 13 | import common.hyperparameters 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | for l in myopen(f): 16 | prevwords = [] 17 | for w in string.split(l): 18 | w = string.strip(w) 19 | prevwords.append(w) 20 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 21 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 22 | 23 | if __name__ == "__main__": 24 | import common.hyperparameters, common.options 25 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 26 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 27 | import hyperparameters 28 | 29 | print >> sys.stderr, "Reading examples from %s" % HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] 30 | ves = [e for e in get_example(HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"])] 31 | import random 32 | random.shuffle(ves) 33 | print >> sys.stderr, "Reading %d examples to %s" % (HYPERPARAMETERS["VALIDATION EXAMPLES"], HYPERPARAMETERS["VALIDATION_SENTENCES"]) 34 | o = myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"], "w") 35 | for e in ves[:HYPERPARAMETERS["VALIDATION EXAMPLES"]]: 36 | o.write(string.join(e) + "\n") 37 | -------------------------------------------------------------------------------- /scripts/rundir.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run directory 3 | """ 4 | 5 | import common.hyperparameters, common.options, common.dump 6 | 7 | _rundir = None 8 | def rundir(): 9 | global _rundir 10 | if _rundir is None: 11 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 12 | _rundir = common.dump.create_canonical_directory(HYPERPARAMETERS) 13 | return _rundir 14 | -------------------------------------------------------------------------------- /scripts/w2w/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/w2w/__init__.py -------------------------------------------------------------------------------- /scripts/w2w/build-example-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Extract all training examples, and cache them. 4 | """ 5 | 6 | if __name__ == "__main__": 7 | import common.hyperparameters, common.options 8 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 9 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 10 | import hyperparameters 11 | 12 | import logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | import w2w.examples 16 | w2w.examples.all_training_examples_cached() 17 | -------------------------------------------------------------------------------- /scripts/w2w/build-initial-embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Given embeddings in one language, initialize embeddings in all languages 4 | using these monolingual embeddings. We do this as a weighted average 5 | of the translations of the target word in the embedding language. 6 | (However, we only do the weighted average over words that have 7 | embeddings. 
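In symbols, for a word w outside the embedding language the initialization is
roughly
    embedding[w] = sum over translations t of (count(w -> t) / total) * source_embedding[t],
where the sum runs only over aligned translations t that already have an
embedding.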
By comparison, we could do the weighted average and treat 8 | words without embeddings as *UNKNOWN* in the embedding language, and 9 | include these embeddings. But we don't.) 10 | """ 11 | 12 | def visualize(embeddings, idxs, name, PERPLEXITY=30): 13 | idxs = [w % embeddings.shape[0] for w in idxs] 14 | titles = [wordform(w) for w in idxs] 15 | import os.path 16 | filename = HYPERPARAMETERS["INITIAL_EMBEDDINGS"] + ".visualize-%s.png" % name 17 | try: 18 | from textSNE.calc_tsne import tsne 19 | # from textSNE.tsne import tsne 20 | out = tsne(embeddings[idxs], perplexity=PERPLEXITY) 21 | from textSNE.render import render 22 | render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename) 23 | except IOError: 24 | logging.info("ERROR visualizing", filename, ". Continuing...") 25 | 26 | 27 | if __name__ == "__main__": 28 | import common.hyperparameters, common.options 29 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 30 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 31 | import hyperparameters 32 | 33 | import sys 34 | from common.stats import stats 35 | from common.str import percent 36 | import common.file 37 | import numpy 38 | import string 39 | import copy 40 | import cPickle 41 | 42 | import logging 43 | logging.basicConfig(level=logging.DEBUG) 44 | 45 | from w2w.vocabulary import wordmap, language, wordform 46 | from w2w.targetvocabulary import targetmap 47 | 48 | # Read in the embeddings 49 | print >> sys.stderr, "Reading embeddings from %s..." % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"] 50 | print >> sys.stderr, stats() 51 | original_embeddings = {} 52 | tot = 0 53 | for l in common.file.myopen(HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]): 54 | vals = string.split(l) 55 | word = vals[0] 56 | if HYPERPARAMETERS["W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION"] and word != "*UNKNOWN*": 57 | if (word[0] == '*' and word[-1] == '*' and len(word) > 1): 58 | print >> sys.stderr, "WEIRD WORD: %s" % word 59 | word = string.lower(word) 60 | assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"] 61 | tot += 1 62 | if tot % 10000 == 0: 63 | print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]) 64 | if word in original_embeddings: 65 | # print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0]) 66 | continue 67 | else: 68 | original_embeddings[word] = numpy.array([float(v) for v in vals[1:]]) 69 | print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"] 70 | print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot) 71 | print >> sys.stderr, stats() 72 | 73 | reversemap = targetmap(name="reverse") 74 | 75 | embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])) 76 | assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]) 77 | 78 | ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"] 79 | for w in range(wordmap().len): 80 | embedding = None 81 | # If this word is in a different language than the embeddings. 82 | if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]: 83 | if w not in reversemap: 84 | print >> sys.stderr, "Word %s is not even in target map! 
Using *UNKNOWN*" % `wordmap().str(w)` 85 | embedding = original_embeddings["*UNKNOWN*"] 86 | elif ELANG not in reversemap[w]: 87 | print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys()) 88 | embedding = original_embeddings["*UNKNOWN*"] 89 | else: 90 | # Mix the target word embedding over the weighted translation into the source language 91 | 92 | mixcnt = {} 93 | for w2 in reversemap[w][ELANG]: 94 | if language(w2) is None: 95 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 96 | continue 97 | assert language(w2) == ELANG 98 | if wordform(w2) not in original_embeddings: 99 | print >> sys.stderr, "%s is NOT mixed by %s %d (no embedding)" % (wordmap().str(w), wordmap().str(w2), reversemap[w][ELANG][w2]) 100 | continue 101 | mixcnt[w2] = reversemap[w][ELANG][w2] 102 | 103 | tot = 0 104 | for w2 in mixcnt: tot += mixcnt[w2] 105 | 106 | if tot == 0: 107 | print >> sys.stderr, "Unable to mix ANY translations for %s, using *UNKNOWN*" % `wordmap().str(w)` 108 | embedding = original_embeddings["*UNKNOWN*"] 109 | else: 110 | embedding = numpy.zeros((HYPERPARAMETERS["EMBEDDING_SIZE"])) 111 | for w2 in mixcnt: 112 | embedding += 1. * mixcnt[w2] / tot * (original_embeddings[wordform(w2)]) 113 | # print >> sys.stderr, "%s is mixed %s by %s" % (wordmap().str(w), percent(mixcnt[w2], tot), wordmap().str(w2)) 114 | else: 115 | if wordform(w) not in original_embeddings: 116 | print >> sys.stderr, "Word %s has no embedding, using *UNKNOWN*" % `wordmap().str(w)` 117 | embedding = original_embeddings["*UNKNOWN*"] 118 | else: 119 | embedding = original_embeddings[wordform(w)] 120 | embeddings[w] = copy.copy(embedding) 121 | 122 | # print wordform(w), language(w), 123 | # for v in embeddings[w]: 124 | # print v, 125 | # print 126 | 127 | print >> sys.stderr, "Dumping initial embeddings to %s" % HYPERPARAMETERS["INITIAL_EMBEDDINGS"] 128 | cPickle.dump(embeddings, common.file.myopen(HYPERPARAMETERS["INITIAL_EMBEDDINGS"], "w")) 129 | 130 | import random 131 | WORDCNT = 500 132 | idxs = range(wordmap().len) 133 | random.shuffle(idxs) 134 | idxs = idxs[:WORDCNT] 135 | 136 | visualize(embeddings, idxs, "randomized") 137 | visualize(embeddings, range(WORDCNT), "mostcommon") 138 | visualize(embeddings, range(-1, -WORDCNT*50, -50), "leastcommon") 139 | visualize(embeddings, range(wordmap().len/2-WORDCNT*20/2,wordmap().len/2+WORDCNT*20/2, 20), "midcommon") 140 | -------------------------------------------------------------------------------- /scripts/w2w/build-target-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Read in the w2w corpora (bi + monolingual), and build the translation 4 | vocabulary (for each source word, what target words it can translate to). 5 | Note: Each corpus is weighted in proportion to its length. (i.e. all 6 | words are equally weighted.) 
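The result is a nested map of the form
    cnt[source_word_id][target_language][target_word_id] -> alignment count,
written with w2w.targetvocabulary.write(), plus a second map written under the
name "reverse" with the source and target roles swapped.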
7 | """ 8 | 9 | import sys 10 | 11 | if __name__ == "__main__": 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 15 | import hyperparameters 16 | 17 | import logging 18 | logging.basicConfig(level=logging.DEBUG) 19 | 20 | import w2w.corpora 21 | from w2w.vocabulary import wordmap, language, wordform 22 | from collections import defaultdict 23 | from common.mydict import sort as dictsort 24 | 25 | cnt = {} 26 | reversecnt = {} 27 | for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): 28 | for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 29 | for i1, i2 in links: 30 | if len(ws1) <= i1 or len(ws2) <= i2: 31 | print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (i1,i2, len(ws1), len(ws2)) 32 | print >> sys.stderr, [wordform(w) for w in ws1] 33 | print >> sys.stderr, [wordform(w) for w in ws2] 34 | print >> sys.stderr, links 35 | w1 = ws1[i1] 36 | w2 = ws2[i2] 37 | # print wordmap.str(w1)[1], wordmap.str(w2)[1] 38 | 39 | l2new = language(w2) 40 | 41 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 42 | # Skip translations to unknown words 43 | if wordform(w2) == "*UNKNOWN*": continue 44 | 45 | assert l2new == l2 46 | 47 | 48 | # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas. 49 | # # If we are filtering examples by lemma 50 | # if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): 51 | # assert language(w1) == "en" 52 | # from lemmatizer import lemmatize 53 | # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 54 | ## logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1)))) 55 | # continue 56 | 57 | if w1 not in cnt: cnt[w1] = {} 58 | if l2 not in cnt[w1]: cnt[w1][l2] = defaultdict(int) 59 | cnt[w1][l2][w2] += 1 60 | 61 | if w2 not in reversecnt: reversecnt[w2] = {} 62 | if l1 not in reversecnt[w2]: reversecnt[w2][l1] = defaultdict(int) 63 | reversecnt[w2][l1][w1] += 1 64 | 65 | # for w1 in cnt: 66 | # for l2 in cnt[w1]: 67 | # print wordmap().str(w1), l2, [(n, wordmap().str(w2)) for n, w2 in dictsort(cnt[w1][l2])] 68 | 69 | # words = {} 70 | # for (l, w) in wordfreq: 71 | # if l not in words: words[l] = [] 72 | # if wordfreq[(l, w)] >= HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]: 73 | # words[l].append(w) 74 | 75 | import w2w.targetvocabulary 76 | w2w.targetvocabulary.write(cnt) 77 | w2w.targetvocabulary.write(reversecnt, name="reverse") 78 | -------------------------------------------------------------------------------- /scripts/w2w/build-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Read in the w2w corpora (bi + monolingual), and build the vocabulary as 4 | all words per language that occur at least HYPERPARAMETERS["W2W MINIMUM 5 | WORD FREQUENCY"] times. 6 | Each corpus is weighted in proportion to its length. (i.e. all words are equally weighted.) 7 | """ 8 | 9 | import sys 10 | from common.stats import stats 11 | 12 | def readwords(filename): 13 | print >> sys.stderr, "Processing %s" % filename 14 | i = 0 15 | for line in open(filename): 16 | i += 1 17 | if i % 100000 == 0: 18 | print >> sys.stderr, "Read line %d of %s..." 
% (i, filename) 19 | print >> sys.stderr, stats() 20 | for w in string.split(line): 21 | yield w 22 | 23 | if __name__ == "__main__": 24 | import common.hyperparameters, common.options 25 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 26 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 27 | import hyperparameters 28 | 29 | import logging 30 | logging.basicConfig(level=logging.DEBUG) 31 | 32 | import w2w.corpora 33 | import string 34 | 35 | from common.mydict import sort as dictsort 36 | 37 | from collections import defaultdict 38 | wordfreq = defaultdict(int) 39 | for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): 40 | for w in readwords(f1): wordfreq[(l1,w)] += 1 41 | for w in readwords(f2): wordfreq[(l2,w)] += 1 42 | 43 | for l, f in w2w.corpora.monocorpora_filenames(): 44 | assert 0 45 | 46 | for (l, w) in wordfreq.keys(): 47 | if wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]: 48 | del wordfreq[(l, w)] 49 | if w == "*UNKNOWN*": 50 | del wordfreq[(l, w)] 51 | 52 | import w2w.vocabulary 53 | import common.idmap 54 | 55 | wordfreqkeys = [key for cnt, key in dictsort(wordfreq)] 56 | 57 | # for k in wordfreq.keys(): 58 | # print k 59 | v = common.idmap.IDmap([(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys, allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"], unknown_key=(None, "*UNKNOWN*")) 60 | w2w.vocabulary.write(v) 61 | -------------------------------------------------------------------------------- /scripts/w2w/corpora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for reading corpora. 3 | """ 4 | 5 | from os.path import join, isdir, exists 6 | import sys 7 | import os 8 | import re 9 | import itertools 10 | import string 11 | import logging 12 | 13 | from common.stats import stats 14 | from common.str import percent 15 | 16 | def bicorpora_filenames(): 17 | """ 18 | For each bicorpora language pair in "W2W BICORPORA", traverse that 19 | language pair's subdirectory of DATA_DIR. Find all corpora files in 20 | that directory. 21 | Generator yields: tuples of type (l1, l2, f1, f2, falign), where l1 = 22 | source language, l2 = target language, f1 = source filename, f2 = 23 | target filename, falign = alignment file. 
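    For illustration (these filenames are hypothetical), the pair ("en", "fr")
    with a corpus file foo.en under DATA_DIR/en-fr would yield
        ("en", "fr", ".../en-fr/foo.en", ".../en-fr/foo.fr", ".../en-fr/foo.align.en-fr"),
    assuming the .fr file and the alignment file exist alongside it.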
24 | """ 25 | import common.hyperparameters, hyperparameters 26 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 27 | 28 | for (l1, l2) in HYPERPARAMETERS["W2W BICORPORA"]: 29 | d = join(HYPERPARAMETERS["DATA_DIR"], "%s-%s" % (l1, l2)) 30 | assert isdir(d) 31 | l1re = re.compile("%s$" % l1) 32 | alignre = re.compile("align.*-%s$" % l1) 33 | for f1 in os.listdir(d): 34 | f1 = join(d, f1) 35 | if not l1re.search(f1) or alignre.search(f1): continue 36 | f2 = l1re.sub(l2, f1) 37 | assert exists(f2) 38 | falign = l1re.sub("align.%s-%s" % (l1, l2), f1) 39 | assert exists(falign) 40 | yield l1, l2, f1, f2, falign 41 | 42 | def monocorpora_filenames(): 43 | import common.hyperparameters, hyperparameters 44 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 45 | # Not yet implemented 46 | assert len(HYPERPARAMETERS["W2W MONOCORPORA"]) == 0 47 | return [] 48 | 49 | def bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 50 | """ 51 | Given languages l1 and l2 and their bicorpus filenames f1, f2, and falign, 52 | yield tuples of the former (ws1, ws2, links), 53 | where ws1 are the word ids in the sentence from f1, 54 | where ws2 are the word ids in the sentence from f2, 55 | and links is a list of (i1, i2) word indexes that are linked. 56 | """ 57 | from w2w.vocabulary import wordmap 58 | 59 | i = 0 60 | emptycnt = 0 61 | logging.info("Reading %s,%s sentences and alignments from %s, %s, %s" % (l1, l2, f1, f2, falign)) 62 | fil1, fil2, filalign = open(f1), open(f2), open(falign) 63 | for (s1, s2, salign) in itertools.izip(fil1, fil2, filalign): 64 | # print s1, s2, salign, 65 | i += 1 66 | if i % 100000 == 0: 67 | logging.info("\tRead line %d of %s, %s, %s..." % (i, f1, f2, falign)) 68 | logging.info("\tEmpty sentences are %s..." % (percent(emptycnt, i))) 69 | logging.info("\t%s" % stats()) 70 | 71 | ws1 = [(l1, w1) for w1 in string.split(s1)] 72 | ws2 = [(l2, w2) for w2 in string.split(s2)] 73 | ws1 = [wordmap().id(tok) for tok in ws1] 74 | ws2 = [wordmap().id(tok) for tok in ws2] 75 | 76 | if len(ws1) == 0 or len(ws2) == 0: 77 | emptycnt += 1 78 | continue 79 | 80 | # print ws2, [w2w.vocabulary.wordmap.str(w2) for w2 in ws2] 81 | links = [string.split(link, sep="-") for link in string.split(salign)] 82 | links = [(int(i1), int(i2)) for i1, i2 in links] 83 | 84 | yield ws1, ws2, links 85 | 86 | # Make sure all iterators are exhausted 87 | alldone = 0 88 | try: value = fil1.next() 89 | except StopIteration: alldone += 1 90 | try: value = fil2.next() 91 | except StopIteration: alldone += 1 92 | try: value = filalign.next() 93 | except StopIteration: alldone += 1 94 | assert alldone == 3 95 | 96 | logging.info("DONE. Read line %d of %s, %s, %s..." % (i, f1, f2, falign)) 97 | logging.info("Empty sentences are %s..." % (percent(emptycnt, i))) 98 | logging.info(stats()) 99 | 100 | if __name__ == "__main__": 101 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 102 | print l1, l2, f1, f2, falign 103 | print monocorpora_filenames() 104 | -------------------------------------------------------------------------------- /scripts/w2w/dump-example-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w target vocabulary. 
4 | """ 5 | 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | import common.hyperparameters, common.options 10 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 11 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 12 | 13 | import logging 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | import w2w.examples 17 | for e in w2w.examples.get_all_training_examples_cached(): 18 | print e 19 | -------------------------------------------------------------------------------- /scripts/w2w/dump-target-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w target vocabulary. 4 | """ 5 | 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | import common.hyperparameters, common.options 10 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 11 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 12 | import hyperparameters 13 | 14 | from common.mydict import sort as dictsort 15 | from common.str import percent 16 | 17 | from vocabulary import wordmap, wordform, language 18 | from targetvocabulary import targetmap 19 | 20 | for w1 in wordmap().all: 21 | w1 = wordmap().id(w1) 22 | # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD 23 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 24 | if language(w1) is None: 25 | print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` 26 | continue 27 | if w1 not in targetmap(): 28 | print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` 29 | continue 30 | for l2 in targetmap()[w1]: 31 | totcnt = 0 32 | for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt 33 | print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])] 34 | 35 | print >> sys.stderr, "REVERSE MAP NOW" 36 | 37 | for w1 in wordmap().all: 38 | w1 = wordmap().id(w1) 39 | # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD 40 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 41 | if language(w1) is None: 42 | print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` 43 | continue 44 | if w1 not in targetmap(name="reverse"): 45 | print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` 46 | continue 47 | for l2 in targetmap(name="reverse")[w1]: 48 | totcnt = 0 49 | for cnt, w2 in dictsort(targetmap(name="reverse")[w1][l2]): totcnt += cnt 50 | print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap(name="reverse")[w1][l2])] 51 | -------------------------------------------------------------------------------- /scripts/w2w/dump-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w vocaulary. 4 | """ 5 | 6 | if __name__ == "__main__": 7 | import common.hyperparameters, common.options 8 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 9 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 10 | import hyperparameters 11 | 12 | from vocabulary import wordmap 13 | for w in wordmap().all: 14 | print w 15 | -------------------------------------------------------------------------------- /scripts/w2w/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streaming examples. 
3 | """ 4 | 5 | from w2w.corpora import bicorpora_filenames, monocorpora_filenames, bicorpus_sentences_and_alignments 6 | from common.file import myopen 7 | from common.stats import stats 8 | 9 | from w2w.targetvocabulary import targetmap 10 | from w2w.vocabulary import wordmap, language, wordform 11 | import string 12 | import logging 13 | 14 | import random 15 | from rundir import rundir 16 | import os.path 17 | import cPickle 18 | 19 | import murmur 20 | 21 | class MonolingualExample: 22 | def __init__(self, l1, l1seq, w1): 23 | """ 24 | l1 = source language 25 | l1seq = sequence of word IDs in source language 26 | w1 = focus word ID in source language 27 | """ 28 | self.l1 = l1 29 | self.l1seq = l1seq 30 | self.w1 = w1 31 | 32 | if wordform(self.w1) != "*UNKNOWN*": 33 | assert self.l1 == language(self.w1) 34 | 35 | def __str__(self): 36 | return "%s" % `(self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq])` 37 | 38 | class BilingualExample(MonolingualExample): 39 | def __init__(self, l1, l1seq, w1, w2): 40 | """ 41 | l1 = source language 42 | l1seq = sequence of word IDs in source language 43 | w1 = focus word ID in source language 44 | w2 = focus word ID in target language 45 | """ 46 | MonolingualExample.__init__(self, l1, l1seq, w1) 47 | self.w2 = w2 48 | 49 | @property 50 | def l2(self): 51 | return language(self.w2) 52 | 53 | @property 54 | def corrupt(self): 55 | """ 56 | Return a (notw2, weight), a corrupt target word and its weight. 57 | Note: This will return a different random value every call. 58 | """ 59 | from hyperparameters import HYPERPARAMETERS 60 | import random 61 | possible_targets = targetmap()[self.w1][self.l2] 62 | assert len(possible_targets) > 1 63 | assert self.w2 in possible_targets 64 | notw2 = self.w2 65 | cnt = 0 66 | while self.w2 == notw2: 67 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 68 | notw2 = random.choice(possible_targets) 69 | pr = 1./len(possible_targets) 70 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 71 | assert 0 72 | # import noise 73 | # from common.myrandom import weighted_sample 74 | # e[-1], pr = weighted_sample(noise.indexed_weights()) 75 | ## from vocabulary import wordmap 76 | ## print wordmap.str(e[-1]), pr 77 | else: 78 | assert 0 79 | cnt += 1 80 | # Backoff to 0gram smoothing if we fail 10 times to get noise. 81 | if cnt > 10: notw2 = random.choice(possible_targets) 82 | 83 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 84 | weight = 1. 85 | else: 86 | weight = 1./pr 87 | return notw2, weight 88 | 89 | def __str__(self): 90 | return "%s" % `(wordmap().str(self.w2), self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq])` 91 | 92 | def get_training_biexample(l1, l2, f1, f2, falign): 93 | """ 94 | Generator of bilingual training examples from this bicorpus. 
95 | """ 96 | import common.hyperparameters 97 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 98 | WINDOW = HYPERPARAMETERS["WINDOW_SIZE"] 99 | 100 | for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 101 | for i1, i2 in links: 102 | w1 = ws1[i1] 103 | w2 = ws2[i2] 104 | 105 | l2new = language(w2) 106 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 107 | # Skip translations to unknown words 108 | if wordform(w2) == "*UNKNOWN*": continue 109 | assert l2new == l2 110 | 111 | # Skip translations from unknown words 112 | if wordform(w1) == "*UNKNOWN*": continue 113 | 114 | # If we are filtering examples by lemma 115 | if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): 116 | # print wordmap().str(w1), wordmap().str(w2) 117 | assert language(w1) == "en" 118 | # from lemmatizer import lemmatize 119 | # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 120 | # logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1)))) 121 | if wordform(w1) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 122 | logging.debug("Focus word %s not in our list of focus lemmas" % (`wordmap().str(w1)`)) 123 | continue 124 | 125 | if w1 not in targetmap(): 126 | logging.warning("No translations for word %s, skipping" % (`wordmap().str(w1)`)) 127 | continue 128 | 129 | if l2new not in targetmap()[w1]: 130 | logging.warning("Word %s has no translations for language %s, skipping" % (`wordmap().str(w1)`, l2new)) 131 | continue 132 | 133 | if w2 not in targetmap()[w1][l2new]: 134 | logging.error("Word %s cannot translate to word %s, skipping" % (`wordmap().str(w1)`, `wordmap().str(w2)`)) 135 | continue 136 | 137 | if len(targetmap()[w1][l2new]) == 1: 138 | logging.debug("Word %s has only one translation in language %s, skipping" % (`wordmap().str(w1)`, l2new)) 139 | continue 140 | 141 | # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary. 142 | min = i1 - (WINDOW-1)/2 143 | max = i1 + (WINDOW-1)/2 144 | lpad = 0 145 | rpad = 0 146 | if min < 0: 147 | lpad = -min 148 | min = 0 149 | if max >= len(ws1): 150 | rpad = max - (len(ws1)-1) 151 | max = len(ws1)-1 152 | assert lpad + (max - min + 1) + rpad == WINDOW 153 | 154 | # print i1 - (WINDOW-1)/2, i1 + (WINDOW-1)/2 155 | # print "min=%d, max=%d, lpad=%d, rpad=%d" % (min, max, lpad, rpad) 156 | seq = [wordmap().id((None, "*LBOUNDARY*"))]*lpad + ws1[min:max+1] + [wordmap().id((None, "*RBOUNDARY*"))]*rpad 157 | # print [wordmap.str(w) for w in seq] 158 | assert len(seq) == WINDOW 159 | # print ws1[i1 - (WINDOW-1)/2:i1 + (WINDOW-1)/2] 160 | 161 | assert seq[(WINDOW-1)/2] == w1 162 | yield BilingualExample(l1, seq, w1, w2) 163 | 164 | def is_validation_example(e): 165 | import common.hyperparameters 166 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 167 | examples_per_validation = int(1/HYPERPARAMETERS["PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION"]) 168 | return murmur.string_hash(`e`) % examples_per_validation == 0 169 | 170 | def get_training_minibatch_online(): 171 | """ 172 | Warning: The approach has the weird property that if one language 173 | pair's corpus is way longer than others, it will be the only examples 174 | for a while after the other corpora are exhausted. 
175 | """ 176 | 177 | assert 0 # We need to filter validation examples 178 | 179 | import common.hyperparameters 180 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 181 | MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"] 182 | 183 | generators = [] 184 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 185 | # print l1, l2, f1, f2, falign 186 | generators.append(get_training_biexample(l1, l2, f1, f2, falign)) 187 | for l, f in monocorpora_filenames(): assert 0 188 | 189 | # Cycles over generators. 190 | idx = 0 191 | last_minibatch = None 192 | while 1: 193 | minibatch = [] 194 | for e in generators[idx]: 195 | minibatch.append(e) 196 | if len(minibatch) >= MINIBATCH_SIZE: 197 | break 198 | if len(minibatch) > 0: 199 | last_minibatch = idx 200 | yield minibatch 201 | elif last_minibatch == idx: 202 | # We haven't had any minibatch in the last cycle over the generators. 203 | # So we are done will all corpora. 204 | break 205 | 206 | # Go to the next corpus 207 | idx = (idx + 1) % len(generators) 208 | 209 | def training_examples_cache_filename(): 210 | import common.hyperparameters, hyperparameters 211 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 212 | return os.path.join(HYPERPARAMETERS["DATA_DIR"], "examples-cache.minfreq=%d.include_unknown=%s.window-%d.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"], HYPERPARAMETERS["WINDOW_SIZE"])) 213 | 214 | _all_examples = None 215 | def all_training_examples_cached(): 216 | global _all_examples 217 | if _all_examples is None: 218 | try: 219 | _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename())) 220 | assert len(_all_examples) == cnt 221 | logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename())) 222 | logging.info(stats()) 223 | except: 224 | logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename())) 225 | logging.info("Caching all training examples...") 226 | logging.info(stats()) 227 | _all_examples = [] 228 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 229 | for e in get_training_biexample(l1, l2, f1, f2, falign): 230 | _all_examples.append(e) 231 | if len(_all_examples) % 10000 == 0: 232 | logging.info("\tcurrently have read %d training examples" % len(_all_examples)) 233 | logging.info(stats()) 234 | random.shuffle(_all_examples) 235 | logging.info("...done caching all %d training examples" % len(_all_examples)) 236 | logging.info(stats()) 237 | 238 | cnt = len(_all_examples) 239 | cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1) 240 | assert len(_all_examples) == cnt 241 | logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename())) 242 | logging.info(stats()) 243 | assert _all_examples is not None 244 | return _all_examples 245 | 246 | def get_all_training_examples_cached(): 247 | for e in all_training_examples_cached(): 248 | if is_validation_example(e): continue 249 | yield e 250 | 251 | def get_all_validation_examples_cached(): 252 | for e in all_training_examples_cached(): 253 | if not is_validation_example(e): continue 254 | yield e 255 | 256 | def get_training_minibatch_cached(): 257 | import common.hyperparameters 258 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 259 | MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"] 260 | 261 | minibatch = [] 262 | for e in get_all_training_examples_cached(): 263 | 
minibatch.append(e) 264 | if len(minibatch) >= MINIBATCH_SIZE: 265 | yield minibatch 266 | minibatch = [] 267 | if len(minibatch) > 0: 268 | yield minibatch 269 | minibatch = [] 270 | 271 | if __name__ == "__main__": 272 | for minibatch in get_training_minibatch_cached(): 273 | # print len(minibatch) 274 | for e in minibatch: 275 | print e 276 | -------------------------------------------------------------------------------- /scripts/w2w/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save and load training state. 3 | @todo: Training state variables (cnt, epoch) should all be combined into one object. 4 | """ 5 | 6 | import logging 7 | import os.path 8 | import cPickle 9 | 10 | from common.stats import stats 11 | from common.file import myopen 12 | import common.json 13 | import sys 14 | 15 | _lastfilename = None 16 | def save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr): 17 | global _lastfilename 18 | 19 | filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr)) 20 | logging.info("Writing translation_model to %s..." % filename) 21 | logging.info(stats()) 22 | cPickle.dump(translation_model, myopen(filename, "wb"), protocol=-1) 23 | logging.info("...done writing translation_model to %s" % filename) 24 | logging.info(stats()) 25 | 26 | # if _lastfilename is not None: 27 | # logging.info("Removing old translation_model %s..." % _lastfilename) 28 | # try: 29 | # os.remove(_lastfilename) 30 | # logging.info("...removed %s" % _lastfilename) 31 | # except: 32 | # logging.info("Could NOT remove %s" % _lastfilename) 33 | _lastfilename = filename 34 | 35 | common.json.dumpfile((cnt, lastcnt, epoch, filename), os.path.join(rundir, "trainstate.json")) 36 | 37 | filename = os.path.join(rundir, "newkeystr.txt") 38 | myopen(filename, "wt").write(newkeystr) 39 | 40 | def load(rundir, newkeystr): 41 | """ 42 | Read the directory and load the translation_model, the training count, the training epoch, and the training state. 43 | """ 44 | global _lastfilename 45 | 46 | filename = os.path.join(rundir, "newkeystr.txt") 47 | assert newkeystr == myopen(filename).read() 48 | 49 | (cnt, lastcnt, epoch, filename) = common.json.loadfile(os.path.join(rundir, "trainstate.json")) 50 | 51 | # filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr)) 52 | print >> sys.stderr, ("Reading translation_model from %s..." % filename) 53 | print >> sys.stderr, (stats()) 54 | translation_model = cPickle.load(myopen(filename)) 55 | print >> sys.stderr, ("...done reading translation_model from %s" % filename) 56 | print >> sys.stderr, (stats()) 57 | _lastfilename = filename 58 | 59 | return (translation_model, cnt, lastcnt, epoch) 60 | -------------------------------------------------------------------------------- /scripts/w2w/targetvocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | targetmap[w1][l2][w2] = c means that source word ID w1 mapped to target 3 | language l2 and target word ID w2 with count c. 
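For example (the word IDs here are illustrative, not real vocabulary entries):

    targetmap()[1234]           # dict of target languages seen for source word 1234
    targetmap()[1234]["fr"]     # dict of {target word ID: count} for its French translations
    targetmap(name="reverse")   # a second map stored under the name "reverse"
                                # (see dump-target-vocabulary.py)

The map is built by w2w/build-target-vocabulary.py, read lazily by targetmap()
below, and written to disk by write().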
4 | """ 5 | 6 | import cPickle 7 | from common.file import myopen 8 | from common.stats import stats 9 | import sys 10 | from os.path import join 11 | 12 | def _targetmap_filename(name=""): 13 | import common.hyperparameters, common.options, hyperparameters 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | return join(HYPERPARAMETERS["DATA_DIR"], "%stargetmap.minfreq=%d.include_unknown=%s.pkl.gz" % (name, HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 16 | 17 | _targetmap = {} 18 | def targetmap(name=""): 19 | global _targetmap 20 | if name not in _targetmap: 21 | f = _targetmap_filename(name=name) 22 | print >> sys.stderr, "Reading target map from %s..." % f 23 | print >> sys.stderr, stats() 24 | _targetmap[name] = cPickle.load(myopen(f)) 25 | print >> sys.stderr, "...done reading target map from %s" % f 26 | print >> sys.stderr, stats() 27 | return _targetmap[name] 28 | 29 | def write(_targetmap_new, name=""): 30 | """ 31 | Write the word ID map, passed as a parameter. 32 | """ 33 | global _targetmap 34 | assert name not in _targetmap 35 | _targetmap[name] = _targetmap_new 36 | f = _targetmap_filename(name=name) 37 | print >> sys.stderr, "Writing target map to %s..." % f 38 | cPickle.dump(_targetmap[name], myopen(f, "w")) 39 | -------------------------------------------------------------------------------- /scripts/w2w/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import string 5 | import common.dump 6 | from common.file import myopen 7 | from common.stats import stats 8 | from common.str import percent 9 | 10 | import miscglobals 11 | import logging 12 | 13 | import w2w.examples 14 | import diagnostics 15 | import state 16 | 17 | import cPickle 18 | 19 | def validate(translation_model, cnt): 20 | import math 21 | # logranks = [] 22 | # logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt) 23 | # logging.info(stats()) 24 | i = 0 25 | tot = 0 26 | correct = 0 27 | for (i, ve) in enumerate(w2w.examples.get_all_validation_examples_cached()): 28 | correct_sequences, noise_sequences, weights = ebatch_to_sequences([ve]) 29 | source_language = ve.l1 30 | is_correct = translation_model[source_language].validate_errors(correct_sequences, noise_sequences) 31 | # print r 32 | for w in weights: assert w == 1.0 33 | 34 | tot += 1 35 | if is_correct: correct += 1 36 | 37 | if i % 1000 == 0: logging.info("\tvalidating %d examples done..." 
% i) 38 | # logging.info("Validation of model %s at cnt %d: validation err %s" % (translation_model[source_language].modelname, cnt, percent(correct, tot))) 39 | logging.info("VALIDATION of model at cnt %d: validation accuracy %s" % (cnt, percent(correct, tot))) 40 | ## logging.info([wordmap.str(id) for id in ve]) 41 | # logranks.append(math.log(m.validate(ve))) 42 | # if (i+1) % 10 == 0: 43 | # logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)))) 44 | # logging.info(stats()) 45 | # logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1)) 46 | # logging.info(stats()) 47 | ## print "FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1) 48 | ## print stats() 49 | 50 | def ebatch_to_sequences(ebatch): 51 | """ 52 | Convert example batch to sequences. 53 | """ 54 | correct_sequences = [] 55 | noise_sequences = [] 56 | weights = [] 57 | for e in ebatch: 58 | notw2, weight = e.corrupt 59 | correct_sequences.append(e.l1seq + [e.w2]) 60 | noise_sequences.append(e.l1seq + [notw2]) 61 | weights.append(weight) 62 | assert len(ebatch) == len(correct_sequences) 63 | assert len(ebatch) == len(noise_sequences) 64 | assert len(ebatch) == len(weights) 65 | return correct_sequences, noise_sequences, weights 66 | 67 | if __name__ == "__main__": 68 | import common.hyperparameters, common.options 69 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 70 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 71 | import hyperparameters 72 | 73 | from common import myyaml 74 | import sys 75 | print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals])) 76 | 77 | # We do not allow sophisticated training noise 78 | assert HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0 79 | 80 | from rundir import rundir 81 | rundir = rundir() 82 | 83 | import os.path, os 84 | logfile = os.path.join(rundir, "log") 85 | if newkeystr != "": 86 | verboselogfile = os.path.join(rundir, "log%s" % newkeystr) 87 | print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile) 88 | os.system("ln -s log %s " % (verboselogfile)) 89 | else: 90 | print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile 91 | 92 | import random, numpy 93 | random.seed(miscglobals.RANDOMSEED) 94 | numpy.random.seed(miscglobals.RANDOMSEED) 95 | 96 | # Random wait if we are a batch job 97 | import time 98 | if not HYPERPARAMETERS["console"]: 99 | wait = 100 * random.random() 100 | print >> sys.stderr, "Waiting %f seconds..." % wait 101 | time.sleep(wait) 102 | 103 | # import vocabulary 104 | ## logging.info("Reading vocab") 105 | ## vocabulary.read() 106 | # 107 | import model 108 | try: 109 | print >> sys.stderr, ("Trying to read training state for %s %s..." 
% (newkeystr, rundir)) 110 | (translation_model, cnt, lastcnt, epoch) = state.load(rundir, newkeystr) 111 | print >> sys.stderr, ("...success reading training state for %s %s" % (newkeystr, rundir)) 112 | print >> sys.stderr, logfile 113 | print >> sys.stderr, "CONTINUING FROM TRAINING STATE" 114 | except IOError: 115 | print >> sys.stderr, ("...FAILURE reading training state for %s %s" % (newkeystr, rundir)) 116 | print >> sys.stderr, ("INITIALIZING") 117 | 118 | translation_model = {} 119 | print >> sys.stderr, "Loading initial embeddings from %s" % HYPERPARAMETERS["INITIAL_EMBEDDINGS"] 120 | # TODO: If we want more than one model, we should SHARE the embeddings parameters 121 | embeddings = cPickle.load(common.file.myopen(HYPERPARAMETERS["INITIAL_EMBEDDINGS"])) 122 | 123 | print >> sys.stderr, "INITIALIZING TRAINING STATE" 124 | 125 | all_l1 = {} 126 | for l1, l2 in HYPERPARAMETERS["W2W BICORPORA"]: all_l1[l1] = True 127 | for l1 in all_l1: 128 | translation_model[l1] = model.Model(modelname="translate-from-%s" % l1, window_size=HYPERPARAMETERS["WINDOW_SIZE"]+1, initial_embeddings=embeddings) 129 | # TODO: I'd like to free this memory, but translation_model doesn't make a copy. 130 | # embeddings = None 131 | cnt = 0 132 | lastcnt = 0 133 | epoch = 1 134 | # get_train_minibatch = examples.TrainingMinibatchStream() 135 | 136 | if HYPERPARAMETERS["console"]: 137 | print >> sys.stderr, "Console mode (not batch mode)." 138 | logging.basicConfig(level=logging.INFO) 139 | else: 140 | print >> sys.stderr, "YOU ARE RUNNING IN BATCH, NOT CONSOLE MODE. THIS WILL BE THE LAST MESSAGE TO STDERR." 141 | logging.basicConfig(filename=logfile, filemode="w", level=logging.INFO) 142 | 143 | assert len(translation_model) == 1 144 | for l1 in HYPERPARAMETERS["W2W MONOCORPORA"]: 145 | assert 0 146 | 147 | # get_train_minibatch = w2w.examples.get_training_minibatch_online() 148 | get_train_minibatch = w2w.examples.get_training_minibatch_cached() 149 | 150 | logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals]))) 151 | 152 | validate(translation_model, 0) 153 | # diagnostics.diagnostics(cnt, m) 154 | ## diagnostics.visualizedebug(cnt, m, rundir) 155 | # state.save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr) 156 | while 1: 157 | logging.info("STARTING EPOCH #%d" % epoch) 158 | for ebatch in get_train_minibatch: 159 | lastcnt = cnt 160 | cnt += len(ebatch) 161 | # # print [wordmap.str(id) for id in e] 162 | 163 | source_language = ebatch[0].l1 164 | for e in ebatch: 165 | # Make sure all examples have the same source language 166 | assert e.l1 == source_language 167 | 168 | # The following is code for training on bilingual examples. 169 | # TODO: Monolingual examples? 
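        # ebatch_to_sequences() turns each BilingualExample into a correct
        # sequence (the source window followed by the true target word) and a
        # noise sequence (the same window followed by a corrupted target word
        # drawn from the example's translation candidates), plus an example
        # weight. The model for this source language is then trained to
        # prefer the correct sequence over the noise sequence.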
170 | 171 | correct_sequences, noise_sequences, weights = ebatch_to_sequences(ebatch) 172 | translation_model[source_language].train(correct_sequences, noise_sequences, weights) 173 | 174 | #validate(translation_model, cnt) 175 | if int(cnt/1000) > int(lastcnt/1000): 176 | logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) 177 | # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) 178 | if int(cnt/10000) > int(lastcnt/10000): 179 | for l1 in translation_model: 180 | diagnostics.diagnostics(cnt, translation_model[l1]) 181 | if os.path.exists(os.path.join(rundir, "BAD")): 182 | logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) 183 | sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) 184 | sys.exit(0) 185 | if int(cnt/HYPERPARAMETERS["VALIDATE_EVERY"]) > int(lastcnt/HYPERPARAMETERS["VALIDATE_EVERY"]): 186 | validate(translation_model, cnt) 187 | pass 188 | # for l1 in translation_model: 189 | # diagnostics.visualizedebug(cnt, translation_model[l1], rundir, newkeystr) 190 | 191 | validate(translation_model, cnt) 192 | # get_train_minibatch = w2w.examples.get_training_minibatch_online() 193 | get_train_minibatch = w2w.examples.get_training_minibatch_cached() 194 | epoch += 1 195 | 196 | state.save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr) 197 | # validate(cnt) 198 | -------------------------------------------------------------------------------- /scripts/w2w/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | wordmap is a map from id to (language, wordform) 3 | """ 4 | 5 | import cPickle 6 | from common.file import myopen 7 | import sys 8 | from os.path import join 9 | 10 | def _wordmap_filename(): 11 | import common.hyperparameters, common.options, hyperparameters 12 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 13 | return join(HYPERPARAMETERS["DATA_DIR"], "idmap.minfreq=%d.include_unknown=%s.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 14 | 15 | _wordmap = None 16 | def wordmap(): 17 | global _wordmap 18 | if _wordmap is None: 19 | _wordmap = cPickle.load(myopen(_wordmap_filename())) 20 | _wordmap.str = _wordmap.key 21 | return _wordmap 22 | 23 | def language(id): 24 | """ 25 | Get the language of this word id. 26 | """ 27 | return wordmap().str(id)[0] 28 | 29 | def wordform(id): 30 | """ 31 | Get the word form of this word id. 32 | """ 33 | return wordmap().str(id)[1] 34 | 35 | def write(_wordmap_new): 36 | """ 37 | Write the word ID map, passed as a parameter. 38 | """ 39 | global _wordmap 40 | assert _wordmap is None 41 | _wordmap = _wordmap_new 42 | print >> sys.stderr, "Writing word map with %d words to %s..." 
% (_wordmap.len, _wordmap_filename()) 43 | cPickle.dump(_wordmap, myopen(_wordmap_filename(), "w")) 44 | -------------------------------------------------------------------------------- /scripts/weight-histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Plot a histogram of the absolute values of model embeddings 4 | # 5 | # 6 | 7 | PERCENT = 0.01 8 | import random 9 | 10 | import sys 11 | import matplotlib 12 | matplotlib.use( 'Agg' ) # Use non-GUI backend 13 | import pylab 14 | 15 | from optparse import OptionParser 16 | parser = OptionParser() 17 | parser.add_option("-m", "--modelfile", dest="modelfile") 18 | (options, args) = parser.parse_args() 19 | assert options.modelfile is not None 20 | 21 | histfile = "%s.weight-histogram.png" % options.modelfile 22 | 23 | import cPickle 24 | m = cPickle.load(open(options.modelfile)) 25 | #print m.parameters.embeddings.shape 26 | 27 | values = [] 28 | 29 | from vocabulary import wordmap 30 | for i in range(m.parameters.vocab_size): 31 | for v in m.parameters.embeddings[i]: 32 | if random.random() < PERCENT: 33 | values.append(abs(v)) 34 | values.sort() 35 | 36 | print >> sys.stderr, "%d values read (at %f percent) of %d embeddings, %d/%f/%d = %f" % (len(values), PERCENT, m.parameters.vocab_size, len(values), PERCENT, m.parameters.vocab_size, len(values)/PERCENT/m.parameters.vocab_size) 37 | 38 | x = [] 39 | for i, v in enumerate(values): 40 | x.append(1./(len(values)-1) * i) 41 | 42 | print >> sys.stderr, 'Writing weight histogram to %s' % histfile 43 | 44 | pylab.ylim(ymin=0, ymax=1.) 45 | pylab.plot(x, values) 46 | pylab.ylim(ymin=0, ymax=1.) 47 | pylab.savefig(histfile) 48 | pylab.show() 49 | --------------------------------------------------------------------------------