├── .gitignore ├── .hgignore ├── README.txt ├── data-sample-bilingual ├── en-de │ ├── training.align.en-de │ ├── training.de │ └── training.en ├── en-es │ ├── training.align.en-es │ ├── training.align.es-en │ ├── training.en │ └── training.es ├── en-fr │ ├── training2.align.en-fr │ ├── training2.align.fr-en │ ├── training2.en │ └── training2.fr ├── en-it │ ├── training.align.en-it │ ├── training.en │ └── training.it └── en-nl │ ├── training.align.en-nl │ ├── training.en │ └── training.nl ├── data.txt ├── data ├── README.txt ├── allwords.gz ├── allwords.vocabulary-200.txt ├── allwords.vocabulary.txt.gz └── batch └── scripts ├── LOGS ├── LOGS.NOBACKUP └── .keep ├── batch ├── batch-build-examples ├── batch-short ├── batch-w2w ├── batch-w2w2 ├── batch_ngrams ├── diagnostics.py ├── dump-embeddings.py ├── eda ├── badrun.py ├── batch-make-curves.sh ├── make-graphs-trainerror.pl ├── old │ ├── batch-make-curves.sh │ ├── make-graphs-trainloss.pl │ └── make-graphs-validationlogrankloss.pl └── remove-nonfinal-models.pl ├── hyperparameters.language-model.full.yaml ├── hyperparameters.language-model.sample.yaml ├── hyperparameters.py ├── lemmatizer.py ├── miscglobals.py ├── model ├── __init__.py ├── graphcw.py ├── graphlbl.py ├── model.py └── parameters.py ├── monolingual ├── __init__.py ├── build-vocabulary.py ├── corrupt.py ├── examples.py ├── noise.py ├── state.py ├── train.py └── vocabulary.py ├── ngrams.py ├── preprocess ├── filter-sentences-by-lemma.py ├── lemmatizer.py ├── lowercase.pl ├── preprocess-validation.pl └── reverse-alignment.pl ├── random-validation-examples.py ├── rundir.py ├── w2w ├── __init__.py ├── build-example-cache.py ├── build-initial-embeddings.py ├── build-target-vocabulary.py ├── build-vocabulary.py ├── corpora.py ├── dump-example-cache.py ├── dump-target-vocabulary.py ├── dump-vocabulary.py ├── examples.py ├── state.py ├── targetvocabulary.py ├── train.py └── vocabulary.py └── weight-histogram.py /.gitignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | *~ 4 | \#*\# 5 | bak 6 | .coverage 7 | *.dag 8 | *.dag.* 9 | data 10 | data-sample-bilingual/*.pkl.gz 11 | data-sample-bilingual/*.png 12 | fmap*.pkl.gz 13 | html 14 | hyperparameters.language-model.yaml 15 | LOGS 16 | LOGS*/[A-Za-z]* 17 | *.o 18 | old 19 | *.orig 20 | out 21 | *.out 22 | *.out.gz 23 | pdf 24 | *.pyc 25 | results 26 | *.so 27 | *.sw? 28 | TMP_DBI 29 | wsj_10* 30 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | *~ 4 | \#*\# 5 | bak 6 | .coverage 7 | *.dag 8 | *.dag.* 9 | data 10 | data-sample-bilingual/*.pkl.gz 11 | data-sample-bilingual/*.png 12 | fmap*.pkl.gz 13 | html 14 | hyperparameters.language-model.yaml 15 | LOGS 16 | LOGS*/[A-Za-z]* 17 | *.o 18 | old 19 | *.orig 20 | out 21 | *.out 22 | *.out.gz 23 | pdf 24 | *.pyc 25 | results 26 | *.so 27 | *.sw? 28 | TMP_DBI 29 | wsj_10* 30 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Approach based upon language model in Bengio et al ICML 09 "Curriculum Learning". 2 | 3 | 4 | You will need my common python library: 5 | http://github.com/turian/common 6 | and my textSNE wrapper for t-SNE: 7 | http://github.com:turian/textSNE 8 | 9 | You will need Murmur for hashing. 
10 | easy_install Murmur
11 | 
12 | To train a monolingual language model, you should probably run:
13 | [edit hyperparameters.language-model.yaml]
14 | ./build-vocabulary.py
15 | ./train.py
16 | 
17 | To train a word-to-word multilingual model, you should probably run:
18 | cd scripts; ln -s hyperparameters.language-model.sample.yaml hyperparameters.language-model.yaml
19 | 
20 | # Create validation data:
21 | ./preprocess-validation.pl > ~/data/SemEval-2-2010/Task\ 3\ -\ Cross-Lingual\ Word\ Sense\ Disambiguation/validation.txt Tokenizer v3
22 | 
23 | # [optional: Lemmatize]
24 | Tadpole --skip=tmp -t ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.nl | perl -ne 's/\t/ /g; print lc($_);' | chop 3 | from-one-line-per-word-to-one-line-per-sentence.py > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.nl
25 | #
26 | 
27 | [TODO:
28 | * Initialize using the monolingual language model in the source language.
29 | * Loss = logistic, not margin.
30 | ]
31 | 
32 | # [optional: Run the following if your alignment for language pair l1-l2
33 | # is in the form l2-l1; a sketch of what this amounts to is at the end of this README]
34 | ./scripts/preprocess/reverse-alignment.pl
35 | 
36 | ./w2w/build-vocabulary.py
37 | # Then inspect the output with ./w2w/dump-vocabulary.py, to see if you want
38 | # to adjust the w2w minfreq hyperparameter.
39 | 
40 | ./w2w/build-target-vocabulary.py
41 | # Then see the output with ./w2w/dump-target-vocabulary.py
42 | 
43 | ./w2w/build-initial-embeddings.py
44 | 
45 | # [optional: Filter the corpora to include only sentences with certain
46 | # focus words.]
47 | # You want to make sure this happens AFTER
48 | # ./w2w/build-initial-embeddings.py, so you have good embeddings for words
49 | # that aren't as common in the filtered corpora.
50 | ./scripts/preprocess/filter-sentences-by-lemma.py
51 | # You should then move the filtered corpora to a new data directory.
52 | 
53 | # [optional: This will cache all the training examples onto disk. This will
54 | # happen automatically during training anyhow.]
55 | ./scripts/w2w/build-example-cache.py
56 | 
57 | ./w2w/train.py
58 | 
59 | TODO:
60 | * sqrt scaling of SGD updates
61 | * Use normalization of embeddings?
62 | * How do we initialize embeddings?
63 | * Use tanh, not softsign?
64 | * When doing SGD on embeddings, use sqrt scaling of embedding size?
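
A note on the alignment files referenced above (e.g.
data-sample-bilingual/*/training.align.*): each line corresponds to one
sentence pair and contains whitespace-separated "sourceindex-targetindex"
token pairs, as in the sample data below. As a rough sketch of what
./scripts/preprocess/reverse-alignment.pl amounts to (assuming only that pair
format; the Perl script is the authoritative implementation), reversing an
l2-l1 alignment into l1-l2 form is a per-pair swap:

    # Sketch only: swap every "i-j" pair to "j-i" on each line of an
    # alignment file given on the command line (or stdin), writing to stdout.
    import fileinput
    for line in fileinput.input():
        pairs = [p.split("-") for p in line.split()]
        print " ".join("%s-%s" % (j, i) for (i, j) in pairs)

For example, the alignment line "17-22 3-3 6-7" becomes "22-17 3-3 7-6".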
65 | -------------------------------------------------------------------------------- /data-sample-bilingual/en-fr/training2.align.en-fr: -------------------------------------------------------------------------------- 1 | 17-22 3-3 6-7 9-12 12-15 15-20 1-2 18-23 4-4 7-9 7-8 10-13 13-19 16-21 2-0 19-24 5-6 8-11 11-14 0-1 2 | 0-0 3-1 3 | 0-0 3-1 4 | 3-4 6-7 12-15 9-10 2-3 5-6 11-14 8-9 1-2 4-5 7-8 10-13 0-0 5 | 6-9 9-13 12-16 2-4 5-8 11-17 8-11 1-3 4-7 7-10 10-15 10-14 13-18 0-2 0-1 0-0 3-5 6 | 1-1 0-0 7 | 11-8 3-1 14-11 17-14 6-3 9-6 20-16 1-2 12-9 15-12 4-1 18-15 21-17 10-7 2-0 13-10 5-4 16-13 19-16 8-5 8 | 13-12 6-7 23-20 2-5 9-9 16-13 19-17 12-11 18-16 1-2 21-19 7-8 17-15 0-0 20-18 9 | 3-4 8-0 3-3 22-10 17-13 4-5 9-1 20-11 23-16 18-15 5-6 10-1 16-14 21-8 19-15 10 | 1-1 0-2 3-3 11 | 1-1 0-0 12 | 14-13 17-16 6-6 9-9 1-3 12-12 4-5 15-14 7-7 18-17 21-21 10-10 16-15 5-4 19-19 8-8 19-18 22-22 11-11 13 | 1-1 2-2 3-5 4-6 0-0 14 | 3-3 6-6 12-15 9-10 9-9 2-2 5-5 8-8 11-12 14-16 1-1 4-4 7-7 10-11 0-0 13-14 15 | 17-23 3-1 6-8 9-12 12-14 1-2 15-18 4-3 7-7 13-15 16-20 5-4 8-11 11-13 0-0 16 | 3-4 6-9 12-15 15-21 1-2 4-5 7-10 10-11 13-17 16-22 2-3 5-8 5-7 8-14 11-13 14-20 0-0 17 | 3-4 6-6 9-9 5-5 8-8 1-2 4-5 7-7 0-0 18 | 13-13 3-3 6-5 9-9 12-12 2-2 8-8 11-11 4-4 7-6 0-1 10-10 0-0 19 | 1-3 0-2 20 | 3-4 25-24 6-9 17-18 9-12 20-21 12-15 1-2 15-17 26-26 4-5 7-10 18-19 10-13 21-22 13-16 24-25 24-24 2-3 5-8 8-11 19-20 11-14 0-0 21 | 1-3 2-4 0-0 22 | 9-14 2-3 8-13 12-10 1-2 1-1 14-15 4-5 11-9 6-12 0-0 3-5 23 | 4-4 1-1 5-6 2-2 6-7 3-3 0-0 24 | 3-5 14-13 17-17 20-19 9-8 6-0 12-12 23-22 4-6 15-14 18-18 21-20 10-9 7-2 2-4 16-15 22-21 11-10 25 | 9-14 3-1 2-3 7-12 1-0 10-15 0-4 4-2 6-11 26 | 14-15 6-5 9-9 1-3 12-12 15-16 18-20 4-1 7-7 10-10 13-13 2-2 16-17 19-23 5-4 11-11 0-0 27 | 2-10 12-17 5-13 1-9 11-16 4-12 0-8 3-11 10-15 13-18 8-1 8-0 28 | 14-15 6-6 9-9 20-18 12-12 1-1 15-14 4-3 7-7 18-16 10-10 13-13 5-4 8-8 19-17 0-2 11-11 29 | 14-16 17-20 3-2 3-1 20-24 6-7 9-11 12-13 15-18 4-5 4-4 18-21 7-9 21-25 10-12 13-15 2-3 16-19 5-6 19-23 8-10 22-26 11-14 0-0 30 | 16-18 3-3 6-6 12-15 9-10 15-17 2-2 5-5 11-14 8-8 14-16 1-1 4-4 10-13 7-7 0-0 31 | 3-2 6-6 9-10 12-14 2-1 5-5 8-9 11-13 1-1 7-8 10-11 13-15 0-0 32 | 2-1 3-5 0-2 1-4 33 | 6-8 12-18 15-23 2-3 5-7 11-17 14-22 8-10 1-2 10-16 4-5 7-9 13-19 3-6 0-0 34 | 6-7 9-10 12-14 2-2 5-6 8-9 11-13 1-1 14-15 4-4 7-8 10-11 0-0 13-14 3-5 35 | 17-16 9-12 23-25 12-15 1-2 4-9 7-11 10-13 13-18 21-21 16-19 2-1 11-14 8-5 22-22 0-0 14-17 36 | 10-8 13-11 6-4 9-7 2-2 12-10 5-3 1-1 8-5 11-9 0-0 7-4 37 | 3-5 6-11 9-16 12-19 15-22 1-1 4-8 7-12 10-17 21-27 13-18 5-10 2-2 19-26 8-15 11-20 14-21 0-0 38 | 5-7 4-2 2-3 0-4 1-6 3-1 39 | 3-4 14-11 17-17 6-6 20-20 1-1 15-12 18-19 18-18 7-7 2-3 2-2 16-16 8-8 0-0 40 | 0-0 3-1 41 | 0-0 3-1 42 | 14-13 6-7 17-16 9-9 12-12 15-14 4-3 7-8 18-17 10-10 2-2 2-1 13-11 5-6 16-15 19-18 0-0 43 | 14-16 3-4 17-19 6-7 20-22 20-21 9-10 23-26 12-14 1-1 15-17 4-5 18-20 7-8 21-25 10-13 13-15 2-3 16-18 5-6 19-20 8-9 22-23 11-12 0-0 44 | 9-14 12-17 6-6 3-0 8-13 11-16 5-5 10-15 7-8 7-7 4-1 45 | 14-16 19-12 3-3 6-8 9-14 17-10 1-1 15-17 20-13 4-4 10-15 7-7 21-19 18-11 2-2 5-6 13-9 0-0 46 | 3-4 11-7 14-10 17-14 20-20 6-1 23-23 12-8 4-2 15-12 18-17 21-22 0-16 7-1 2-3 10-6 16-13 19-18 5-0 22-21 47 | 6-10 12-21 9-5 1-1 4-7 13-23 7-6 5-9 5-8 11-20 14-24 8-4 0-0 3-7 48 | 1-17 8-1 11-4 14-8 20-22 17-13 7-21 2-19 9-2 12-5 15-9 5-16 18-14 3-20 10-3 13-7 16-10 19-15 0-0 49 | 3-4 9-11 12-13 2-3 5-7 8-10 14-15 4-6 0-5 7-9 10-14 50 | 17-23 20-25 3-0 6-5 9-10 12-15 15-21 18-24 21-26 7-7 
10-12 10-11 13-17 16-22 5-1 8-8 11-14 14-18 51 | 5-8 1-1 2-2 3-4 4-7 4-6 0-0 52 | 6-9 9-13 12-17 15-22 18-27 1-1 4-8 7-10 10-15 13-18 16-25 19-28 23-6 2-2 8-14 5-5 11-16 14-20 17-26 21-4 25-29 0-0 53 | 13-12 3-1 16-13 9-8 2-0 8-7 14-15 17-16 7-6 10-11 7-5 54 | 6-9 9-12 2-6 12-15 15-18 5-8 11-14 1-4 14-17 4-7 7-10 0-5 10-13 13-16 55 | 11-9 25-24 14-13 3-0 17-17 20-20 9-6 23-23 12-10 15-15 26-25 4-1 18-14 21-21 7-2 10-9 13-12 16-16 19-19 5-1 19-18 22-22 8-3 56 | 10-8 3-4 9-7 2-3 5-6 1-2 4-5 0-0 57 | 6-7 9-11 2-3 5-6 8-8 1-2 1-1 4-4 7-8 10-12 0-0 3-5 58 | 9-14 3-4 5-9 2-1 1-2 7-11 10-15 0-0 6-10 59 | 3-6 4-24 9-18 6-9 15-21 1-2 18-26 7-14 10-19 13-23 16-25 2-5 19-28 5-10 8-15 11-22 17-27 0-1 0-0 60 | 1-2 0-1 61 | 22-19 3-4 25-23 14-11 28-26 17-15 20-21 1-1 12-9 4-5 23-18 26-24 15-8 2-2 24-22 13-10 18-6 27-25 16-7 0-0 62 | 1-1 0-0 63 | 14-16 3-2 3-1 6-5 9-10 12-13 15-15 26-24 4-3 18-18 7-7 21-22 10-11 13-14 2-1 16-17 27-25 5-4 8-9 19-19 11-13 0-0 64 | 14-16 25-24 3-2 17-19 11-4 9-10 23-25 1-0 15-17 7-11 4-2 10-12 21-22 24-26 16-20 2-1 27-27 8-13 13-8 19-21 22-23 0-0 65 | 3-2 6-5 9-9 12-14 2-1 5-4 11-13 8-8 4-3 7-6 17-15 0-0 13-14 66 | 12-19 8-15 5-9 11-18 1-5 4-8 7-12 10-17 0-2 0-1 3-6 9-16 0-0 6-10 67 | 17-21 25-23 14-12 28-28 11-3 6-5 15-19 12-10 9-2 26-24 21-22 7-4 16-20 13-11 27-27 5-6 10-2 8-8 0-0 68 | 3-4 6-7 9-10 12-13 2-3 5-6 8-9 14-17 1-2 10-16 7-8 0-1 13-15 69 | 3-4 6-8 12-13 1-2 15-18 4-5 7-9 10-11 13-15 2-3 16-19 5-6 8-10 11-12 14-18 0-0 70 | 14-5 16-15 5-9 12-11 15-14 15-13 4-8 1-2 3-7 0-1 71 | 10-8 3-1 6-4 9-7 2-0 5-2 8-6 4-3 7-5 72 | 4-4 1-1 2-2 3-3 0-0 73 | 1-1 5-4 6-6 7-9 6-5 3-2 0-0 74 | 1-1 0-0 75 | 16-18 15-22 12-16 5-9 18-23 2-3 14-21 4-11 11-15 1-2 7-12 17-19 10-14 3-8 13-17 3-7 0-0 76 | 3-4 6-6 9-11 2-3 8-10 5-5 11-13 1-2 7-9 10-12 0-0 77 | 11-9 3-2 17-16 6-5 20-21 9-7 1-1 12-10 15-15 4-3 7-6 21-22 10-8 13-11 5-4 19-20 0-0 78 | 11-7 14-10 15-24 6-2 9-5 12-8 10-14 16-25 7-3 19-29 13-9 5-1 17-28 8-4 79 | 22-19 11-8 3-2 25-22 17-14 9-6 20-15 1-1 23-20 26-23 18-16 7-5 7-4 10-7 2-2 13-11 5-3 16-12 19-17 0-0 80 | 13-13 5-9 2-3 12-12 8-10 4-8 1-2 3-6 0-0 81 | 6-9 12-18 8-16 1-10 5-8 14-20 11-14 4-7 7-12 13-19 9-17 3-6 0-0 2-11 82 | 9-12 12-16 2-4 5-9 8-12 11-15 4-7 1-1 7-11 10-14 0-0 6-10 83 | 9-14 12-17 2-7 5-11 11-20 8-13 1-8 4-9 10-18 13-21 0-5 6-12 3-6 84 | 14-16 20-21 6-4 9-9 1-2 12-12 4-7 18-18 21-24 10-10 2-6 13-15 24-25 5-8 16-13 19-19 0-1 85 | 12-19 8-15 2-3 11-18 14-21 7-14 1-2 10-17 1-1 13-20 9-16 0-0 3-5 86 | 3-2 2-4 1-3 4-6 14-15 7-11 10-13 0-0 13-14 6-10 87 | 3-4 10-8 13-13 16-15 19-18 2-3 9-7 12-9 4-10 18-17 15-11 1-2 1-1 14-14 17-16 0-0 88 | 14-14 3-1 17-16 9-10 1-6 20-19 12-13 4-3 4-2 18-17 21-20 13-15 5-4 19-18 11-11 22-21 89 | 3-3 6-6 14-8 12-15 1-1 23-19 4-4 7-7 10-13 21-22 13-17 24-23 5-5 8-11 11-14 0-0 90 | 13-12 3-2 16-15 6-5 9-8 12-11 15-14 8-7 5-1 11-10 14-13 0-3 7-6 4-0 10-9 91 | 3-4 6-7 9-11 2-3 5-6 8-10 11-13 1-2 4-5 7-9 7-8 10-12 0-0 92 | 14-16 19-12 3-3 6-7 12-18 17-11 1-0 15-17 20-13 4-4 13-16 7-2 18-10 2-1 21-14 5-5 8-8 11-15 22-21 22-20 93 | 3-4 9-12 6-6 12-13 2-3 5-7 17-21 14-15 7-9 0-1 16-20 10-10 0-0 13-14 94 | 4-4 1-1 2-2 3-3 0-0 95 | 3-6 25-26 8-1 14-14 17-17 20-21 9-9 23-24 1-3 12-13 4-7 26-27 15-15 18-19 18-18 29-28 21-22 10-10 2-5 24-25 2-4 7-0 5-8 16-16 19-20 22-23 0-2 11-12 11-11 96 | 1-3 8-8 4-2 3-6 7-7 0-0 97 | 6-9 2-4 5-7 1-3 4-6 11-10 14-13 7-8 10-11 0-0 3-5 98 | 19-12 3-0 12-20 6-6 15-24 9-9 17-11 1-3 20-15 13-22 7-7 2-4 18-10 21-14 5-5 11-19 11-18 14-23 8-8 22-25 0-2 99 | 6-8 2-6 9-10 3-0 12-13 1-5 11-12 5-1 7-9 0-3 10-11 100 | 
5-13 8-16 7-15 1-1 3-11 6-14 4-2 0-0 2-11 101 | 3-4 6-7 2-3 5-8 12-10 8-9 11-13 1-2 4-6 7-11 10-12 13-16 0-0 9-15 102 | 3-6 20-28 6-8 9-12 12-18 1-2 4-7 21-29 7-10 10-16 13-19 16-24 2-5 19-26 8-11 11-17 14-21 17-25 0-0 103 | 3-2 6-8 20-23 12-15 9-7 18-21 4-1 7-6 10-10 13-16 5-5 19-22 11-11 0-0 104 | 17-21 3-3 6-8 9-10 12-15 1-1 15-18 18-22 4-4 10-12 10-11 13-16 16-19 2-2 5-6 11-14 0-0 14-17 105 | 4-3 5-4 2-1 6-5 3-2 0-0 7-6 106 | 10-8 6-7 12-14 11-13 1-1 4-6 7-9 0-0 3-5 107 | 3-6 17-20 20-25 9-11 12-15 1-2 4-7 18-21 7-8 5-18 21-24 10-13 13-16 2-5 16-19 19-23 19-22 8-10 22-26 0-1 14-17 108 | 4-4 1-2 5-5 3-3 0-1 0-0 109 | 14-16 3-3 17-18 6-6 9-12 20-21 23-24 15-16 4-4 18-20 7-7 21-22 10-9 24-25 2-2 16-17 5-5 8-11 19-21 11-13 22-23 0-1 110 | 6-7 3-0 2-0 5-5 8-9 1-2 4-4 7-8 0-1 111 | 1-1 2-2 3-4 0-0 4-5 112 | 3-4 17-18 20-21 9-11 12-13 1-2 4-5 15-15 18-19 7-8 10-10 13-14 2-3 5-6 16-16 19-20 8-9 11-12 0-1 0-0 113 | 3-2 6-6 12-15 9-9 2-1 5-5 8-8 11-11 4-3 7-7 10-10 0-0 114 | 6-5 20-22 9-12 12-17 23-25 1-1 15-15 4-3 18-20 21-23 10-9 24-26 13-14 2-2 16-16 5-4 8-11 19-21 22-24 11-13 0-0 25-27 115 | 8-9 4-4 1-2 5-5 6-6 3-3 0-1 0-0 116 | 20-16 13-12 10-6 2-5 9-9 19-15 12-8 22-17 1-4 18-14 11-7 21-16 0-1 0-0 17-13 117 | 3-6 14-15 6-11 12-20 9-13 15-26 1-2 4-8 7-10 10-14 13-17 2-3 5-9 11-20 8-12 0-0 118 | 11-9 14-11 17-14 20-21 9-7 1-1 4-3 15-11 7-6 18-15 21-20 10-8 2-2 13-12 5-4 16-13 19-19 8-7 22-22 0-0 119 | 1-1 0-0 120 | 3-4 25-24 3-3 6-9 9-15 12-16 15-19 23-22 1-1 7-13 18-20 13-17 24-23 2-2 5-8 8-14 11-11 14-18 22-21 0-0 121 | 13-12 13-11 7-1 9-9 2-4 12-12 8-8 1-3 4-7 5-0 11-10 14-13 0-2 3-6 122 | 22-19 3-4 14-13 25-23 6-7 17-15 1-5 12-11 10-21 23-20 15-14 18-16 2-3 13-12 11-22 24-20 5-6 0-2 123 | 17-23 6-9 9-12 1-1 18-24 4-6 15-16 7-11 10-13 13-17 2-3 19-25 8-10 11-14 14-18 0-0 124 | 6-7 13-11 3-0 16-14 15-17 12-12 1-2 11-10 8-4 14-13 17-18 0-3 7-5 10-9 125 | 3-1 6-5 10-4 9-3 11-9 4-2 7-6 0-0 126 | 19-10 6-0 9-6 20-14 12-8 23-18 15-11 18-15 7-1 10-7 13-9 5-2 16-12 127 | 6-4 9-9 8-8 5-2 1-1 7-7 10-10 0-0 3-5 128 | 6-10 2-3 3-4 4-8 5-9 129 | 3-4 6-8 12-16 9-10 2-3 5-7 11-14 1-2 4-6 7-9 10-11 0-1 0-0 130 | 6-11 1-1 2-5 3-6 4-7 0-0 131 | 13-9 6-3 9-5 5-2 11-11 14-10 4-1 17-14 0-0 7-4 132 | 3-3 6-6 9-9 19-17 12-13 2-1 5-5 8-8 1-2 14-16 4-4 10-14 7-7 13-15 0-0 133 | 5-8 2-6 6-9 0-4 1-5 134 | 3-2 12-17 5-9 11-16 2-0 7-15 1-1 8-4 4-3 6-10 135 | 3-1 6-6 9-8 5-6 1-2 11-10 4-4 7-7 0-0 10-9 136 | 3-4 6-8 16-17 9-11 8-14 12-13 2-3 15-16 5-5 11-12 1-2 14-15 4-5 17-18 7-6 0-0 137 | 12-15 15-20 15-19 6-3 18-22 2-1 11-14 14-18 5-2 17-21 1-1 4-5 10-13 3-8 13-16 0-0 138 | 3-3 6-5 2-2 5-6 8-8 1-1 4-4 7-7 0-0 139 | 3-4 25-24 17-17 6-7 9-9 20-18 23-23 12-12 1-0 26-25 4-4 15-13 7-8 10-10 21-19 2-3 24-23 16-16 5-5 8-8 0-2 19-15 11-11 140 | 1-2 2-3 0-0 3-1 141 | 3-4 6-9 9-11 5-8 1-2 7-10 10-12 0-1 142 | 10-7 3-1 9-4 12-8 8-3 11-6 0-2 7-5 143 | 14-15 3-3 17-19 6-6 23-29 9-9 12-12 1-1 15-16 4-4 18-20 21-26 7-7 10-10 13-15 2-2 16-17 5-5 19-22 19-21 8-8 22-25 11-11 0-0 144 | 3-4 9-13 12-15 5-9 2-3 8-12 11-17 1-2 4-6 7-11 4-5 10-14 13-18 0-0 6-10 145 | 4-21 6-7 9-9 1-2 23-22 26-28 15-13 5-20 7-8 10-11 24-23 2-1 16-16 8-10 11-12 0-0 25-27 146 | 5-15 10-7 12-16 9-10 8-14 2-2 4-11 7-13 1-1 11-8 6-12 0-0 147 | 5-9 8-12 1-8 0-7 7-11 3-6 6-10 148 | 14-14 3-3 9-9 17-11 12-13 1-1 4-4 7-7 18-15 21-16 2-2 5-5 8-8 16-10 0-0 149 | 16-17 9-11 12-14 2-2 2-1 8-10 11-13 5-0 7-9 10-12 13-16 3-5 150 | 4-20 17-10 18-25 10-4 3-19 6-22 16-8 12-7 2-17 5-21 11-6 14-11 19-26 1-17 9-15 151 | 5-8 1-2 1-1 2-3 3-4 4-7 0-0 152 | 13-10 10-5 3-0 6-4 16-12 
9-6 12-9 5-2 8-7 17-22 11-8 14-11 4-1 153 | 2-4 6-7 0-2 7-8 3-3 0-1 154 | 3-3 6-5 9-9 12-12 2-0 5-4 8-8 11-11 1-1 4-4 7-6 10-10 155 | 1-1 0-2 3-3 156 | 3-4 3-3 6-8 12-14 5-7 2-1 8-10 11-13 1-0 7-9 13-15 0-0 157 | 14-14 17-19 3-0 12-21 6-6 20-22 9-10 1-5 15-16 4-2 7-7 21-23 10-12 10-11 16-18 16-17 5-3 8-9 0-4 11-13 158 | 6-7 12-11 9-5 8-9 1-1 11-10 7-8 0-0 159 | 14-15 14-14 17-20 20-23 9-5 12-12 15-16 18-21 4-2 7-4 24-28 13-13 16-19 19-22 5-3 8-8 22-24 11-10 160 | 3-4 3-3 14-11 6-6 17-15 9-8 20-17 1-1 4-5 15-12 7-6 10-9 2-2 13-10 16-13 8-7 19-16 0-0 161 | 3-4 6-10 17-18 9-12 20-21 23-25 1-1 4-7 15-16 7-11 18-19 10-13 21-23 2-6 24-26 5-8 16-17 19-20 11-14 22-24 0-0 162 | 1-3 2-6 3-7 0-2 163 | 17-19 3-1 14-8 20-21 23-26 26-28 15-17 4-4 7-11 12-6 18-20 10-14 21-22 24-27 16-18 2-0 13-7 8-10 19-20 11-16 22-24 0-3 25-27 164 | 17-23 3-2 20-26 6-6 9-9 12-15 15-22 18-23 4-3 21-27 10-10 13-17 2-1 2-0 19-24 5-5 8-8 14-19 11-11 165 | 4-4 1-1 5-5 2-2 6-6 3-3 7-7 0-0 166 | 3-6 6-12 17-22 16-2 9-15 12-18 4-10 14-1 10-17 7-8 0-16 2-5 15-3 5-11 8-9 13-0 3-7 167 | 6-13 17-23 22-17 20-18 4-11 7-15 18-24 9-0 12-3 2-9 2-8 5-12 10-1 19-21 13-4 3-10 168 | 10-8 6-4 9-7 12-11 2-1 5-3 1-1 8-5 11-10 4-2 7-6 0-0 169 | 14-16 17-17 6-7 9-11 12-15 1-1 15-18 18-22 4-3 7-8 10-12 13-14 2-2 16-17 5-6 19-23 8-10 11-13 0-0 170 | 5-6 1-0 2-3 3-4 0-0 4-5 171 | 6-9 15-22 5-8 14-21 17-24 11-14 20-27 4-7 1-1 10-16 13-20 16-23 9-17 19-25 0-0 172 | 10-8 3-3 9-9 2-2 5-4 8-7 1-1 11-10 4-5 7-6 0-0 173 | 3-4 6-9 9-11 12-15 2-3 5-7 8-10 11-14 11-13 1-2 4-6 7-8 10-12 13-16 0-0 174 | 7-4 8-5 5-2 2-0 6-3 3-1 175 | 1-1 5-6 2-2 6-7 3-3 0-0 4-5 176 | 10-7 6-5 9-10 12-13 2-3 5-8 2-2 8-9 1-1 4-4 11-7 0-0 177 | 3-4 9-9 12-14 15-18 8-13 2-2 14-17 14-16 1-1 11-10 4-5 10-12 13-15 0-0 16-19 178 | 1-1 2-4 3-4 3-3 0-0 4-5 179 | 7-5 4-1 5-2 6-3 0-0 1-4 180 | 1-2 8-8 5-5 2-3 6-6 3-4 7-7 0-0 181 | 3-4 6-8 9-12 2-4 5-7 8-11 11-14 1-1 4-6 7-10 10-13 182 | 3-3 6-7 9-12 2-2 8-11 18-17 11-13 1-1 4-4 10-13 17-16 13-15 0-0 183 | 7-3 6-6 8-15 1-10 5-5 0-9 3-13 10-16 4-4 2-12 9-15 184 | 14-16 3-4 17-19 6-6 9-11 12-14 1-2 15-18 7-8 10-12 13-15 2-3 16-18 5-5 8-10 11-13 0-0 185 | 6-8 12-18 9-12 11-17 5-6 8-11 1-5 14-19 4-7 10-16 7-10 0-3 13-15 186 | 4-4 1-1 2-2 3-3 0-0 187 | 3-4 9-10 2-3 5-6 8-10 1-2 4-7 7-8 10-11 0-0 188 | 17-23 14-15 20-27 3-2 20-26 9-10 12-13 15-20 1-1 18-24 4-4 21-28 7-7 10-11 13-14 16-21 2-1 19-25 5-5 8-9 11-12 0-0 189 | 3-3 6-6 9-9 2-2 5-5 8-8 11-11 1-1 4-4 7-7 10-10 0-0 190 | 6-6 13-10 16-15 3-0 2-1 12-9 15-14 1-4 5-3 8-8 14-13 4-2 7-7 17-16 11-5 191 | 13-13 3-3 16-16 12-15 2-3 5-8 11-14 18-17 4-4 14-13 20-19 6-10 192 | 3-5 6-11 17-18 20-23 12-15 1-4 15-20 4-6 18-21 7-9 10-12 13-16 2-5 5-10 16-19 19-22 11-14 0-3 14-17 193 | 9-13 3-2 12-14 5-9 2-2 14-16 1-1 4-6 7-11 10-12 13-15 0-0 6-10 194 | 2-1 0-3 2-0 3-2 1-4 195 | 10-8 1-11 6-4 9-7 11-13 5-3 5-2 8-6 4-1 2-12 7-5 0-0 196 | 7-0 2-5 5-7 1-3 11-8 4-4 0-2 0-1 3-6 197 | 3-2 17-18 20-20 9-9 23-23 1-0 15-16 4-3 18-15 21-22 7-4 2-1 16-17 5-6 19-19 8-5 22-21 0-0 198 | 7-3 14-6 3-1 10-4 6-2 2-0 16-8 15-7 11-5 199 | 12-16 6-6 9-11 2-2 8-10 11-15 5-4 1-1 4-3 10-13 13-17 7-7 0-0 200 | 13-13 7-3 10-6 1-12 3-0 6-2 9-5 12-8 5-1 8-4 11-7 201 | 10-8 3-4 9-7 2-3 5-6 1-2 4-5 0-0 202 | 1-1 5-4 2-3 6-7 4-6 0-0 203 | 3-2 12-21 17-14 9-7 1-1 15-18 4-5 18-15 10-11 16-17 19-23 11-13 11-12 14-19 8-4 0-0 204 | 9-14 6-9 8-13 5-8 8-12 2-1 1-1 4-6 7-10 0-0 3-5 205 | 7-2 10-7 2-4 9-3 1-1 8-5 11-8 3-6 0-0 206 | 7-3 10-6 3-1 1-10 6-2 9-5 2-0 11-11 8-4 207 | 3-2 6-6 2-2 12-11 5-4 8-8 1-1 11-10 4-3 7-7 0-0 10-9 208 | 1-2 0-1 
0-0 209 | 13-13 3-4 6-9 9-12 2-3 8-11 5-5 1-2 7-10 0-0 210 | 14-12 3-1 6-5 20-22 9-9 12-11 15-16 4-2 7-7 18-17 7-6 21-23 13-13 2-0 5-4 19-21 8-8 22-24 11-10 211 | 10-7 7-2 3-0 16-13 6-1 5-5 5-4 15-10 8-6 4-4 11-8 17-14 212 | 25-26 14-14 3-3 17-19 6-6 20-22 9-9 23-24 1-1 15-15 4-4 18-20 7-7 21-23 10-11 13-16 24-25 2-2 5-5 19-21 8-8 22-23 0-0 213 | 4-3 1-1 3-2 0-0 214 | 9-14 12-17 2-6 5-9 8-13 11-16 1-5 1-4 4-8 7-12 10-15 13-18 0-2 3-7 0-1 0-0 6-10 215 | 3-4 13-13 16-16 19-19 2-3 5-7 15-15 8-9 18-18 11-11 1-1 4-5 14-14 7-8 17-17 10-10 0-0 216 | 3-4 6-8 8-14 2-3 8-13 11-17 5-6 1-2 7-12 4-5 0-0 9-15 12-20 217 | 11-9 20-20 2-25 6-2 17-12 15-10 21-21 24-28 18-13 7-3 3-23 13-8 1-24 5-1 16-11 19-16 25-29 8-4 218 | 2-5 3-7 0-3 4-8 1-4 219 | 6-11 3-4 12-21 20-24 9-14 15-20 1-2 4-9 7-12 26-25 21-24 2-3 5-10 8-13 11-19 14-22 0-1 0-0 220 | 3-3 6-5 9-8 2-1 5-4 8-7 1-2 4-3 7-6 10-10 0-0 221 | 14-16 3-3 17-19 9-10 12-12 1-1 15-17 4-4 18-20 7-5 10-11 13-13 2-2 16-18 8-9 0-0 222 | 14-15 3-1 20-24 6-6 23-28 9-10 12-13 15-17 18-21 4-2 21-25 7-7 24-29 10-11 13-18 16-20 16-19 2-0 19-23 8-8 11-12 223 | 3-4 15-18 5-7 2-1 14-19 7-12 17-21 4-6 10-14 13-16 16-20 6-11 0-0 224 | 12-19 18-25 2-5 5-9 11-18 1-4 9-2 4-8 15-10 13-20 0-3 8-1 19-26 3-6 225 | 11-9 14-12 3-1 6-3 9-7 23-23 12-10 15-13 4-2 18-18 7-4 21-21 10-8 13-11 8-5 22-22 0-0 226 | 14-14 17-21 3-3 6-8 23-26 12-13 26-29 15-19 1-0 18-23 4-4 7-9 24-28 13-15 16-20 2-1 5-5 19-22 8-10 11-16 22-25 25-27 227 | 2-7 5-11 8-14 11-19 8-13 1-4 4-9 7-12 10-15 9-18 0-0 3-5 6-10 228 | 25-25 20-26 6-6 17-14 9-9 1-1 23-20 4-3 21-27 26-22 7-7 18-16 10-10 13-12 24-21 27-28 5-5 16-13 8-8 19-18 19-17 22-24 0-0 229 | 6-8 13-12 9-11 16-13 12-14 2-4 5-7 8-10 1-1 4-6 14-15 7-9 0-3 3-5 230 | 14-14 17-16 6-5 20-19 9-9 12-13 1-1 15-17 7-6 24-25 10-7 2-2 16-15 5-4 5-3 8-8 19-18 22-22 25-29 0-0 11-10 231 | 14-14 17-17 6-6 20-19 9-8 1-1 12-11 1-0 4-4 15-14 18-18 7-7 21-21 10-9 2-3 13-12 5-5 16-15 19-20 8-8 0-2 232 | 14-15 3-3 17-20 20-23 6-6 9-9 1-1 15-16 4-4 18-21 7-7 21-24 10-10 13-14 2-2 16-17 5-5 19-22 8-8 11-13 0-0 233 | 3-3 11-22 7-20 2-2 1-1 4-6 6-14 5-18 0-0 8-21 234 | 3-1 3-0 16-13 12-10 5-4 15-12 1-2 8-5 11-9 7-7 235 | 3-5 19-10 22-15 6-8 25-19 11-2 15-21 4-6 20-11 23-14 2-4 5-7 21-12 10-1 24-16 27-22 0-3 14-20 236 | 16-17 3-2 6-5 9-9 15-16 8-8 11-11 1-1 14-15 4-4 7-7 10-10 0-0 237 | 13-13 3-1 6-6 9-8 12-12 2-0 15-14 1-5 8-8 11-10 14-14 0-3 4-2 7-7 17-15 238 | 1-13 3-1 6-3 9-6 12-11 15-15 5-2 8-5 14-14 4-1 10-10 0-0 7-4 239 | 3-4 6-6 2-3 5-5 1-1 7-7 4-2 0-0 240 | 11-7 3-2 17-17 1-1 9-4 23-21 18-18 15-9 10-6 13-10 5-3 16-13 19-19 8-4 0-0 22-20 241 | 6-8 9-11 12-13 2-3 15-16 5-7 8-10 1-2 14-15 4-6 7-9 10-12 0-0 13-14 3-5 242 | 1-2 2-3 3-6 3-5 4-7 0-1 5-10 6-14 243 | 3-4 9-12 5-8 8-11 4-7 1-1 0-0 244 | 23-28 9-10 15-22 12-14 1-2 18-24 7-8 10-13 16-23 13-15 2-3 19-26 5-7 22-27 8-9 0-1 11-11 14-18 17-25 245 | 3-5 14-13 6-7 20-23 9-10 15-21 1-1 18-20 7-8 10-11 13-16 16-22 2-3 5-6 8-9 19-17 11-12 0-0 246 | 5-6 1-0 2-3 3-5 3-4 0-0 4-5 247 | 2-4 4-14 8-11 5-5 3-13 1-1 0-3 7-7 9-15 248 | 10-3 5-8 12-11 9-2 4-7 8-1 7-6 11-5 249 | 19-11 22-13 14-8 12-19 12-18 1-1 20-10 4-3 23-15 13-20 15-7 24-25 18-9 2-2 5-4 11-17 8-5 0-0 250 | 16-4 6-10 9-17 12-20 14-5 4-9 23-22 1-1 22-2 7-12 10-18 20-3 24-23 5-8 8-13 11-19 13-7 0-0 251 | 3-3 6-6 2-2 5-5 7-12 1-2 4-4 0-0 252 | 3-0 0-2 4-6 1-5 253 | 14-16 22-19 3-3 25-22 6-7 9-10 12-13 1-1 15-17 4-5 23-18 26-25 7-8 10-11 21-20 13-15 2-2 24-21 5-6 8-9 11-11 0-0 254 | 19-22 12-16 3-0 9-10 15-18 15-17 18-21 14-20 1-5 8-9 5-3 11-12 7-8 4-1 10-11 16-19 255 | 3-2 
14-9 17-15 20-19 12-12 23-20 4-3 10-10 21-18 13-13 24-21 2-1 16-17 8-11 19-16 0-0 11-10 256 | 3-4 6-7 9-10 2-2 5-6 8-9 1-1 4-5 7-8 0-0 257 | 4-3 1-2 5-4 3-3 0-1 0-0 258 | 11-23 2-7 5-12 8-17 5-11 1-6 4-10 10-20 7-14 0-4 3-9 9-19 12-24 259 | 6-8 9-12 12-15 2-2 5-7 8-11 8-10 11-14 1-2 14-16 4-6 7-9 10-13 0-1 13-15 3-5 260 | 17-15 17-14 1-7 20-19 6-0 4-9 9-5 9-4 12-11 15-13 18-17 21-20 7-1 10-6 13-12 16-13 19-18 3-8 8-3 11-10 261 | 17-20 11-4 7-22 4-14 1-1 9-3 15-17 12-5 13-21 5-16 5-15 2-2 16-19 19-23 0-0 262 | 3-4 6-8 9-13 12-17 15-20 1-2 1-1 18-24 18-23 4-5 7-10 13-18 16-22 2-3 19-25 5-7 8-11 11-16 14-19 0-0 263 | 17-21 3-3 20-24 6-7 6-6 9-12 12-16 18-25 15-17 4-4 21-26 7-8 10-13 16-20 2-0 19-23 5-5 8-9 11-15 11-14 0-2 264 | 14-14 3-3 17-17 6-6 9-9 12-12 1-1 15-15 4-4 18-18 7-7 10-10 13-13 2-2 16-16 5-5 19-19 8-8 11-11 0-0 265 | 13-12 16-17 9-11 2-5 5-8 15-15 8-10 1-4 4-7 14-16 7-9 3-6 0-0 266 | 6-5 2-4 8-9 4-8 1-3 1-2 10-10 0-0 267 | 14-14 17-18 20-24 6-5 9-8 1-2 12-11 15-15 4-3 18-19 7-6 10-10 21-20 10-9 2-4 13-12 16-17 19-23 22-25 8-7 0-0 268 | 9-10 2-4 2-3 8-9 4-7 1-2 7-8 10-11 0-1 269 | 17-20 3-2 6-6 20-23 9-9 12-16 15-18 18-21 4-3 21-26 7-7 10-13 16-19 13-11 2-0 22-29 19-22 5-4 0-5 8-8 11-14 14-17 270 | 22-17 3-3 9-10 28-21 20-16 1-1 4-5 23-18 4-4 26-19 18-14 2-2 5-8 8-9 27-20 19-15 0-0 271 | 14-14 3-3 17-16 6-6 9-8 12-13 1-1 15-15 18-22 4-4 7-7 10-10 2-2 13-12 16-16 5-5 8-9 11-11 0-0 272 | 3-6 9-17 17-19 14-10 12-18 4-11 1-2 7-13 18-20 10-16 10-15 15-9 2-5 5-12 16-19 0-0 273 | 3-6 17-22 6-8 9-11 12-14 15-19 18-23 4-3 7-9 10-12 13-15 16-20 2-1 5-7 8-10 11-13 14-18 0-0 274 | 20-26 6-8 9-11 12-16 15-19 1-1 4-6 18-23 18-22 21-27 7-9 10-15 13-20 2-4 5-7 19-24 16-17 22-28 11-14 14-18 0-0 275 | 7-2 2-8 5-11 10-4 12-12 4-10 9-0 8-3 3-9 276 | 6-13 3-5 25-21 17-16 9-10 20-19 1-2 4-6 15-15 12-7 26-23 26-22 18-17 10-9 5-11 2-2 24-20 27-24 16-14 19-18 277 | 4-4 1-2 5-5 2-3 3-4 0-0 278 | 14-15 17-18 20-23 6-4 23-26 9-9 1-5 12-14 15-16 4-3 18-19 7-7 21-24 10-10 13-14 16-17 5-6 19-20 8-8 22-25 0-2 279 | 5-7 2-2 3-3 4-6 1-5 0-0 280 | 16-18 3-3 13-11 12-10 15-13 1-1 4-5 14-14 11-9 4-4 0-0 281 | 13-11 10-6 3-1 16-13 9-4 15-12 18-14 8-3 4-2 11-5 0-0 17-13 282 | 11-8 22-17 14-11 25-19 6-3 9-6 20-15 12-9 23-17 15-12 4-1 26-20 7-4 10-7 21-16 13-10 24-18 16-14 5-2 8-4 0-0 283 | 16-17 2-5 5-10 8-14 15-16 11-15 1-4 4-9 7-12 0-6 10-15 3-7 284 | 3-3 5-12 12-16 9-7 18-22 2-2 11-16 4-10 1-4 0-9 17-21 7-11 10-15 13-17 16-19 285 | 16-5 3-3 12-15 4-11 9-7 23-23 1-1 17-6 7-10 29-25 13-16 5-13 10-8 8-9 19-17 22-24 0-0 6-14 286 | 9-14 12-18 15-22 2-7 8-15 11-18 6-1 14-20 1-5 17-23 10-17 5-0 13-21 4-4 3-8 16-22 287 | 6-8 9-11 12-14 2-3 5-7 8-10 11-13 1-0 7-9 0-4 10-12 3-6 288 | 3-2 6-8 9-11 15-17 4-3 7-9 10-11 21-21 13-15 5-4 8-10 11-12 14-18 0-0 289 | 5-9 5-8 2-1 1-4 4-7 4-6 7-10 0-3 3-5 290 | 1-1 7-11 2-2 4-7 0-0 291 | 16-17 3-1 6-6 12-15 13-9 15-16 5-5 9-4 9-3 8-7 14-16 292 | 3-4 6-6 9-9 2-3 5-6 8-8 1-1 4-5 7-7 0-0 293 | 8-8 4-2 5-5 9-9 6-6 3-1 294 | 7-5 1-1 4-2 8-6 9-7 3-2 6-3 0-0 295 | 5-8 1-1 6-9 2-3 3-7 4-6 0-0 4-5 296 | 2-5 9-9 5-8 8-11 1-6 11-15 13-22 7-10 0-4 10-12 3-7 12-21 297 | 9-11 2-4 6-2 8-10 1-5 11-14 5-1 10-13 0-3 298 | 10-7 13-11 5-10 16-13 5-9 8-8 15-12 18-16 1-2 4-4 21-17 14-12 17-15 0-0 299 | 6-11 17-21 20-24 15-22 12-14 1-1 4-7 7-12 10-17 13-18 16-23 2-5 5-6 8-13 3-10 0-0 300 | 4-4 0-7 6-8 2-2 1-6 3-3 301 | 7-3 3-2 9-6 2-1 5-5 0-10 11-13 1-0 8-4 10-12 302 | 6-9 12-15 2-3 5-8 11-14 4-7 7-11 1-0 0-2 13-16 0-1 3-6 10-10 303 | 4-4 1-1 2-2 3-3 0-0 304 | 4-4 1-1 2-2 3-3 305 | 13-11 6-6 9-9 2-2 5-5 
15-14 8-8 1-1 14-13 4-4 7-7 10-10 0-0 306 | 3-4 9-14 6-7 12-15 5-6 8-9 1-2 1-1 11-10 4-5 7-8 10-13 0-0 307 | 7-3 4-2 1-0 8-5 9-6 6-4 0-1 308 | 9-14 16-16 6-2 8-13 12-12 15-16 18-19 5-0 14-15 7-11 0-4 3-5 309 | 3-3 10-5 16-14 13-8 6-4 2-2 9-5 15-12 12-7 8-6 1-1 14-11 0-0 7-4 310 | 14-16 17-21 3-4 6-7 9-10 12-14 1-2 15-17 4-5 7-8 10-11 13-15 16-20 2-3 5-6 8-9 11-13 0-1 311 | 14-14 17-18 3-0 1-5 12-12 15-15 4-1 10-11 2-6 13-13 5-7 16-16 19-23 8-9 0-3 312 | 13-12 10-6 12-11 6-1 8-9 5-2 11-10 14-13 4-0 7-4 313 | 3-2 6-5 16-13 9-8 16-12 19-17 5-4 15-11 8-6 1-1 11-10 4-3 17-16 14-11 0-0 10-9 314 | 1-1 5-6 2-2 6-7 3-4 3-3 0-0 4-5 315 | 3-4 6-7 20-20 17-11 1-1 4-5 12-8 15-9 18-12 2-2 5-6 11-19 16-10 19-16 0-0 316 | 14-14 17-18 20-22 9-6 12-12 26-28 15-15 18-20 4-1 21-23 7-5 24-27 10-9 13-13 27-29 16-16 19-21 22-24 8-4 11-11 25-28 0-0 317 | 3-5 14-15 17-18 6-8 9-11 1-1 23-21 15-16 26-22 18-17 10-10 13-14 2-4 16-19 5-7 8-9 0-0 22-20 318 | 17-21 3-3 6-8 20-23 12-16 15-19 1-1 4-7 7-12 18-22 10-13 13-17 24-24 5-11 5-10 16-20 2-2 8-15 22-25 11-14 25-29 14-18 0-0 319 | 6-9 9-13 2-6 3-0 5-8 8-11 1-4 7-10 10-14 4-1 320 | 9-12 15-21 4-9 7-14 14-18 1-1 16-26 10-15 3-8 6-13 13-17 0-0 321 | 17-12 3-3 10-7 13-10 9-6 5-5 12-8 1-1 18-13 11-9 4-4 14-11 0-0 322 | 3-4 12-19 9-9 9-8 1-2 4-5 13-25 7-7 16-26 10-11 2-3 5-6 14-22 17-27 11-12 11-11 0-0 323 | 2-10 9-13 6-3 11-17 5-6 14-20 8-9 3-11 4-5 13-19 10-14 7-8 0-2 324 | 1-1 5-6 2-3 6-8 3-5 7-9 0-0 4-5 325 | 6-7 9-9 2-2 5-6 8-8 1-1 0-0 326 | 17-23 3-1 14-11 6-5 20-22 18-24 4-3 15-12 21-26 7-6 8-17 2-2 19-25 5-4 16-8 0-0 327 | 4-4 1-2 5-7 2-3 3-6 0-0 328 | 3-2 9-8 2-1 5-5 8-7 1-0 0-4 4-3 7-6 10-9 329 | 20-17 3-3 10-6 9-10 19-13 8-8 12-7 15-12 18-16 14-11 17-14 0-0 7-4 330 | 3-2 16-15 6-6 9-9 12-14 2-1 5-5 11-13 8-8 1-0 7-7 10-11 10-10 331 | 3-1 10-4 13-7 5-9 8-13 2-0 1-3 4-8 14-14 7-10 0-2 11-5 6-11 332 | 4-3 8-6 5-4 9-8 2-0 10-9 3-2 333 | 1-17 17-20 11-5 14-11 6-1 2-18 12-6 15-13 7-2 3-19 10-4 13-7 16-14 0-0 8-3 334 | 14-15 22-18 17-21 6-6 9-12 12-13 20-16 1-2 15-19 23-22 4-4 7-8 13-14 21-17 2-3 16-20 5-5 8-11 0-1 3-7 335 | 17-23 6-9 20-24 9-11 15-19 1-1 4-6 18-22 10-13 13-17 2-5 16-20 5-8 19-22 8-10 11-14 0-3 14-18 3-7 336 | 7-4 1-1 8-5 2-3 6-4 3-2 0-0 337 | 3-4 14-12 17-16 6-6 1-2 12-11 4-3 15-13 7-9 18-17 10-7 2-1 5-5 16-15 19-18 0-0 11-10 338 | 5-7 1-1 6-8 3-5 4-6 0-0 339 | 1-3 5-8 2-6 4-7 0-1 340 | 1-3 0-0 341 | 6-4 17-14 9-10 12-17 1-5 15-21 18-23 21-28 7-6 16-24 19-25 11-20 5-3 8-7 342 | 1-1 8-5 2-3 9-6 3-2 0-0 10-7 343 | 17-21 3-2 12-20 1-10 9-12 6-4 15-18 4-3 10-15 7-6 16-18 8-13 5-5 11-19 0-1 14-17 344 | 7-2 13-7 6-3 6-2 9-5 2-1 12-6 1-0 8-4 14-8 345 | 22-27 17-10 7-0 12-16 6-5 15-20 16-10 21-26 11-15 14-19 19-11 4-2 8-1 13-17 346 | 22-18 3-3 6-7 14-9 28-25 1-1 12-11 23-20 4-4 15-13 18-14 21-17 2-2 13-12 24-21 5-6 5-5 27-24 19-15 0-0 11-10 347 | 7-5 4-2 1-1 2-1 6-4 6-3 0-0 348 | 12-17 6-6 9-11 2-2 5-5 8-10 14-19 11-13 17-21 1-1 4-4 7-9 13-18 10-12 16-20 0-0 349 | 3-2 13-10 16-12 9-6 2-1 12-10 5-3 1-1 8-5 14-11 0-0 17-13 10-9 7-4 350 | 4-4 1-1 5-5 2-3 6-6 7-7 0-0 351 | 6-10 3-1 9-11 12-14 15-20 1-0 4-2 7-7 10-12 13-18 2-1 5-6 8-8 11-13 14-19 0-0 352 | 13-12 2-5 12-14 1-4 8-7 14-15 11-10 0-3 7-6 4-0 10-9 353 | 6-9 9-14 6-8 15-18 2-2 5-7 8-11 14-17 1-1 4-5 7-10 10-15 13-16 3-6 0-0 16-19 354 | 11-8 4-15 9-11 9-10 15-23 15-22 7-18 18-28 2-14 12-5 5-16 10-12 16-25 8-19 0-9 14-26 3-13 6-17 17-27 355 | 19-9 3-3 6-5 17-12 9-7 1-1 20-11 4-4 20-10 15-13 7-6 10-8 21-15 2-2 16-14 8-7 0-0 356 | 6-4 9-8 12-12 5-3 8-7 11-10 11-9 0-5 13-16 7-6 10-11 357 | 3-3 20-27 9-12 
15-25 6-4 12-15 21-28 7-9 10-14 10-13 16-24 13-16 2-2 19-26 5-5 8-10 11-14 0-0 14-17 358 | 2-9 10-2 10-1 1-8 13-5 4-12 9-0 0-7 12-4 7-11 14-13 11-3 359 | 3-2 6-4 9-7 2-1 5-3 8-6 1-0 4-3 7-5 0-0 360 | 6-8 9-11 5-7 2-1 8-10 4-6 1-0 7-9 0-3 3-5 361 | 3-0 6-5 9-10 6-4 6-3 12-13 8-9 11-12 5-1 7-8 4-2 10-11 13-14 362 | 4-4 1-1 5-5 2-3 6-6 7-7 0-0 363 | 7-14 3-8 4-9 0-3 5-12 1-4 2-8 6-13 364 | 17-15 12-16 15-20 1-2 4-4 18-21 7-5 10-12 13-18 2-3 11-17 8-8 14-19 0-1 0-0 365 | 6-8 6-7 3-1 2-3 5-6 1-2 8-5 7-9 4-4 10-10 0-0 366 | 1-3 2-1 3-5 8-12 0-0 367 | 1-1 0-0 368 | 1-1 6-8 2-2 7-9 3-4 0-0 4-5 369 | 3-2 9-11 12-14 15-16 2-1 8-11 14-19 11-13 4-7 1-1 6-17 13-21 16-22 10-12 0-0 370 | 3-6 3-5 25-25 6-10 17-19 9-12 12-16 23-23 15-20 1-2 4-7 26-26 7-11 10-15 10-14 13-17 24-24 2-3 5-9 27-27 22-21 0-0 371 | 3-6 9-17 1-4 7-18 15-14 10-12 13-16 8-19 16-21 2-4 11-20 0-1 372 | 3-4 9-12 12-16 8-15 2-3 5-6 7-14 1-1 4-5 11-8 0-0 6-10 10-9 373 | 4-2 6-7 7-8 1-5 0-0 1-4 374 | 5-7 2-4 3-2 0-0 4-5 375 | 3-3 14-13 17-18 6-4 20-19 9-8 9-7 23-23 1-1 12-11 15-14 4-2 18-16 21-20 10-9 13-12 16-15 22-22 0-0 11-10 376 | 18-4 2-7 9-11 5-10 14-1 11-18 17-3 22-19 20-6 13-2 4-9 20-5 3-8 12-0 377 | 3-4 7-2 9-13 12-15 6-1 4-7 5-0 10-14 378 | 6-5 3-0 2-2 1-3 9-1 10-14 379 | 3-4 6-7 9-12 12-15 2-3 8-11 5-6 11-14 1-2 4-5 0-1 0-0 380 | 8-9 6-0 2-5 7-2 3-6 5-3 4-8 1-4 381 | 1-1 2-2 3-5 4-6 0-0 382 | 3-4 25-24 14-11 6-6 17-14 9-9 1-2 23-20 26-25 15-12 7-7 18-16 18-15 10-8 2-3 24-21 13-10 27-26 5-5 16-13 19-17 0-1 383 | 3-5 20-27 9-13 6-2 15-21 15-20 18-26 1-1 4-6 7-11 10-14 13-18 16-22 2-4 5-7 19-24 8-12 11-15 0-3 14-19 384 | 6-7 12-15 15-20 9-9 2-3 14-20 5-4 11-14 8-8 1-1 10-13 7-6 16-21 13-16 0-0 385 | 3-2 17-18 6-8 9-11 12-16 1-1 4-3 7-9 18-19 10-12 13-15 2-4 5-5 8-10 19-20 11-13 0-0 386 | 6-13 3-5 15-25 1-2 7-14 12-10 4-6 10-15 16-26 19-28 2-3 11-22 17-27 0-0 14-17 387 | 3-3 6-7 9-10 2-3 5-7 11-14 8-9 1-2 4-6 7-8 13-15 0-0 388 | 3-3 6-7 9-12 2-2 8-11 5-5 11-14 1-1 7-10 10-13 13-15 0-0 16-19 389 | 10-6 1-12 6-4 11-20 7-19 4-14 5-7 9-5 3-13 2-13 0-1 390 | 22-18 3-4 25-19 6-6 28-23 20-16 1-2 23-20 15-14 7-7 10-12 29-24 21-17 2-3 24-21 5-5 27-22 8-8 19-15 0-1 391 | 6-9 9-13 12-14 2-4 5-8 17-28 8-12 15-16 16-27 1-1 4-6 14-15 7-10 13-17 0-0 3-5 392 | 4-3 5-4 9-7 6-6 2-0 3-2 10-8 393 | 9-13 3-2 6-5 12-14 8-8 11-10 4-1 7-6 10-11 13-15 0-0 394 | 5-9 2-1 1-5 4-7 3-6 0-0 6-10 395 | 14-16 25-25 17-20 28-28 1-10 20-23 9-8 12-13 15-16 26-26 18-21 7-7 21-24 10-12 13-15 16-18 27-27 19-22 11-14 8-5 0-0 396 | 6-11 2-4 3-8 7-12 4-9 1-6 5-10 0-0 397 | 17-23 3-2 20-25 6-5 9-10 12-12 18-24 15-14 4-3 7-6 10-11 13-15 2-1 16-18 19-22 5-4 8-7 11-13 398 | 6-9 2-4 5-7 11-13 4-8 7-11 10-12 0-0 3-5 399 | 3-4 6-7 9-12 12-15 2-2 5-6 8-10 11-14 8-9 1-1 4-5 7-8 10-13 13-16 0-0 400 | 3-2 6-7 2-1 5-6 4-5 1-0 0-4 7-8 401 | 3-1 6-8 9-11 12-16 1-3 15-18 4-6 7-9 10-13 13-17 16-21 2-4 5-7 8-10 11-15 14-20 0-2 402 | 2-2 3-6 0-1 1-5 403 | 1-1 5-2 6-5 0-0 7-6 404 | 3-4 13-9 16-12 2-3 19-15 12-8 15-11 1-1 18-14 11-7 6-13 14-10 0-0 3-5 405 | 3-4 14-13 17-17 9-8 1-2 12-11 4-5 15-14 18-18 10-9 21-19 2-3 13-12 5-6 16-15 8-7 0-1 0-0 11-10 22-20 406 | 17-23 23-28 9-11 6-3 9-10 12-13 15-20 18-27 1-1 4-4 7-5 10-12 2-8 2-7 13-16 16-22 16-21 19-24 5-2 11-15 14-19 0-0 407 | 8-16 14-24 1-7 1-6 2-22 7-15 11-12 5-23 3-9 6-14 0-2 4-1 15-25 408 | 3-5 14-14 17-18 20-22 9-9 6-1 23-23 12-12 4-7 15-15 18-19 21-22 10-10 13-13 5-8 16-17 19-22 22-22 11-11 409 | 5-9 9-5 1-4 11-13 4-7 3-12 0-3 2-11 410 | 1-14 4-16 7-21 12-10 15-9 15-8 8-22 16-23 0-13 2-2 11-12 11-11 3-7 17-24 411 | 3-3 6-7 16-14 9-10 
12-13 2-2 2-1 5-5 8-9 11-11 4-4 7-8 0-0 412 | 9-17 20-26 6-9 15-24 4-10 12-13 26-29 7-8 16-25 13-14 5-7 8-11 11-12 413 | 3-4 14-11 20-23 20-22 12-15 9-7 15-19 1-1 18-20 21-24 10-8 2-4 19-21 22-25 8-6 0-2 11-12 414 | 14-14 3-3 6-4 9-11 20-20 12-13 1-1 15-17 15-16 7-5 10-12 13-15 2-2 16-18 5-6 19-19 0-0 415 | 6-9 9-13 3-2 2-5 8-12 11-15 5-4 1-1 7-11 10-14 4-3 0-0 416 | 20-27 3-2 17-19 6-7 9-9 12-14 15-21 1-1 18-24 4-5 7-12 21-28 13-15 16-22 10-8 2-3 19-26 5-6 22-29 8-9 11-13 14-18 0-0 417 | 13-12 10-7 3-2 9-6 2-1 9-5 12-10 5-4 1-0 14-13 11-8 4-3 0-0 418 | 3-4 6-9 9-12 12-16 2-3 5-6 8-11 11-15 1-2 4-7 7-10 10-14 10-13 0-0 3-5 419 | 5-12 2-3 7-14 1-2 3-11 4-4 6-13 0-0 420 | 3-6 17-19 9-11 23-27 9-10 12-16 1-4 26-29 4-8 15-18 18-20 10-12 24-28 13-16 2-5 5-9 19-24 22-26 0-2 25-28 6-14 14-17 421 | 3-3 14-12 17-18 20-23 1-1 12-10 4-6 18-22 15-13 5-16 2-2 13-11 16-14 19-21 8-9 6-15 0-0 422 | 11-8 3-4 3-3 14-13 17-19 1-1 15-18 12-10 4-5 18-20 10-7 2-2 5-6 13-9 19-21 0-0 423 | 4-4 1-2 1-1 5-6 3-5 424 | 7-2 10-6 0-18 3-23 3-22 6-1 9-5 9-4 12-8 15-11 5-0 8-3 16-24 11-7 14-10 425 | 3-3 9-12 12-16 6-6 5-10 2-2 8-11 11-15 1-1 4-4 7-9 10-14 426 | 3-3 6-8 2-7 16-15 12-11 5-5 15-13 1-2 14-14 7-9 4-4 10-10 0-0 427 | 9-11 5-3 8-8 5-2 0-7 4-6 4-5 7-9 428 | 4-3 1-1 2-1 3-2 0-0 429 | 12-17 6-5 9-10 2-2 8-9 11-14 1-1 4-4 13-18 7-6 10-11 0-0 16-19 430 | 14-16 3-3 6-9 20-23 9-12 23-26 12-14 1-1 15-18 4-6 18-21 7-10 21-24 10-13 13-15 2-2 5-8 16-18 19-22 8-11 22-25 431 | 6-10 17-20 20-24 9-13 12-16 1-2 4-8 18-22 7-11 18-21 21-25 10-14 2-6 16-19 5-9 8-13 19-23 11-15 0-1 3-7 432 | 13-11 16-16 9-10 2-5 6-4 12-12 8-9 5-3 1-2 4-7 14-15 11-8 3-6 0-0 433 | 3-4 16-17 6-7 9-10 2-3 15-16 5-6 8-9 11-12 1-2 14-15 4-5 17-18 7-8 10-11 0-0 13-14 434 | 13-13 6-4 9-7 12-12 2-2 11-11 8-6 1-0 4-3 0-1 7-5 435 | 9-13 3-2 15-18 5-8 2-1 8-11 1-6 11-15 14-17 7-10 10-14 4-3 13-16 0-0 436 | 5-13 13-10 10-5 2-3 9-6 4-11 12-8 26-15 1-2 3-11 8-4 18-12 6-14 11-7 14-9 0-0 437 | 3-3 16-15 6-5 9-10 12-13 2-3 18-19 5-4 8-9 1-2 11-11 17-18 7-7 0-0 13-14 438 | 10-7 5-10 2-3 9-6 15-15 4-12 12-8 1-2 14-14 7-5 0-0 439 | 25-25 3-3 14-13 28-28 17-17 6-5 9-8 23-23 1-0 15-16 26-26 4-4 18-20 7-6 21-21 10-9 24-24 2-2 13-12 27-27 16-15 19-19 8-7 22-22 0-0 11-10 440 | 11-9 25-24 3-3 14-12 6-5 20-18 9-7 23-23 1-1 12-10 4-4 15-13 18-16 21-19 10-8 24-23 2-2 5-6 16-14 19-17 0-0 22-20 441 | 6-9 2-3 8-13 5-8 5-7 7-14 1-1 3-11 4-5 0-0 9-15 442 | 1-2 5-5 2-3 6-8 3-4 4-7 0-0 443 | -------------------------------------------------------------------------------- /data.txt: -------------------------------------------------------------------------------- 1 | 2 | # We take the text, tokenize it, maxlength 40 it, 3 | # lemmatize it, and align it to get: 4 | 5 | cd data/maxlength40-lemmas 6 | 7 | mkdir en-fr ; cp ~/data/multitext/align-lemmas/aligner.fr-en/training.?? en-fr/ ; cp ~/data/multitext/align-lemmas/aligner.fr-en/training.align en-fr/training.align.fr-en 8 | mkdir en-nl ; cp ~/data/multitext/align-lemmas/aligner.nl-en/training.?? en-nl/ ; cp ~/data/multitext/align-lemmas/aligner.nl-en/training.align en-nl/training.align.nl-en 9 | mkdir en-de ; cp ~/data/multitext/align-lemmas/aligner.de-en/training.?? en-de/ ; cp ~/data/multitext/align-lemmas/aligner.de-en/training.align en-de/training.align.de-en 10 | mkdir en-it ; cp ~/data/multitext/align-lemmas/aligner.it-en/training.?? en-it/ ; cp ~/data/multitext/align-lemmas/aligner.it-en/training.align en-it/training.align.it-en 11 | mkdir en-es ; cp ~/data/multitext/align-lemmas/aligner.es-en/training.?? 
en-es/ ; cp ~/data/multitext/align-lemmas/aligner.es-en/training.align en-es/training.align.es-en 12 | 13 | ../../scripts/preprocess/reverse-alignment.pl en-de/training.align.de-en 14 | ../../scripts/preprocess/reverse-alignment.pl en-nl/training.align.nl-en 15 | ../../scripts/preprocess/reverse-alignment.pl en-it/training.align.it-en 16 | ../../scripts/preprocess/reverse-alignment.pl en-es/training.align.es-en 17 | ../../scripts/preprocess/reverse-alignment.pl en-fr/training.align.fr-en 18 | 19 | # At this point, we run the preprocessing, because we want to build the 20 | # initial embeddings BEFORE filtering 21 | 22 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-de/training.en en-de/training.de en-de/training.align.en-de & 23 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-it/training.en en-it/training.it en-it/training.align.en-it & 24 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-fr/training.en en-fr/training.fr en-fr/training.align.en-fr & 25 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-nl/training.en en-nl/training.nl en-nl/training.align.en-nl & 26 | ../../scripts/preprocess/filter-sentences-by-lemma.py en-es/training.en en-es/training.es en-es/training.align.en-es & 27 | 28 | ################################################################## 29 | ### Below is deprecated 30 | ################################################################## 31 | 32 | ##### Wait, using freeling for es will changes the number of tokens. grr.... 33 | ##### Let's use treetagger for es instead 34 | 35 | 36 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.en data/filtered-full-bilingual/ 37 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.fr data/filtered-full-bilingual/ 38 | cp ~/data/multitext/align/aligner.fr-en.filtered.maxlength-40/training.align data/filtered-full-bilingual/training.align.fr-en 39 | 40 | ../../scripts/preprocess/reverse-alignment.pl en-fr/training.align.fr-en 41 | 42 | Tadpole --skip=tmp -t ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.nl | perl -ne 's/\t/ /g; print lc($_);' | chop 3 | from-one-line-per-word-to-one-line-per-sentence.py > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.nl 43 | 44 | ~/utils/src/treetagger-3.2/lemmatizer.py french-utf8 ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-fr/training.fr > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.fr 45 | ~/utils/src/treetagger-3.2/lemmatizer.py german ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.de 46 | ~/utils/src/treetagger-3.2/lemmatizer.py spanish ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.es> ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.es 47 | 48 | ~/utils/src/libiconv-1.13.1/src/iconv --byte-subst="<0x%x>" --unicode-subst="<0x%x>" ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de.latin1 49 | 
~/utils/src/treetagger-3.2/lemmatizer.py german ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.de.latin1 > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.de 50 | 51 | cd /u/turian/utils/src/FreeLing-2.1/src/main/simple_examples 52 | ./justmorph.py it ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-it/filtered-training.it > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.it 53 | #./justmorph.py es ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.es > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.es 54 | 55 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-es/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.en 56 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-fr/training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.en 57 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-de/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.en 58 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-nl/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.en 59 | ./justmorph.py en ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual/en-it/filtered-training.en > ~/dev/python/mt-language-model/neural-language-model/data/filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.en 60 | 61 | cp ../filtered-full-bilingual/en-fr/filtered-training.align.en-fr ../filtered-full-bilingual-lemmas/en-fr/filtered-training-lemmas.align.en-fr 62 | cp ../filtered-full-bilingual/en-nl/filtered-training.align.en-nl ../filtered-full-bilingual-lemmas/en-nl/filtered-training-lemmas.align.en-nl 63 | cp ../filtered-full-bilingual/en-de/filtered-training.align.en-de ../filtered-full-bilingual-lemmas/en-de/filtered-training-lemmas.align.en-de 64 | cp ../filtered-full-bilingual/en-es/filtered-training.align.en-es ../filtered-full-bilingual-lemmas/en-es/filtered-training-lemmas.align.en-es 65 | cp ../filtered-full-bilingual/en-it/filtered-training.align.en-it ../filtered-full-bilingual-lemmas/en-it/filtered-training-lemmas.align.en-it 66 | -------------------------------------------------------------------------------- /data/README.txt: -------------------------------------------------------------------------------- 1 | allwords.gz is from Childes corpus, Eng-USA/allwords.gz 2 | 3 | Create vocabulary: 4 | zcat allwords.gz | sort | uniq -c | sort -rn > allwords.vocabulary.txt 5 | 6 | test is the first 10K words. 7 | validation is the next 10K words. 8 | train is the rest. 9 | 10 | ============= 11 | 12 | wikitext.txt.gz is preprocessed English wikipedia, broken into sentences and 13 | tokenized and shuffled. 
14 | 15 | ls | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | ../../scripts/preprocess.pl | grep . | ~/common/scripts/shuffle.sh | gzip -c > ../wikitext.txt.gz 16 | 17 | zcat wikitext.txt.gz | head -10000 | gzip -c > wikitext.test.txt.gz 18 | zcat wikitext.txt.gz | head -20000 | tail -10000 | gzip -c > wikitext.validation.txt.gz 19 | zcat wikitext.txt.gz | tail -66151742 | gzip -c > wikitext.train.txt.gz 20 | 21 | ============= 22 | 23 | italian-wikitext.txt.gz is preprocessed Italian wikipedia: 24 | 25 | bzcat ~/data/italian_SemaWiki_attardi/3_tokenized.txt.bz2 | ~/data/italian_SemaWiki_attardi/one-sentence-per-line.pl | ../scripts/preprocess.pl | grep . | ~/common/scripts/shuffle.sh | gzip -c > italian-wikitext.txt.gz 26 | 27 | zcat italian-wikitext.txt.gz | head -10000 | gzip -c > italian-wikitext.test.txt.gz 28 | zcat italian-wikitext.txt.gz | head -20000 | tail -10000 | gzip -c > italian-wikitext.validation.txt.gz 29 | zcat italian-wikitext.txt.gz | tail -5672365 | gzip -c > italian-wikitext.train.txt.gz 30 | 31 | # Sanity check 32 | zcat italian-wikitext.test.txt.gz italian-wikitext.validation.txt.gz italian-wikitext.train.txt.gz | md5sum 33 | zcat italian-wikitext.txt.gz | md5sum 34 | 35 | ../scripts/examples.py italian-wikitext.validation.txt.gz | ~/common/scripts/shuffle.sh | head -1000 | gzip -c > italian-wikitext.validation-1000.txt.gz 36 | 37 | 38 | # Vocabulary 39 | zcat italian-wikitext.train.txt.gz | perl -ne 's/ /\n/g; print' | grep . | sort | uniq -c | sort -rn | gzip -c > vocabulary-italian-wikitext.txt.gz 40 | zcat vocabulary-italian-wikitext.txt.gz | head -20000 | gzip -c > vocabulary-italian-wikitext-20000.txt.gz 41 | 42 | ============= 43 | 44 | For case sensitive embeddings: 45 | 46 | find wikitext/ | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | grep . | ~/common/scripts/shuffle.sh | gzip -c > english-wikitext.case-intact.txt.gz 47 | 48 | zcat english-wikitext.case-intact.txt.gz | head -10000 | gzip -c > english-wikitext.case-intact.test.txt.gz 49 | zcat english-wikitext.case-intact.txt.gz | head -20000 | tail -10000 | gzip -c > english-wikitext.case-intact.validation.txt.gz 50 | zcat english-wikitext.case-intact.txt.gz | tail -66151742 | gzip -c > english-wikitext.case-intact.train.txt.gz 51 | 52 | # Sanity check 53 | zcat english-wikitext.case-intact.test.txt.gz english-wikitext.case-intact.validation.txt.gz english-wikitext.case-intact.train2.txt.gz | md5sum 54 | zcat english-wikitext.case-intact.txt.gz | md5sum 55 | 56 | # Vocabulary 57 | zcat english-wikitext.case-intact.train2.txt.gz | perl -ne 's/ /\n/g; print' | grep . 
| sort -T /cluster/paralisi3/turian/tmp | uniq -c | sort -rn | gzip -c > vocabulary-english-wikitext.case-intact.txt.gz 58 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -20000 | gzip -c > vocabulary-english-wikitext.case-intact-20000.txt.gz 59 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -50000 | gzip -c > vocabulary-english-wikitext.case-intact-50000.txt.gz 60 | 61 | # Enter scripts directory 62 | ./build-vocabulary.py 63 | ./random-validation-examples.py 64 | 65 | ============= 66 | -------------------------------------------------------------------------------- /data/allwords.gz: -------------------------------------------------------------------------------- 1 | /u/turian/data/childes/childes-original/Eng-USA/allwords.gz -------------------------------------------------------------------------------- /data/allwords.vocabulary-200.txt: -------------------------------------------------------------------------------- 1 | 1554809 . 2 | 514944 ? 3 | 321320 you 4 | 265588 the 5 | 237190 # 6 | 204922 I 7 | 196906 a 8 | 174804 it 9 | 142331 to 10 | 128046 and 11 | 128028 that 12 | 122771 what 13 | 121857 [: 14 | 107318 ! 15 | 97090 is 16 | 92242 this 17 | 89117 in 18 | 85166 xxx 19 | 84145 yeah 20 | 81372 xx 21 | 80737 do 22 | 78573 no 23 | 78149 on 24 | 74463 yy 25 | 65570 oh 26 | 61896 one 27 | 60334 okay 28 | 60044 your 29 | 59387 have 30 | 59367 that's 31 | 58699 don't 32 | 58381 want 33 | 56844 go 34 | 56517 he 35 | 53963 there 36 | 52772 here 37 | 51600 we 38 | 51501 +... 39 | 50561 can 40 | 49654 like 41 | 49376 see 42 | 49317 of 43 | 48794 me 44 | 47579 are 45 | 46246 it's 46 | 46012 right 47 | 43048 my 48 | 42616 put 49 | 42462 0 50 | 42362 get 51 | 40781 know 52 | 40443 up 53 | 39846 [>] 54 | 39621 [<] 55 | 37921 for 56 | 36659 going 57 | 35350 with 58 | 35312 all 59 | 35148 not 60 | 34426 I'm 61 | 34344 did 62 | 33208 what's 63 | 32993 gonna 64 | 32619 to] 65 | 32560 look 66 | 32547 was 67 | 32520 +" 68 | 31515 now 69 | 30824 they 70 | 30376 [?] 71 | 29451 [= 72 | 29099 some 73 | 28792 [//] 74 | 28388 at 75 | 27291 out 76 | 27082 how 77 | 27001 little 78 | 26538 come 79 | 25990 good 80 | 25596 where 81 | 25374 so 82 | 25270 she 83 | 25181 got 84 | 25028 you're 85 | 24777 be 86 | 24557 think 87 | 24014 just 88 | 23786 [=! 89 | 23468 [/] 90 | 23364 down 91 | 22599 but 92 | 21649 why 93 | 21044 wanna 94 | 20782 yes] 95 | 20669 huh 96 | 20167 [!] 97 | 20056 let's 98 | 19925 Mommy 99 | 19648 yes 100 | 19283 too 101 | 19173 well 102 | 18847 more 103 | 18583 then 104 | 18577 say 105 | 17953 make 106 | 17737 if 107 | 17710 when 108 | 17546 uh 109 | 17253 back 110 | 17113 two 111 | 17086 take 112 | 16944 over 113 | 16849 her 114 | 16774 big 115 | 16518 his 116 | 15957 eat 117 | 15849 um 118 | 15844 play 119 | 15799 them 120 | 15759 he's 121 | 15426 I'll 122 | 15289 there's 123 | 15185 him 124 | 14829 off 125 | 14396 does 126 | 14141 who 127 | 13941 baby 128 | 13715 about 129 | 13600 where's 130 | 13441 can't 131 | 12845 these 132 | 12638 let 133 | 12532 said 134 | 12431 ["] 135 | 12321 would 136 | 12162 hmm 137 | 11747 or 138 | 11726 didn't 139 | 11574 +/. 
140 | 11524 very 141 | 11497 book 142 | 11291 those 143 | 11167 doing 144 | 10967 other 145 | 10553 need 146 | 10435 could 147 | 10360 will 148 | 10295 tell 149 | 10093 way 150 | 9974 has 151 | 9932 mmhm 152 | 9844 hey 153 | 9792 because 154 | 9630 something 155 | 9623 time 156 | 9620 read 157 | 9614 give 158 | 9606 had 159 | 9550 &=laughs 160 | 9549 another 161 | 9474 uhhuh 162 | 9464 sit 163 | 8999 car 164 | 8923 they're 165 | 8852 here's 166 | 8845 were 167 | 8809 turn 168 | 8733 house 169 | 8727 an 170 | 8723 mommy 171 | 8570 , 172 | 8488 oh, 173 | 8485 ## 174 | 8382 goes 175 | 8368 www 176 | 8331 an(d) 177 | 8124 three 178 | 8124 really 179 | 8118 Daddy 180 | 8062 we're 181 | 8044 again 182 | 8013 nice 183 | 7985 boy 184 | 7789 doesn't 185 | 7730 else 186 | 7692 (be)cause 187 | 7690 ball 188 | 7586 please 189 | 7572 +//. 190 | 7536 &=noise 191 | 7530 hi 192 | 7466 this] 193 | 7421 yyy 194 | 7357 water 195 | 7305 from 196 | 7300 dis 197 | 7264 wait 198 | 7224 went 199 | 7152 [+ 200 | 7128 ya 201 | -------------------------------------------------------------------------------- /data/allwords.vocabulary.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/data/allwords.vocabulary.txt.gz -------------------------------------------------------------------------------- /data/batch: -------------------------------------------------------------------------------- 1 | # Generate data from the English wikipedia, case information intact 2 | 3 | find wikitext/ | grep gz | ~/common/scripts/shuffle.sh | xargs zcat | grep . | ~/common/scripts/shuffle.sh | gzip -c > english-wikitext.case-intact.txt.gz 4 | 5 | zcat english-wikitext.case-intact.txt.gz | head -10000 | gzip -c > english-wikitext.case-intact.test.txt.gz 6 | zcat english-wikitext.case-intact.txt.gz | head -20000 | tail -10000 | gzip -c > english-wikitext.case-intact.validation.txt.gz 7 | zcat english-wikitext.case-intact.txt.gz | tail -66151742 | gzip -c > english-wikitext.case-intact.train.txt.gz 8 | 9 | # Sanity check 10 | zcat english-wikitext.case-intact.test.txt.gz english-wikitext.case-intact.validation.txt.gz english-wikitext.case-intact.train.txt.gz | md5sum 11 | zcat english-wikitext.case-intact.txt.gz | md5sum 12 | 13 | # Vocabulary 14 | zcat english-wikitext.case-intact.train.txt.gz | perl -ne 's/ /\n/g; print' | grep . 
| sort | uniq -c | sort -rn | gzip -c > vocabulary-english-wikitext.case-intact.txt.gz 15 | zcat vocabulary-english-wikitext.case-intact.txt.gz | head -20000 | gzip -c > vocabulary-english-wikitext.case-intact-20000.txt.gz 16 | -------------------------------------------------------------------------------- /scripts/LOGS: -------------------------------------------------------------------------------- 1 | LOGS.NOBACKUP/ -------------------------------------------------------------------------------- /scripts/LOGS.NOBACKUP/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/LOGS.NOBACKUP/.keep -------------------------------------------------------------------------------- /scripts/batch: -------------------------------------------------------------------------------- 1 | #./train.py 2>&1 | tee train-0.1.out 2 | #./train.py 2>&1 | tee train-0.01.out 3 | #./train.py 2>&1 | tee train-0.001.out 4 | 5 | # From 20K italian and 20K english, we want the following hyperparams 6 | # 7 | # 2 EMBEDDING_LEARNING_RATE=0_000001.dat 8 | # 9 EMBEDDING_LEARNING_RATE=0_0000032.dat 9 | # 6 EMBEDDING_LEARNING_RATE=0_00001.dat 10 | # 6 EMBEDDING_LEARNING_RATE=0_000032.dat 11 | # 3 EMBEDDING_LEARNING_RATE=0_0001.dat 12 | # 13 | # 10 LEARNING_RATE=0_00000001 14 | # 8 LEARNING_RATE=0_000000032 15 | # 7 LEARNING_RATE=0_0000001 16 | # 1 LEARNING_RATE=0_00000032 17 | # 18 | # From 100K english lowercase, we want the following hyperparams 19 | # 20 | # 3 EMBEDDING_LEARNING_RATE=0_000001.dat 21 | # 3 EMBEDDING_LEARNING_RATE=0_0000032.dat 22 | # 23 | # 3 LEARNING_RATE=0_0000000032 24 | # 2 LEARNING_RATE=0_000000001 25 | # 2 LEARNING_RATE=0_000000032 26 | # 27 | # From 280K RCV1 english with case, hyperparams: 28 | # EMBEDDING_LEARNING_RATE=0_00001.dat and LEARNING_RATE=0_000000001 29 | # seemed to be the only version that worked well, when experimenting with values x10 and /10. 
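#
# (How to read the notes in this file: counts like "3 EMBEDDING_LEARNING_RATE=..."
# appear to tally how many of the best runs on that corpus used each value, and
# the {{a,b,...}} groups in the dbidispatch call at the bottom of this file are
# candidate values that get expanded into one job per combination.)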
30 | # 31 | # 32 | # From 20090916-rcv1.case-intact, the best hyperparams are: 33 | # LEARNING_RATE\=0_000000001 34 | # EMBEDDING_LEARNING_RATE\=0_0000032 35 | # 36 | # From 20090923-rcv1.unclean.CoNLL03-tokenize.case-intact, the best hyperparams are: 37 | # 3 EMBEDDING_LEARNING_RATE\=0_0000032 38 | # 39 | # 1 LEARNING_RATE\=0_0000000001 40 | # 1 LEARNING_RATE\=0_00000000032 41 | # 1 LEARNING_RATE\=0_000000001 42 | # 43 | # From 20091011-corpus-conll2003-ner and 20091011-corpus-ptb2-entirewsj, the best hyperparams are: 44 | # 2 EMBEDDING_LEARNING_RATE=0_00001 45 | # 3 EMBEDDING_LEARNING_RATE=0_000032 46 | # 47 | # 1 LEARNING_RATE=0_00000000032 48 | # 2 LEARNING_RATE=0_000000001 49 | # 2 LEARNING_RATE=0_0000000032 50 | # 51 | 52 | 53 | dbidispatch --exp_dir="T" ./train.py \ 54 | '--no_LOG_BILINEAR_MODEL' \ 55 | '--no_NORMALIZE_EMBEDDINGS' \ 56 | '--EMBEDDING_SIZE=500' \ 57 | '--HIDDEN_SIZE=100' \ 58 | '--CW_EMBEDDING_L1_PENALTY={{0,0.001,0.0001,0.00001}}' \ 59 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 60 | '--LEARNING_RATE={{0.0000001,0.00000001,0.000000001,0.0000000001,0.00000000001,0.000000000001}}' \ 61 | '--EMBEDDING_LEARNING_RATE={{0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001}}' 62 | 63 | #'--LEARNING_RATE={{0.0000001,0.00000001,0.0000000032,0.000000001,0.00000000032,0.0000000001,0.000000000032,0.00000000001,0.0000000000032,0.000000000001}}' \ 64 | #'--EMBEDDING_LEARNING_RATE={{0.0001,0.000032,0.00001,0.0000032,0.000001,0.0000001,0.00000001,0.000000001}}' 65 | 66 | #dbidispatch --exp_dir="T" ./train.py \ 67 | # '--no_NORMALIZE_EMBEDDINGS' \ 68 | # '--NGRAM_FOR_TRAINING_NOISE=0' \ 69 | # '--MONOLINGUAL_VOCABULARY_SIZE=50000' \ 70 | # '--LEARNING_RATE={{0.00000032,0.0000001,0.000000032,0.00000001,0.0000000032,0.000000001}}' \ 71 | # '--EMBEDDING_LEARNING_RATE={{0.00032,0.0001,0.000032,0.00001,0.0000032,0.000001}}' 72 | -------------------------------------------------------------------------------- /scripts/batch-build-examples: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | dbidispatch --no_machine=maggie42.iro.umontreal.ca --mem=1900 --exp_dir="T" ./w2w/build-example-cache.py \ 3 | '--no_console' \ 4 | '--no_LOG_BILINEAR_MODEL' \ 5 | '--no_NORMALIZE_EMBEDDINGS' \ 6 | '--EMBEDDING_SIZE=50' \ 7 | '--HIDDEN_SIZE=100' \ 8 | '--WINDOW_SIZE={{11,9,7,5}}' \ 9 | '--CW_EMBEDDING_L1_PENALTY=0' \ 10 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 11 | '--EMBEDDING_LEARNING_RATE=0' 12 | # '--LEARNING_RATE={{0.001,0.0001,0.00001,0.000001,0.0000001,0.00000001,0.000000001}}' \ 13 | # '--EMBEDDING_LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001}}' 14 | -------------------------------------------------------------------------------- /scripts/batch-short: -------------------------------------------------------------------------------- 1 | dbidispatch --exp_dir="T-short" ./train.py \ 2 | '--no_NORMALIZE_EMBEDDINGS' \ 3 | '--NGRAM_FOR_TRAINING_NOISE=1' \ 4 | '--VALIDATE_EVERY=2500000' \ 5 | '--MONOLINGUAL_VOCABULARY_SIZE={{5000,20000}}' \ 6 | '--TRAINING_NOISE_SMOOTHING_ADDITION={{0,10000,10000000}}' \ 7 | '--LEARNING_RATE={{0.0001,0.00001,0.000001,0.0000001}}' \ 8 | '--EMBEDDING_LEARNING_RATE={{0.01,0.001,0.0001,0.00001}}' 9 | -------------------------------------------------------------------------------- /scripts/batch-w2w: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | #dbidispatch --no_machine=maggie42.iro.umontreal.ca --mem=1900 
--exp_dir="T" ./w2w/train.py \ 3 | dbidispatch --no_machine=brams01.iro.umontreal.ca --mem=1900 --exp_dir="T" ./w2w/train.py \ 4 | '--no_console' \ 5 | '--no_LOG_BILINEAR_MODEL' \ 6 | '--no_NORMALIZE_EMBEDDINGS' \ 7 | '{{--TWO_HIDDEN_LAYERS,--no_TWO_HIDDEN_LAYERS}}' \ 8 | '--EMBEDDING_SIZE=50' \ 9 | '--HIDDEN_SIZE={{50,100,200}}' \ 10 | '--WINDOW_SIZE={{9,7,5}}' \ 11 | '--CW_EMBEDDING_L1_PENALTY=0' \ 12 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 13 | '--LEARNING_RATE={{0.01,0.001,0.0001}}' \ 14 | '--EMBEDDING_LEARNING_RATE=0' 15 | -------------------------------------------------------------------------------- /scripts/batch-w2w2: -------------------------------------------------------------------------------- 1 | #dbidispatch --exp_dir="T" ./w2w/train.py \ 2 | dbidispatch --no_machine=maggie42.iro.umontreal.ca --exp_dir="T" ./w2w/train.py \ 3 | '--no_console' \ 4 | '--no_LOG_BILINEAR_MODEL' \ 5 | '--no_NORMALIZE_EMBEDDINGS' \ 6 | '--TWO_HIDDEN_LAYERS' \ 7 | '--EMBEDDING_SIZE=50' \ 8 | '--HIDDEN_SIZE={{100,200}}' \ 9 | '--WINDOW_SIZE={{7,5}}' \ 10 | '--CW_EMBEDDING_L1_PENALTY=0' \ 11 | '--NGRAM_FOR_TRAINING_NOISE=0' \ 12 | '--LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001,0.0000001}}' \ 13 | '--EMBEDDING_LEARNING_RATE=0' 14 | # '--EMBEDDING_LEARNING_RATE={{0.1,0.01,0.001,0.0001,0.00001,0.000001}}' 15 | -------------------------------------------------------------------------------- /scripts/batch_ngrams: -------------------------------------------------------------------------------- 1 | dbidispatch ./ngrams.py \ 2 | '--MONOLINGUAL_VOCABULARY_SIZE={{5000,10000,20000}}' \ 3 | '--WINDOW_SIZE={{1,2,3}}' 4 | -------------------------------------------------------------------------------- /scripts/diagnostics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Verbose debug output for the model. 
3 | """ 4 | 5 | import logging 6 | from common.stats import stats 7 | from common.str import percent 8 | 9 | import examples 10 | 11 | import numpy 12 | import random 13 | 14 | def diagnostics(cnt, model): 15 | logging.info(stats()) 16 | idxs = range(model.parameters.vocab_size) 17 | random.shuffle(idxs) 18 | idxs = idxs[:100] 19 | 20 | embeddings_debug(model.parameters.embeddings[idxs], cnt, "rand 100 words, model %s" % model.modelname) 21 | embeddings_debug(model.parameters.embeddings[:100], cnt, "top 100 words, model %s" % model.modelname) 22 | embeddings_debug(model.parameters.embeddings[model.parameters.vocab_size/2-50:model.parameters.vocab_size/2+50], cnt, "mid 100 words, model %s" % model.modelname) 23 | embeddings_debug(model.parameters.embeddings[-100:], cnt, "last 100 words, model %s" % model.modelname) 24 | weights_debug(model.parameters.hidden_weights.value, cnt, "hidden weights, model %s" % model.modelname) 25 | weights_debug(model.parameters.output_weights.value, cnt, "output weights, model %s" % model.modelname) 26 | logging.info(stats()) 27 | 28 | def visualizedebug(cnt, model, rundir, newkeystr, WORDCNT=500): 29 | idxs = range(model.parameters.vocab_size) 30 | random.shuffle(idxs) 31 | idxs = idxs[:WORDCNT] 32 | 33 | visualize(cnt, model, rundir, idxs, "randomized%s" % newkeystr) 34 | visualize(cnt, model, rundir, range(WORDCNT), "mostcommon%s" % newkeystr) 35 | visualize(cnt, model, rundir, range(-1, -WORDCNT*50, -1*50), "leastcommon%s" % newkeystr) 36 | visualize(cnt, model, rundir, range(model.parameters.vocab_size/2-WORDCNT*20/2,model.parameters.vocab_size/2+WORDCNT*20/2, 20), "midcommon%s" % newkeystr) 37 | 38 | def visualize(cnt, model, rundir, idxs, str): 39 | """ 40 | Visualize a set of examples using t-SNE. 41 | """ 42 | from vocabulary import wordmap, wordform 43 | PERPLEXITY=30 44 | 45 | idxs = [id % model.parameters.embeddings.shape[0] for id in idxs] 46 | x = model.parameters.embeddings[idxs] 47 | print x.shape 48 | #titles = [`wordmap().str(id)` for id in idxs] 49 | titles = [wordform(id) for id in idxs] 50 | 51 | import os.path 52 | filename = os.path.join(rundir, "embeddings.model-%s.-%s-%d.png" % (model.modelname, str, cnt)) 53 | try: 54 | from textSNE.calc_tsne import tsne 55 | # from textSNE.tsne import tsne 56 | out = tsne(x, perplexity=PERPLEXITY) 57 | from textSNE.render import render 58 | render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename) 59 | except IOError: 60 | logging.info("ERROR visualizing", filename, ". 
Continuing...") 61 | 62 | def embeddings_debug(w, cnt, str): 63 | """ 64 | Output the l2norm mean and max of the embeddings, including in debug out the str and training cnt 65 | """ 66 | totalcnt = numpy.sum(numpy.abs(w) >= 0) 67 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1) 68 | logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str)) 69 | notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01) 70 | logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str)) 71 | 72 | l2norm = numpy.sqrt(numpy.square(w).sum(axis=1)) 73 | median = numpy.median(l2norm) 74 | mean = numpy.mean(l2norm) 75 | std = numpy.std(l2norm) 76 | # print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),)) 77 | l2norm = l2norm.tolist() 78 | l2norm.sort() 79 | l2norm.reverse() 80 | logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`)) 81 | # print("top 5 = %s" % `l2norm[:5]`) 82 | 83 | def weights_debug(w, cnt, str): 84 | """ 85 | Output the abs median, mean, and max of the weights w, including in debug out the str and training cnt 86 | """ 87 | w = numpy.abs(w) 88 | logging.info("%d abs of %s: median=%f mean=%f stddev=%f" % (cnt, str, numpy.median(w), numpy.mean(w), numpy.std(w),)) 89 | # print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),)) 90 | # w = w.tolist() 91 | # w.sort() 92 | # w.reverse() 93 | # logging.info("\ttop 5 = %s" % `w[:5]`) 94 | # print("top 5 = %s" % `l2norm[:5]`) 95 | -------------------------------------------------------------------------------- /scripts/dump-embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from optparse import OptionParser 4 | parser = OptionParser() 5 | parser.add_option("-m", "--modelfile", dest="modelfile") 6 | (options, args) = parser.parse_args() 7 | assert options.modelfile is not None 8 | 9 | import cPickle 10 | m = cPickle.load(open(options.modelfile)) 11 | #print m.parameters.embeddings.shape 12 | 13 | from vocabulary import wordmap 14 | for i in range(m.parameters.vocab_size): 15 | print wordmap.str(i), 16 | for v in m.parameters.embeddings[i]: 17 | print v, 18 | print 19 | -------------------------------------------------------------------------------- /scripts/eda/badrun.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # For every filename in sys.stdin, add a file BAD to that run directory. 4 | # Read stdin until there is a blank line. 5 | # 6 | # BUG: If the filename has a space in it, sorry you're out of luck. 7 | # BUG: We don't unescape quotes, we just strip them. 
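A quoting-safe variant of the same idea (an illustration, not a drop-in replacement for this script): read one path per line and delete the run's *.dat files with glob/os.remove instead of building an rm command for the shell, so the two BUGs above go away.

    import glob, os, sys

    # Sketch: mark each run directory BAD and remove its *.dat files without
    # shelling out, so spaces or quotes in paths cannot break anything.
    for line in sys.stdin:
        f = line.strip().strip('"').strip("'")
        if not f:
            break                  # stop at the first blank line, as described above
        if not os.path.exists(f):
            continue
        d = os.path.dirname(os.path.realpath(f))
        open(os.path.join(d, "BAD"), "a").close()
        for dat in glob.glob(os.path.join(d, "*.dat")):
            os.remove(dat)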
8 | # 9 | 10 | import sys, os.path, string 11 | #assert len(sys.argv)>2 12 | 13 | while 1: 14 | # for l in sys.stdin: 15 | l = sys.stdin.readline() 16 | # for l in sys.stdin: 17 | if string.strip(l) == "": break 18 | for f in string.split(l): 19 | f = f.replace('\"','').replace("\'",'') 20 | if not os.path.exists(f): continue 21 | d = os.path.dirname(os.path.realpath(f)) 22 | newf = os.path.join(d, "BAD") 23 | print newf 24 | if os.path.exists(newf): continue 25 | cmd = "rm %s" % os.path.join(d, "*.dat") 26 | print >> sys.stderr, "Creating %s, %s" % (newf, cmd) 27 | open(newf, "wt").close() 28 | os.system("rm %s" % os.path.join(d, "*.dat")) 29 | -------------------------------------------------------------------------------- /scripts/eda/batch-make-curves.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm *trainerror.dat 4 | #rm ../run*/*trainerror.dat 5 | 6 | # Make all dat files 7 | ../../eda/make-graphs-trainerror.pl 8 | 9 | ln -s ../*/*trainerror.dat . 10 | 11 | # Sort all dat files 12 | # First perl recipe adds gnuplot codes 13 | # Second perl recipe strips final ', \' to prevent gnuplot error 14 | echo > graphs-trainerror.gp 15 | echo "set terminal postscript color 12" >> graphs-trainerror.gp 16 | echo "set output 'graphs-trainerror.ps'" >> graphs-trainerror.gp 17 | echo "set logscale y" >> graphs-trainerror.gp 18 | echo "plot [] [] \\" >> graphs-trainerror.gp 19 | ~/dev/common-scripts/sort-curves.py *trainerror.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainerror.gp 20 | 21 | gnuplot graphs-trainerror.gp 22 | ps2pdf graphs-trainerror.ps 23 | cp *pdf ~/public_html/priv ; chmod a+r ~/public_html/priv/*pdf 24 | #scp *pdf turian@joyeux.iro.umontreal.ca:public_html/priv/ 25 | -------------------------------------------------------------------------------- /scripts/eda/make-graphs-trainerror.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if not $f =~ m/f1426d05c578bfd029875b646b66195044/; 11 | next if $f =~ m/\.dat$/; 12 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 13 | next if -e $badf; 14 | ($fnew = $f) =~ s/$/-trainerror.dat/; 15 | die $! 
if $fnew eq $f; 16 | print STDERR "$f => $fnew\n"; 17 | # We can allow e to be grepped, because of numbers like 5e-8 18 | $cmd = "cat $f | grep --text 'pre-update train err' | perl -ne 's/=/ /g; print' | cut -d ' ' -f 2,10 | grep -v '[a-df-zA-DF-Z]' | grep '0000 ' > $fnew"; 19 | print STDERR "$cmd\n"; 20 | system($cmd); 21 | $gnuplot .= "," unless $first; 22 | $first = 0; 23 | $gnuplot .= " \\\n\t'$fnew' with lp" 24 | } 25 | print "$gnuplot\n"; 26 | -------------------------------------------------------------------------------- /scripts/eda/old/batch-make-curves.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm *trainerror.dat *trainloss.dat *validationlogrankloss.dat 4 | #rm ../run*/*trainerror.dat ../run*/*trainloss.dat ../run*/*validationlogrankloss.dat 5 | 6 | # Make all dat files 7 | ../../eda/make-graphs-trainerror.pl 8 | ../../eda/make-graphs-trainloss.pl 9 | ../../eda/make-graphs-validationlogrankloss.pl 10 | 11 | ln -s ../*/*trainerror.dat . 12 | ln -s ../*/*trainloss.dat . 13 | ln -s ../*/*validationlogrankloss.dat . 14 | 15 | # Sort all dat files 16 | # First perl recipe adds gnuplot codes 17 | # Second perl recipe strips final ', \' to prevent gnuplot error 18 | echo > graphs-trainerror.gp 19 | echo "set terminal postscript color 12" >> graphs-trainerror.gp 20 | echo "set output 'graphs-trainerror.ps'" >> graphs-trainerror.gp 21 | echo "set logscale y" >> graphs-trainerror.gp 22 | echo "plot [] [0.006:0.025] \\" >> graphs-trainerror.gp 23 | ~/dev/common-scripts/sort-curves.py *trainerror.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainerror.gp 24 | 25 | echo > graphs-trainloss.gp 26 | echo "set terminal postscript color 12" >> graphs-trainloss.gp 27 | echo "set output 'graphs-trainloss.ps'" >> graphs-trainloss.gp 28 | echo "set logscale y" >> graphs-trainloss.gp 29 | echo "plot [] [] \\" >> graphs-trainloss.gp 30 | ~/dev/common-scripts/sort-curves.py *trainloss.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-trainloss.gp 31 | 32 | echo > graphs-validationlogrankloss.gp 33 | echo "set terminal postscript color 12" >> graphs-validationlogrankloss.gp 34 | echo "set output 'graphs-validationlogrankloss.ps'" >> graphs-validationlogrankloss.gp 35 | #echo "set logscale y" >> graphs-validationlogrankloss.gp 36 | echo "plot [] [] \\" >> graphs-validationlogrankloss.gp 37 | ~/dev/common-scripts/sort-curves.py *validationlogrankloss.dat | perl -ne "chop; print \"\\t'\$_' with l lw 3, \\\\\\n\"" | perl -e '$str = ""; while(<>){ $str .= $_; } $str =~ s/, \\$//s; print $str' >> graphs-validationlogrankloss.gp 38 | 39 | gnuplot graphs-trainerror.gp 40 | gnuplot graphs-trainloss.gp 41 | gnuplot graphs-validationlogrankloss.gp 42 | ps2pdf graphs-trainerror.ps 43 | ps2pdf graphs-trainloss.ps 44 | ps2pdf graphs-validationlogrankloss.ps 45 | cp *pdf ~/public_html/priv ; chmod a+r ~/public_html/priv/*pdf 46 | #scp *pdf turian@joyeux.iro.umontreal.ca:public_html/priv/ 47 | -------------------------------------------------------------------------------- /scripts/eda/old/make-graphs-trainloss.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 
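These .pl scripts all follow the same pattern: grep one family of log lines out of ../run*/log.*, pull out the update count and the tracked value, and write a two-column .dat file for gnuplot. A hedged Python sketch of the same extraction for the train-error curves (the regular expression is approximate; the exact field layout depends on the logging prefix in each run's log file):

    import re, sys

    # Sketch: turn "After N updates, pre-update train error <value> ..." log
    # lines into "N value" rows suitable for gnuplot.
    pattern = re.compile(r"After (\d+) updates, pre-update train error"
                         r"\D*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)")
    for line in open(sys.argv[1]):
        m = pattern.search(line)
        if m:
            print("%s %s" % (m.group(1), m.group(2)))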
4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if $f =~ m/\.dat$/; 11 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 12 | next if -e $badf; 13 | ($fnew = $f) =~ s/$/-trainloss.dat/; 14 | die $! if $fnew eq $f; 15 | print STDERR "$f => $fnew\n"; 16 | $cmd = "cat $f | grep --text 'pre-update train unpenalized loss' | perl -ne 's/=/ /g; print' | cut -d ' ' -f 2,11 | grep -v '[a-zA-Z]' | grep '0000 ' > $fnew"; 17 | print STDERR "$cmd\n"; 18 | system($cmd); 19 | $gnuplot .= "," unless $first; 20 | $first = 0; 21 | $gnuplot .= " \\\n\t'$fnew' with lp" 22 | } 23 | print "$gnuplot\n"; 24 | -------------------------------------------------------------------------------- /scripts/eda/old/make-graphs-validationlogrankloss.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Make a .dat file for each .out file. 4 | # 5 | 6 | $gnuplot = "plot"; 7 | $first = 1; 8 | #foreach $f (split(/[\r\n]+/, `ls [0-9]*out`)) { 9 | foreach $f (split(/[\r\n]+/, `ls ../run*/log.* | grep -v 'dat\$'`)) { 10 | next if $f =~ m/\.dat$/; 11 | ($badf = $f) =~ s/\/[^\/]*$/\/BAD/; 12 | next if -e $badf; 13 | ($fnew = $f) =~ s/$/-validationlogrankloss.dat/; 14 | die $! if $fnew eq $f; 15 | print STDERR "$f => $fnew\n"; 16 | $cmd = "cat $f | grep --text FINAL | cut -d ' ' -f 6,9 | perl -ne 's/[:,]//g; print' > $fnew"; 17 | print STDERR "$cmd\n"; 18 | system($cmd); 19 | $gnuplot .= "," unless $first; 20 | $first = 0; 21 | $gnuplot .= " \\\n\t'$fnew' with lp" 22 | } 23 | print "$gnuplot\n"; 24 | -------------------------------------------------------------------------------- /scripts/eda/remove-nonfinal-models.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # For each directory in @ARGV, go in that directory and remove every 4 | # model file except for the last one. 5 | # 6 | 7 | @torm = (); 8 | foreach $d (@ARGV) { 9 | $last = -1; 10 | # Find the last model 11 | foreach $f (split(/[\r\n]+/, `ls $d`)) { 12 | if ($f =~ m/model-(\d+).pkl/) { 13 | $last = $1 if $1 > $last; 14 | } 15 | } 16 | # All non-last models are added to torm 17 | foreach $f (split(/[\r\n]+/, `ls $d`)) { 18 | if ($f =~ m/model-(\d+).pkl/) { 19 | if ($1 < $last) { 20 | $torm[++$#torm] = "$d/$f"; 21 | } else { 22 | print "KEEPING $d/$f\n"; 23 | } 24 | } 25 | } 26 | } 27 | foreach $f (@torm) { 28 | $cmd = "rm $f"; 29 | print "$cmd\n"; 30 | system("$cmd"); 31 | } 32 | -------------------------------------------------------------------------------- /scripts/hyperparameters.language-model.full.yaml: -------------------------------------------------------------------------------- 1 | #: Not actually used directly, just for convenience 2 | # 3 | #locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"} 4 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"} 5 | #locations: {"DATA_DIR": "../data-sample-bilingual/"} 6 | #locations: {"DATA_DIR": "../data/full-bilingual/"} 7 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas/"} 8 | locations: {"DATA_DIR": "../data/maxlength40-lemmas-filtered/"} 9 | #locations: {"DATA_DIR": "../data/small-sample.maxlength40-lemmas-filtered/"} 10 | 11 | # Are we running this automatically from a console, or is this job part of some larger batch? 12 | # If True, we log output to stdout, not to a log file on disk. 
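A minimal sketch of what that flag controls, assuming a run directory chosen elsewhere (e.g. by rundir.py); the log file name below is made up:

    import logging, os, sys

    # Sketch only: send log records to stdout when console is True, otherwise
    # to a (hypothetical) log file inside the run directory.
    def setup_logging(console, rundir):
        if console:
            logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        else:
            logging.basicConfig(filename=os.path.join(rundir, "log"),
                                level=logging.INFO)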
13 | #console: False 14 | console: True 15 | 16 | # A list of the validation examples 17 | # Currently unused 18 | VALIDATION_INPUT: /u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation/validation.txt 19 | 20 | PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION: 0.01 21 | 22 | ## 32-bit for the GPU 23 | ##import theano.config as config 24 | ##floatX: config.floatX 25 | #floatX: 'float32' 26 | 27 | # Should we induce an embedding for OOV words? 28 | INCLUDE_UNKNOWN_WORD: True 29 | 30 | RUN_NAME: "rcv1.case-intact" 31 | MONOLINGUAL_VOCABULARY_SIZE: 268810 32 | 33 | # Make all example weights are uniform 34 | # Note: When a word has more target words, we might want to give it higher weight. (Or, when we use ngram noise.) 35 | UNIFORM EXAMPLE WEIGHTS: True 36 | 37 | # Bilingual corpora language pairs 38 | #W2W BICORPORA: [["en", "fr"], ["en", "nl"]] 39 | #W2W BICORPORA: [["en", "fr"]] 40 | W2W BICORPORA: [["en", "nl"], ["en", "de"], ["en", "it"], ["en", "fr"], ["en", "es"]] 41 | # Monolingual corpora language singletons 42 | W2W MONOCORPORA: [] 43 | #W2W MONOCORPORA: ["en"] 44 | 45 | # Only train on examples in which the focus word lemmatizes to one of these words. 46 | # If an empty list, we use ALL examples, and don't do any filtering. 47 | W2W FOCUS LEMMAS: [ 48 | "bank", "movement", "occupation", "passage", "plant", 49 | "coach", "education", "execution", "figure", "job", "post", "pot", "range", "rest", "ring", "mood", "soil", "strain", "match", "scene", "test", "mission", "letter", "paper", "side" 50 | ] 51 | 52 | # Delexicalize all words that occur fewer than this number of times. 53 | W2W MINIMUM WORD FREQUENCY: 3 54 | #W2W MINIMUM WORD FREQUENCY: 10 55 | 56 | # Skip translations to unknown word. 57 | # This makes coding easy, because we treat the Unknown word as having its own language. 58 | # However, this means that we always will try to translate words to 59 | # something in the target vocab, whereas in practice we might want to 60 | # translate to *UNKNOWN* and in the target language just keep the word 61 | # form as is. 62 | W2W SKIP TRANSLATIONS TO UNKNOWN WORD: True 63 | 64 | # Use these embeddings to initialize the model 65 | W2W INITIAL EMBEDDINGS: /u/turian/data/share/embeddings/model-2520000000.LEARNING_RATE=1e-09.EMBEDDING_LEARNING_RATE=1e-06.HIDDEN_SIZE=800.txt.gz 66 | # Language of the initial embeddings 67 | W2W INITIAL EMBEDDINGS LANGUAGE: en 68 | # Were the initial embeddings induced case-sensitive, but now we want to lowercase them? 69 | W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION: True 70 | 71 | # Use the log-bilinear model or not? 72 | # If True, we predict the Mnih log-bilinear model 73 | # If False, we predict the C&W language model. 74 | #LOG BILINEAR MODEL: True 75 | LOG BILINEAR MODEL: False 76 | 77 | # Number of examples per minibach 78 | MINIBATCH SIZE: 100 79 | 80 | # Randomly initialize embeddings uniformly in the range [-this value, +this value] 81 | INITIAL_EMBEDDING_RANGE: 0.01 82 | 83 | # l1 penalty appliedto C&W embeddings 84 | CW_EMBEDDING_L1_PENALTY: 0. 85 | 86 | NORMALIZE_EMBEDDINGS: False 87 | #NORMALIZE_EMBEDDINGS: True 88 | #UPDATES_PER_NORMALIZE_EMBEDDINGS: 1000 89 | 90 | # Number of validation examples 91 | #VALIDATION EXAMPLES: 10000 92 | #VALIDATION EXAMPLES: 2500 93 | VALIDATION EXAMPLES: 1000 94 | 95 | # What percent of noise examples should we use for computing the logrank 96 | # during validation? 97 | # This is a speed optimization. 
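Concretely, Model.validate() further down keeps each corrupt vocabulary candidate with this probability and counts how many of the kept candidates score at least as well as the true completion. A toy numpy sketch of that subsampled rank (all scores here are made up):

    import numpy

    # Toy sketch of the subsampled validation rank: keep ~1% of the corrupt
    # candidates and count how many outscore the correct n-gram.
    rng = numpy.random.RandomState(0)
    correct_score = 1.7                                  # made-up model score
    corrupt_scores = rng.normal(0.0, 1.0, size=268810)   # one per vocab word
    keep = rng.uniform(size=corrupt_scores.size) <= 0.01
    rank = 1 + int(numpy.sum(corrupt_scores[keep] >= correct_score))
    print("estimated (subsampled) rank: %d" % rank)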
98 | PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK: 0.01 99 | 100 | NGRAM_FOR_TRAINING_NOISE: 0 101 | 102 | #NGRAMS: {(1, 5000): join(DATA_DIR, "1grams-wikitext-5000.json.gz"), 103 | #(1, 10000): join(DATA_DIR, "1grams-wikitext-10000.json.gz"), 104 | #(1, 20000): join(DATA_DIR, "1grams-wikitext-20000.json.gz")} 105 | 106 | # Number of instances of each ngram to add, for smoothing. 107 | TRAINING_NOISE_SMOOTHING_ADDITION: 0 108 | 109 | # Each embedded word representation has this width 110 | EMBEDDING_SIZE: 50 111 | #EMBEDDING_SIZE: 20 112 | #EMBEDDING_SIZE: 5 113 | 114 | # Predict with a window of five words at a time 115 | WINDOW_SIZE: 11 116 | 117 | HIDDEN_SIZE: 100 118 | #HIDDEN_SIZE: 40 119 | #HIDDEN_SIZE: 10 120 | 121 | # Two hidden layers, or only one? 122 | TWO_HIDDEN_LAYERS: False 123 | 124 | #: Scaling value to control range for weight initialization 125 | #SCALE_INITIAL_WEIGHTS_BY: math.sqrt(3) 126 | SCALE_INITIAL_WEIGHTS_BY: 1 127 | 128 | # Which activation function to use? 129 | #ACTIVATION_FUNCTION="sigmoid" 130 | #ACTIVATION_FUNCTION="tanh" 131 | ACTIVATION_FUNCTION: "softsign" 132 | 133 | LEARNING_RATE: 0.000000011 134 | #LEARNING_RATE: 0.000000000001 135 | 136 | # The learning rate for the embeddings 137 | #EMBEDDING_LEARNING_RATE: 0.00000000034 138 | EMBEDDING_LEARNING_RATE: 0 139 | 140 | ## number of (higher-order) quadratic filters for James's neuron 141 | #NUMBER_OF_QUADRATIC_FILTERS=0 142 | ## We use this scaling factor for initial weights of quadratic filters, 143 | ## instead of SCALE_INITIAL_WEIGHTS_BY 144 | ## @note: Try between 10 and 0.01 145 | #SCALE_QUADRATIC_INITIAL_WEIGHTS_BY: 1 146 | 147 | # Validate after this many examples 148 | #VALIDATE_EVERY: 10000000 149 | VALIDATE_EVERY: 100000 150 | #VALIDATE_EVERY: 10000 151 | -------------------------------------------------------------------------------- /scripts/hyperparameters.language-model.sample.yaml: -------------------------------------------------------------------------------- 1 | #: Not actually used directly, just for convenience 2 | # 3 | #locations: {"DATA_DIR": "/home/fringant2/lisa/turian/dev/python/language-model/data/"} 4 | #locations: {"DATA_DIR": "/home/turianjo/dev/python/language-model/data/"} 5 | #locations: {"DATA_DIR": "../data-sample-bilingual/"} 6 | #locations: {"DATA_DIR": "../data/full-bilingual/"} 7 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas/"} 8 | #locations: {"DATA_DIR": "../data/maxlength40-lemmas-filtered/"} 9 | locations: {"DATA_DIR": "../data/small-sample.maxlength40-lemmas-filtered/"} 10 | 11 | # Are we running this automatically from a console, or is this job part of some larger batch? 12 | # If True, we log output to stdout, not to a log file on disk. 13 | console: False 14 | #console: True 15 | 16 | # A list of the validation examples 17 | # Currently unused 18 | VALIDATION_INPUT: /u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation/validation.txt 19 | 20 | PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION: 0.01 21 | 22 | ## 32-bit for the GPU 23 | ##import theano.config as config 24 | ##floatX: config.floatX 25 | #floatX: 'float32' 26 | 27 | # Should we induce an embedding for OOV words? 28 | INCLUDE_UNKNOWN_WORD: True 29 | 30 | RUN_NAME: "rcv1.case-intact" 31 | MONOLINGUAL_VOCABULARY_SIZE: 268810 32 | 33 | # Make all example weights are uniform 34 | # Note: When a word has more target words, we might want to give it higher weight. (Or, when we use ngram noise.) 
35 | UNIFORM EXAMPLE WEIGHTS: True 36 | 37 | # Bilingual corpora language pairs 38 | #W2W BICORPORA: [["en", "fr"], ["en", "nl"]] 39 | #W2W BICORPORA: [["en", "fr"]] 40 | W2W BICORPORA: [["en", "nl"], ["en", "de"], ["en", "it"], ["en", "fr"], ["en", "es"]] 41 | # Monolingual corpora language singletons 42 | W2W MONOCORPORA: [] 43 | #W2W MONOCORPORA: ["en"] 44 | 45 | # Only train on examples in which the focus word lemmatizes to one of these words. 46 | # If an empty list, we use ALL examples, and don't do any filtering. 47 | W2W FOCUS LEMMAS: [ 48 | "bank", "movement", "occupation", "passage", "plant", 49 | "coach", "education", "execution", "figure", "job", "post", "pot", "range", "rest", "ring", "mood", "soil", "strain", "match", "scene", "test", "mission", "letter", "paper", "side" 50 | ] 51 | 52 | # Delexicalize all words that occur fewer than this number of times. 53 | W2W MINIMUM WORD FREQUENCY: 3 54 | #W2W MINIMUM WORD FREQUENCY: 10 55 | 56 | # Skip translations to unknown word. 57 | # This makes coding easy, because we treat the Unknown word as having its own language. 58 | # However, this means that we always will try to translate words to 59 | # something in the target vocab, whereas in practice we might want to 60 | # translate to *UNKNOWN* and in the target language just keep the word 61 | # form as is. 62 | W2W SKIP TRANSLATIONS TO UNKNOWN WORD: True 63 | 64 | # Use these embeddings to initialize the model 65 | W2W INITIAL EMBEDDINGS: /u/turian/data/share/embeddings/model-2520000000.LEARNING_RATE=1e-09.EMBEDDING_LEARNING_RATE=1e-06.HIDDEN_SIZE=800.txt.gz 66 | # Language of the initial embeddings 67 | W2W INITIAL EMBEDDINGS LANGUAGE: en 68 | # Were the initial embeddings induced case-sensitive, but now we want to lowercase them? 69 | W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION: True 70 | 71 | # Use the log-bilinear model or not? 72 | # If True, we predict the Mnih log-bilinear model 73 | # If False, we predict the C&W language model. 74 | #LOG BILINEAR MODEL: True 75 | LOG BILINEAR MODEL: False 76 | 77 | # Number of examples per minibach 78 | MINIBATCH SIZE: 100 79 | 80 | # Randomly initialize embeddings uniformly in the range [-this value, +this value] 81 | INITIAL_EMBEDDING_RANGE: 0.01 82 | 83 | # l1 penalty appliedto C&W embeddings 84 | CW_EMBEDDING_L1_PENALTY: 0. 85 | 86 | NORMALIZE_EMBEDDINGS: False 87 | #NORMALIZE_EMBEDDINGS: True 88 | #UPDATES_PER_NORMALIZE_EMBEDDINGS: 1000 89 | 90 | # Number of validation examples 91 | #VALIDATION EXAMPLES: 10000 92 | #VALIDATION EXAMPLES: 2500 93 | VALIDATION EXAMPLES: 1000 94 | 95 | # What percent of noise examples should we use for computing the logrank 96 | # during validation? 97 | # This is a speed optimization. 98 | PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK: 0.01 99 | 100 | NGRAM_FOR_TRAINING_NOISE: 0 101 | 102 | #NGRAMS: {(1, 5000): join(DATA_DIR, "1grams-wikitext-5000.json.gz"), 103 | #(1, 10000): join(DATA_DIR, "1grams-wikitext-10000.json.gz"), 104 | #(1, 20000): join(DATA_DIR, "1grams-wikitext-20000.json.gz")} 105 | 106 | # Number of instances of each ngram to add, for smoothing. 107 | TRAINING_NOISE_SMOOTHING_ADDITION: 0 108 | 109 | # Each embedded word representation has this width 110 | EMBEDDING_SIZE: 50 111 | #EMBEDDING_SIZE: 20 112 | #EMBEDDING_SIZE: 5 113 | 114 | # Predict with a window of five words at a time 115 | WINDOW_SIZE: 5 116 | 117 | HIDDEN_SIZE: 100 118 | #HIDDEN_SIZE: 40 119 | #HIDDEN_SIZE: 10 120 | 121 | # Two hidden layers, or only one? 
122 | TWO_HIDDEN_LAYERS: False 123 | 124 | #: Scaling value to control range for weight initialization 125 | #SCALE_INITIAL_WEIGHTS_BY: math.sqrt(3) 126 | SCALE_INITIAL_WEIGHTS_BY: 1 127 | 128 | # Which activation function to use? 129 | #ACTIVATION_FUNCTION="sigmoid" 130 | #ACTIVATION_FUNCTION="tanh" 131 | ACTIVATION_FUNCTION: "softsign" 132 | 133 | LEARNING_RATE: 0.000000011 134 | #LEARNING_RATE: 0.000000000001 135 | 136 | # The learning rate for the embeddings 137 | #EMBEDDING_LEARNING_RATE: 0.00000000034 138 | EMBEDDING_LEARNING_RATE: 0 139 | 140 | ## number of (higher-order) quadratic filters for James's neuron 141 | #NUMBER_OF_QUADRATIC_FILTERS=0 142 | ## We use this scaling factor for initial weights of quadratic filters, 143 | ## instead of SCALE_INITIAL_WEIGHTS_BY 144 | ## @note: Try between 10 and 0.01 145 | #SCALE_QUADRATIC_INITIAL_WEIGHTS_BY: 1 146 | 147 | # Validate after this many examples 148 | #VALIDATE_EVERY: 10000000 149 | VALIDATE_EVERY: 10000 150 | -------------------------------------------------------------------------------- /scripts/hyperparameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module to update hyperparameters automatically. 3 | """ 4 | 5 | from os.path import join 6 | import common.hyperparameters 7 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 8 | HYPERPARAMETERS["DATA_DIR"] = HYPERPARAMETERS["locations"]["DATA_DIR"] 9 | RUN_NAME = HYPERPARAMETERS["RUN_NAME"] 10 | MONOLINGUAL_VOCABULARY_SIZE = HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 11 | INCLUDE_UNKNOWN_WORD = HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] 12 | HYPERPARAMETERS["TRAIN_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.train.txt.gz" % RUN_NAME) 13 | HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.validation.txt.gz" % RUN_NAME) 14 | HYPERPARAMETERS["VALIDATION_SENTENCES"] = join(HYPERPARAMETERS["DATA_DIR"], "%s.validation-%d.txt.gz" % (RUN_NAME, HYPERPARAMETERS["VALIDATION EXAMPLES"])) 15 | HYPERPARAMETERS["MONOLINGUAL_VOCABULARY"] = join(HYPERPARAMETERS["DATA_DIR"], "vocabulary-%s-%d.txt.gz" % (RUN_NAME, MONOLINGUAL_VOCABULARY_SIZE)) 16 | HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] = join(HYPERPARAMETERS["DATA_DIR"], "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, MONOLINGUAL_VOCABULARY_SIZE, INCLUDE_UNKNOWN_WORD)) 17 | HYPERPARAMETERS["INITIAL_EMBEDDINGS"] = join(HYPERPARAMETERS["DATA_DIR"], "initial-embeddings.minfreq=%d.include_unknown=%s.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 18 | -------------------------------------------------------------------------------- /scripts/lemmatizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Lemmatize English using the NLTK WordNetLemmatizer. 3 | """ 4 | 5 | from nltk.stem.wordnet import WordNetLemmatizer 6 | 7 | _lmtzr = None 8 | def lmtzr(): 9 | global _lmtzr 10 | if _lmtzr is None: _lmtzr = WordNetLemmatizer() 11 | return _lmtzr 12 | 13 | def lemmatize(language, wordform): 14 | assert language == "en" 15 | return lmtzr().lemmatize(wordform) 16 | -------------------------------------------------------------------------------- /scripts/miscglobals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous globals. 3 | 4 | @todo: Most of these should be moved somewhere more specific. 
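For concreteness, hyperparameters.py above just does string arithmetic on the YAML values; with the sample settings (DATA_DIR ../data/small-sample.maxlength40-lemmas-filtered/, RUN_NAME rcv1.case-intact, MONOLINGUAL_VOCABULARY_SIZE 268810, INCLUDE_UNKNOWN_WORD True), the derived file names come out as below. This is only an illustration; the real values depend on whatever YAML common.hyperparameters.read() loads.

    from os.path import join

    # Worked example of the path derivation in scripts/hyperparameters.py.
    DATA_DIR = "../data/small-sample.maxlength40-lemmas-filtered/"
    RUN_NAME = "rcv1.case-intact"
    VOCAB_SIZE = 268810

    print(join(DATA_DIR, "%s.train.txt.gz" % RUN_NAME))
    # -> .../rcv1.case-intact.train.txt.gz
    print(join(DATA_DIR, "vocabulary-%s-%d.txt.gz" % (RUN_NAME, VOCAB_SIZE)))
    # -> .../vocabulary-rcv1.case-intact-268810.txt.gz
    print(join(DATA_DIR, "idmap.%s-%d.include_unknown=%s.pkl.gz" % (RUN_NAME, VOCAB_SIZE, True)))
    # -> .../idmap.rcv1.case-intact-268810.include_unknown=True.pkl.gz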
5 | """ 6 | 7 | #: RNG seed 8 | RANDOMSEED = 0 9 | 10 | #LINKER = 'c|py' 11 | ##LINKER = 'py' 12 | #OPTIMIZER = 'merge' # 'math' optimizer is broken with 'c|py' linker 13 | -------------------------------------------------------------------------------- /scripts/model/__init__.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | -------------------------------------------------------------------------------- /scripts/model/graphcw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Theano graph of Collobert & Weston language model. 3 | """ 4 | 5 | import theano 6 | #import theano.sandbox.cuda 7 | #theano.sandbox.cuda.use() 8 | 9 | from theano.compile import pfunc, shared 10 | from theano import config 11 | floatX = config.floatX 12 | 13 | 14 | from theano import tensor as t 15 | from theano import scalar as s 16 | 17 | from theano.tensor.basic import horizontal_stack 18 | from theano.tensor import dot 19 | 20 | from theano import gradient 21 | 22 | import theano.compile 23 | #from miscglobals import LINKER, OPTIMIZER 24 | #mode = theano.compile.Mode(LINKER, OPTIMIZER) 25 | #import theano.compile.debugmode 26 | #COMPILE_MODE = theano.compile.debugmode.DebugMode(optimizer='fast_run', check_isfinite=False) 27 | #import theano.compile.profilemode 28 | #COMPILE_MODE = theano.compile.profilemode.ProfileMode() 29 | COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run') 30 | #COMPILE_MODE = theano.compile.Mode('py', 'fast_compile') 31 | 32 | import numpy 33 | 34 | #hidden_weights = t.matrix() 35 | #hidden_biases = t.matrix() 36 | 37 | #if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"] == True: 38 | # hidden2_weights = t.matrix() 39 | # hidden2_biases = t.matrix() 40 | 41 | #output_weights = t.matrix() 42 | #output_biases = t.matrix() 43 | 44 | # TODO: Include gradient steps in actual function, don't do them manually 45 | 46 | def activation_function(r): 47 | from hyperparameters import HYPERPARAMETERS 48 | if HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "sigmoid": 49 | return sigmoid(r) 50 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "tanh": 51 | return t.tanh(r) 52 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "softsign": 53 | from theano.sandbox.softsign import softsign 54 | return softsign(r) 55 | else: 56 | assert 0 57 | 58 | def stack(x): 59 | """ 60 | Horizontally stack a list of representations, and then compress them to 61 | one representation. 62 | """ 63 | assert len(x) >= 2 64 | return horizontal_stack(*x) 65 | 66 | def score(x): 67 | from hyperparameters import HYPERPARAMETERS 68 | prehidden = dot(x, hidden_weights) + hidden_biases 69 | hidden = activation_function(prehidden) 70 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 71 | prehidden2 = dot(hidden, hidden2_weights) + hidden2_biases 72 | hidden2 = activation_function(prehidden2) 73 | score = dot(hidden2, output_weights) + output_biases 74 | else: 75 | score = dot(hidden, output_weights) + output_biases 76 | return score, prehidden 77 | 78 | cached_functions = {} 79 | def functions(sequence_length): 80 | """ 81 | Return two functions 82 | * The first function does prediction. 83 | * The second function does learning. 84 | """ 85 | global cached_functions 86 | cachekey = (sequence_length) 87 | if len(cached_functions.keys()) > 1: 88 | # This is problematic because we use global variables for the model parameters. 89 | # Hence, we might be unsafe, if we are using the wrong model parameters globally. 
90 | assert 0 91 | if cachekey not in cached_functions: 92 | print "Need to construct graph for sequence_length=%d..." % (sequence_length) 93 | # Create the sequence_length inputs. 94 | # Each is a t.matrix(), initial word embeddings (provided by 95 | # Jason + Ronan) to be transformed into an initial representation. 96 | # We could use a vector, but instead we use a matrix with one row. 97 | correct_inputs = [t.matrix() for i in range(sequence_length)] 98 | noise_inputs = [t.matrix() for i in range(sequence_length)] 99 | learning_rate = t.scalar() 100 | 101 | stacked_correct_inputs = stack(correct_inputs) 102 | stacked_noise_inputs = stack(noise_inputs) 103 | 104 | correct_score, correct_prehidden = score(stacked_correct_inputs) 105 | noise_score, noise_prehidden = score(stacked_noise_inputs) 106 | unpenalized_loss = t.clip(1 - correct_score + noise_score, 0, 1e999) 107 | 108 | from hyperparameters import HYPERPARAMETERS 109 | if HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"] != 0: 110 | l1penalty = t.sum(t.abs_(stacked_correct_inputs) + t.abs_(stacked_noise_inputs), axis=1).T * HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"] 111 | else: 112 | l1penalty = t.as_tensor_variable(numpy.asarray(0, dtype=floatX)) 113 | # l1penalty = t.as_tensor_variable(numpy.asarray((0,), dtype=floatX)) 114 | loss = (unpenalized_loss.T + l1penalty).T 115 | 116 | # import sys 117 | # print >> sys.stderr, "FIXME: MODEL_LEARNING_RATE = fixed at 0.001" 118 | # MODEL_LEARNING_RATE = t.as_tensor_variable(numpy.asarray(0.001, dtype=floatX)) 119 | 120 | total_loss = t.sum(loss) 121 | 122 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 123 | (dhidden_weights, dhidden_biases, dhidden2_weights, dhidden2_biases, doutput_weights, doutput_biases) = t.grad(total_loss, [hidden_weights, hidden_biases, hidden2_weights, hidden2_biases, output_weights, output_biases]) 124 | else: 125 | (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases) = t.grad(total_loss, [hidden_weights, hidden_biases, output_weights, output_biases]) 126 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 127 | dcorrect_inputs = t.grad(total_loss, correct_inputs) 128 | dnoise_inputs = t.grad(total_loss, noise_inputs) 129 | #print "REMOVEME", len(dcorrect_inputs) 130 | predict_inputs = correct_inputs 131 | train_inputs = correct_inputs + noise_inputs + [learning_rate] 132 | verbose_predict_inputs = predict_inputs 133 | predict_outputs = [correct_score] 134 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 135 | train_outputs = dcorrect_inputs + dnoise_inputs + [loss, unpenalized_loss, l1penalty, correct_score, noise_score] 136 | else: 137 | train_outputs = [loss, unpenalized_loss, l1penalty, correct_score, noise_score] 138 | verbose_predict_outputs = [correct_score, correct_prehidden] 139 | 140 | import theano.gof.graph 141 | 142 | nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs)) 143 | print "About to compile predict function over %d ops [nodes]..." % nnodes 144 | predict_function = pfunc(predict_inputs, predict_outputs, mode=COMPILE_MODE) 145 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 146 | 147 | nnodes = len(theano.gof.graph.ops(verbose_predict_inputs, verbose_predict_outputs)) 148 | print "About to compile predict function over %d ops [nodes]..." 
% nnodes 149 | verbose_predict_function = pfunc(verbose_predict_inputs, verbose_predict_outputs, mode=COMPILE_MODE) 150 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 151 | 152 | nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs)) 153 | print "About to compile train function over %d ops [nodes]..." % nnodes 154 | if HYPERPARAMETERS["TWO_HIDDEN_LAYERS"] == True: 155 | train_function = pfunc(train_inputs, train_outputs, mode=COMPILE_MODE, updates=[(p, p-learning_rate*gp) for p, gp in zip((hidden_weights, hidden_biases, hidden2_weights, hidden2_biases, output_weights, output_biases), (dhidden_weights, dhidden_biases, dhidden2_weights, dhidden2_biases, doutput_weights, doutput_biases))]) 156 | else: 157 | train_function = pfunc(train_inputs, train_outputs, mode=COMPILE_MODE, updates=[(p, p-learning_rate*gp) for p, gp in zip((hidden_weights, hidden_biases, output_weights, output_biases), (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases))]) 158 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 159 | 160 | cached_functions[cachekey] = (predict_function, train_function, verbose_predict_function) 161 | return cached_functions[cachekey] 162 | 163 | #def apply_function(fn, sequence, target_output, parameters): 164 | # assert len(sequence) == parameters.hidden_width 165 | # inputs = [numpy.asarray([token]) for token in sequence] 166 | # if target_output != None: 167 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 168 | ## return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 169 | ## else: 170 | # return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 171 | # else: 172 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 173 | ## return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 174 | ## else: 175 | # return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 176 | # 177 | def predict(correct_sequence): 178 | fn = functions(sequence_length=len(correct_sequence))[0] 179 | # print "REMOVEME", correct_sequence 180 | r = fn(*(correct_sequence)) 181 | assert len(r) == 1 182 | r = r[0] 183 | assert r.shape == (1, 1) 184 | return r[0,0] 185 | def verbose_predict(correct_sequence): 186 | fn = functions(sequence_length=len(correct_sequence))[2] 187 | r = fn(*(correct_sequence)) 188 | assert len(r) == 2 189 | (score, prehidden) = r 190 | assert score.shape == (1, 1) 191 | return score[0,0], prehidden 192 | def train(correct_sequence, noise_sequence, learning_rate): 193 | assert len(correct_sequence) == len(noise_sequence) 194 | fn = functions(sequence_length=len(correct_sequence))[1] 195 | r = fn(*(correct_sequence + noise_sequence + [learning_rate])) 196 | from hyperparameters import HYPERPARAMETERS 197 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 198 | dcorrect_inputs = r[:len(correct_sequence)] 199 | r = r[len(correct_sequence):] 200 | dnoise_inputs = r[:len(noise_sequence)] 201 | r = r[len(correct_sequence):] 202 | # print "REMOVEME", len(dcorrect_inputs), len(dnoise_inputs) 203 | (loss, unpenalized_loss, l1penalty, correct_score, 
noise_score) = r 204 | # if loss == 0: 205 | # for di in [dhidden_weights, dhidden_biases, doutput_weights, doutput_biases]: 206 | # assert (di == 0).all() 207 | 208 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 209 | return (dcorrect_inputs, dnoise_inputs, loss, unpenalized_loss, l1penalty, correct_score, noise_score) 210 | else: 211 | return (loss, unpenalized_loss, l1penalty, correct_score, noise_score) 212 | -------------------------------------------------------------------------------- /scripts/model/graphlbl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Theano graph of Mnih log bi-linear model. 3 | """ 4 | 5 | import theano 6 | import theano.sandbox.cuda 7 | theano.sandbox.cuda.use() 8 | 9 | from theano import tensor as t 10 | from theano import scalar as s 11 | 12 | from theano.tensor.basic import horizontal_stack 13 | from theano.tensor import dot 14 | 15 | from theano import gradient 16 | 17 | import theano.compile 18 | #from miscglobals import LINKER, OPTIMIZER 19 | #mode = theano.compile.Mode(LINKER, OPTIMIZER) 20 | COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run') 21 | #COMPILE_MODE = theano.compile.Mode('py', 'fast_compile') 22 | 23 | import numpy 24 | 25 | from common.chopargs import chopargs 26 | 27 | #output_weights = t.xmatrix() 28 | #output_biases = t.xmatrix() 29 | 30 | # TODO: Include gradient steps in actual function, don't do them manually 31 | 32 | def activation_function(r): 33 | from hyperparameters import HYPERPARAMETERS 34 | if HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "sigmoid": 35 | return sigmoid(r) 36 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "tanh": 37 | return t.tanh(r) 38 | elif HYPERPARAMETERS["ACTIVATION_FUNCTION"] == "softsign": 39 | from theano.sandbox.softsign import softsign 40 | return softsign(r) 41 | else: 42 | assert 0 43 | 44 | def stack(x): 45 | """ 46 | Horizontally stack a list of representations, and then compress them to 47 | one representation. 48 | """ 49 | assert len(x) >= 2 50 | return horizontal_stack(*x) 51 | 52 | def score(targetrepr, predictrepr): 53 | # TODO: Is this the right scoring function? 54 | score = dot(targetrepr, predictrepr.T) 55 | return score 56 | 57 | cached_functions = {} 58 | def functions(sequence_length): 59 | """ 60 | Return two functions 61 | * The first function does prediction. 62 | * The second function does learning. 63 | """ 64 | global cached_functions 65 | p = (sequence_length) 66 | if len(cached_functions.keys()) > 1: 67 | # This is problematic because we use global variables for the model parameters. 68 | # Hence, we might be unsafe, if we are using the wrong model parameters globally. 69 | assert 0 70 | if p not in cached_functions: 71 | print "Need to construct graph for sequence_length=%d..." % (sequence_length) 72 | # Create the sequence_length inputs. 73 | # Each is a t.xmatrix(), initial word embeddings (provided by 74 | # Jason + Ronan) to be transformed into an initial representation. 75 | # We could use a vector, but instead we use a matrix with one row. 
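Both graphs train against the same pairwise ranking hinge used in graphcw.py above: loss = max(0, 1 - score(correct window) + score(corrupted window)). A plain numpy sketch of the one-hidden-layer C&W scoring and that margin (shapes and parameter names here are illustrative, not the actual Parameters class):

    import numpy

    # Minimal numpy sketch of the C&W margin loss from graphcw.py: stack the
    # window embeddings, pass them through one softsign hidden layer, score,
    # and penalize when the corrupted window comes within a margin of 1.
    def softsign(x):
        return x / (1.0 + numpy.abs(x))

    def score(window_embeddings, hidden_w, hidden_b, output_w, output_b):
        # window_embeddings: list of 1-D embedding vectors;
        # hidden_w: (window*emb, hidden); output_w: (hidden,)
        x = numpy.concatenate(window_embeddings)        # stack() in the graph
        hidden = softsign(numpy.dot(x, hidden_w) + hidden_b)
        return float(numpy.dot(hidden, output_w) + output_b)

    def hinge_loss(correct_score, noise_score):
        return max(0.0, 1.0 - correct_score + noise_score)

    # e.g. hinge_loss(1.2, 0.9) == 0.7, while hinge_loss(2.5, 0.3) == 0.0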
76 | sequence = [t.xmatrix() for i in range(sequence_length)] 77 | correct_repr = t.xmatrix() 78 | noise_repr = t.xmatrix() 79 | # correct_scorebias = t.xscalar() 80 | # noise_scorebias = t.xscalar() 81 | correct_scorebias = t.xvector() 82 | noise_scorebias = t.xvector() 83 | 84 | stackedsequence = stack(sequence) 85 | predictrepr = dot(stackedsequence, output_weights) + output_biases 86 | 87 | correct_score = score(correct_repr, predictrepr) + correct_scorebias 88 | noise_score = score(noise_repr, predictrepr) + noise_scorebias 89 | loss = t.clip(1 - correct_score + noise_score, 0, 1e999) 90 | 91 | (doutput_weights, doutput_biases) = t.grad(loss, [output_weights, output_biases]) 92 | dsequence = t.grad(loss, sequence) 93 | (dcorrect_repr, dnoise_repr) = t.grad(loss, [correct_repr, noise_repr]) 94 | (dcorrect_scorebias, dnoise_scorebias) = t.grad(loss, [correct_scorebias, noise_scorebias]) 95 | #print "REMOVEME", len(dcorrect_inputs) 96 | predict_inputs = sequence + [correct_repr, correct_scorebias, output_weights, output_biases] 97 | train_inputs = sequence + [correct_repr, noise_repr, correct_scorebias, noise_scorebias, output_weights, output_biases] 98 | predict_outputs = [predictrepr, correct_score] 99 | train_outputs = [loss, predictrepr, correct_score, noise_score] + dsequence + [dcorrect_repr, dnoise_repr, doutput_weights, doutput_biases, dcorrect_scorebias, dnoise_scorebias] 100 | # train_outputs = [loss, correct_repr, correct_score, noise_repr, noise_score] 101 | 102 | import theano.gof.graph 103 | 104 | nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs)) 105 | print "About to compile predict function over %d ops [nodes]..." % nnodes 106 | predict_function = theano.function(predict_inputs, predict_outputs, mode=COMPILE_MODE) 107 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 108 | 109 | nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs)) 110 | print "About to compile train function over %d ops [nodes]..." 
% nnodes 111 | train_function = theano.function(train_inputs, train_outputs, mode=COMPILE_MODE) 112 | print "...done constructing graph for sequence_length=%d" % (sequence_length) 113 | 114 | cached_functions[p] = (predict_function, train_function) 115 | return cached_functions[p] 116 | 117 | #def apply_function(fn, sequence, target_output, parameters): 118 | # assert len(sequence) == parameters.hidden_width 119 | # inputs = [numpy.asarray([token]) for token in sequence] 120 | # if target_output != None: 121 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 122 | ## return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 123 | ## else: 124 | # return fn(*(inputs + [numpy.asarray([target_output]), parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 125 | # else: 126 | ## if HYPERPARAMETERS["USE_SECOND_HIDDEN_LAYER"]: 127 | ## return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.hidden2_weights, parameters.hidden2_biases, parameters.output_weights, parameters.output_biases])) 128 | ## else: 129 | # return fn(*(inputs + [parameters.hidden_weights, parameters.hidden_biases, parameters.output_weights, parameters.output_biases])) 130 | # 131 | 132 | def predict(sequence, targetrepr, target_scorebias): 133 | fn = functions(sequence_length=len(sequence))[0] 134 | (predictrepr, score) = fn(*(sequence + [targetrepr, target_scorebias])) 135 | return predictrepr, score 136 | 137 | def train(sequence, correct_repr, noise_repr, correct_scorebias, noise_scorebias, learning_rate): 138 | fn = functions(sequence_length=len(sequence))[1] 139 | # print "REMOVEME", correct_scorebias, noise_scorebias 140 | # print "REMOVEME", correct_scorebias[0], noise_scorebias[0] 141 | r = fn(*(sequence + [correct_repr, noise_repr, correct_scorebias, noise_scorebias])) 142 | 143 | (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, doutput_weights, doutput_biases, dcorrect_scorebias, dnoise_scorebias) = chopargs(r, (0,0,0,0,len(sequence),0,0,0,0,0,0)) 144 | if loss == 0: 145 | for di in [doutput_weights, doutput_biases]: 146 | # This tends to trigger if training diverges (NaN) 147 | assert (di == 0).all() 148 | 149 | parameters.output_weights -= 1.0 * learning_rate * doutput_weights 150 | parameters.output_biases -= 1.0 * learning_rate * doutput_biases 151 | 152 | # You also need to update score_biases here 153 | assert 0 154 | 155 | dsequence = list(dsequence) 156 | return (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, dcorrect_scorebias, dnoise_scorebias) 157 | -------------------------------------------------------------------------------- /scripts/model/model.py: -------------------------------------------------------------------------------- 1 | from parameters import Parameters 2 | 3 | from hyperparameters import HYPERPARAMETERS 4 | LBL = HYPERPARAMETERS["LOG BILINEAR MODEL"] 5 | 6 | if LBL: 7 | import graphlbl as graph 8 | else: 9 | import graphcw as graph 10 | 11 | import sys, pickle 12 | import math 13 | import logging 14 | 15 | from common.file import myopen 16 | from common.movingaverage import MovingAverage 17 | 18 | from vocabulary import * 19 | 20 | class Model: 21 | """ 22 | A Model can: 23 | 24 | @type parameters: L{Parameters} 25 | @todo: Document 26 | """ 27 | 28 | import 
hyperparameters 29 | import miscglobals 30 | import vocabulary 31 | def __init__(self, modelname="", window_size=HYPERPARAMETERS["WINDOW_SIZE"], vocab_size=vocabulary.wordmap().len, embedding_size=HYPERPARAMETERS["EMBEDDING_SIZE"], hidden_size=HYPERPARAMETERS["HIDDEN_SIZE"], seed=miscglobals.RANDOMSEED, initial_embeddings=None, two_hidden_layers=HYPERPARAMETERS["TWO_HIDDEN_LAYERS"]): 32 | self.modelname = modelname 33 | self.parameters = Parameters(window_size, vocab_size, embedding_size, hidden_size, seed, initial_embeddings, two_hidden_layers) 34 | if LBL: 35 | graph.output_weights = self.parameters.output_weights 36 | graph.output_biases = self.parameters.output_biases 37 | graph.score_biases = self.parameters.score_biases 38 | else: 39 | graph.hidden_weights = self.parameters.hidden_weights 40 | graph.hidden_biases = self.parameters.hidden_biases 41 | if self.parameters.two_hidden_layers: 42 | graph.hidden2_weights = self.parameters.hidden2_weights 43 | graph.hidden2_biases = self.parameters.hidden2_biases 44 | graph.output_weights = self.parameters.output_weights 45 | graph.output_biases = self.parameters.output_biases 46 | 47 | # (self.graph_train, self.graph_predict, self.graph_verbose_predict) = graph.functions(self.parameters) 48 | import sets 49 | self.train_loss = MovingAverage() 50 | self.train_err = MovingAverage() 51 | self.train_lossnonzero = MovingAverage() 52 | self.train_squashloss = MovingAverage() 53 | self.train_unpenalized_loss = MovingAverage() 54 | self.train_l1penalty = MovingAverage() 55 | self.train_unpenalized_lossnonzero = MovingAverage() 56 | self.train_correct_score = MovingAverage() 57 | self.train_noise_score = MovingAverage() 58 | self.train_cnt = 0 59 | 60 | def __getstate__(self): 61 | return (self.modelname, self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) 62 | 63 | def __setstate__(self, state): 64 | (self.modelname, self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) = state 65 | if LBL: 66 | graph.output_weights = self.parameters.output_weights 67 | graph.output_biases = self.parameters.output_biases 68 | graph.score_biases = self.parameters.score_biases 69 | else: 70 | graph.hidden_weights = self.parameters.hidden_weights 71 | graph.hidden_biases = self.parameters.hidden_biases 72 | if self.parameters.two_hidden_layers: 73 | graph.hidden2_weights = self.parameters.hidden2_weights 74 | graph.hidden2_biases = self.parameters.hidden2_biases 75 | graph.output_weights = self.parameters.output_weights 76 | graph.output_biases = self.parameters.output_biases 77 | 78 | # def load(self, filename): 79 | # sys.stderr.write("Loading model from: %s\n" % filename) 80 | # f = myopen(filename, "rb") 81 | # (self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt) = pickle.load(f) 82 | # if LBL: 83 | # graph.output_weights = self.parameters.output_weights 84 | # graph.output_biases = self.parameters.output_biases 85 | # graph.score_biases = self.parameters.score_biases 86 | # else: 87 
| # graph.hidden_weights = self.parameters.hidden_weights 88 | # graph.hidden_biases = self.parameters.hidden_biases 89 | # graph.output_weights = self.parameters.output_weights 90 | # graph.output_biases = self.parameters.output_biases 91 | # 92 | # def save(self, filename): 93 | # sys.stderr.write("Saving model to: %s\n" % filename) 94 | # f = myopen(filename, "wb") 95 | # pickle.dump((self.parameters, self.train_loss, self.train_err, self.train_lossnonzero, self.train_squashloss, self.train_unpenalized_loss, self.train_l1penalty, self.train_unpenalized_lossnonzero, self.train_correct_score, self.train_noise_score, self.train_cnt), f) 96 | 97 | def embed(self, sequence): 98 | """ 99 | Embed a sequence of vocabulary IDs 100 | """ 101 | seq = [self.parameters.embeddings[s] for s in sequence] 102 | import numpy 103 | return [numpy.resize(s, (1, s.size)) for s in seq] 104 | # return [self.parameters.embeddings[s] for s in sequence] 105 | 106 | def embeds(self, sequences): 107 | """ 108 | Embed sequences of vocabulary IDs. 109 | If we are given a list of MINIBATCH lists of SEQLEN items, return a list of SEQLEN matrices of shape (MINIBATCH, EMBSIZE) 110 | """ 111 | embs = [] 112 | for sequence in sequences: 113 | embs.append(self.embed(sequence)) 114 | 115 | for emb in embs: assert len(emb) == len(embs[0]) 116 | 117 | new_embs = [] 118 | for i in range(len(embs[0])): 119 | colembs = [embs[j][i] for j in range(len(embs))] 120 | import numpy 121 | new_embs.append(numpy.vstack(colembs)) 122 | assert new_embs[-1].shape == (len(sequences), self.parameters.embedding_size) 123 | assert len(new_embs) == len(sequences[0]) 124 | return new_embs 125 | 126 | def train(self, correct_sequences, noise_sequences, weights): 127 | from hyperparameters import HYPERPARAMETERS 128 | learning_rate = HYPERPARAMETERS["LEARNING_RATE"] 129 | 130 | # All weights must be the same, because of how we use a scalar learning rate 131 | assert HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"] 132 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 133 | for w in weights: assert w == weights[0] 134 | 135 | if LBL: 136 | # REWRITE FOR MINIBATCH 137 | assert 0 138 | 139 | # noise_repr = noise_sequence[-1] 140 | # correct_repr = correct_sequence[-1] 141 | noise_repr = noise_sequence[-1:] 142 | correct_repr = correct_sequence[-1:] 143 | assert noise_repr != correct_repr 144 | assert noise_sequence[:-1] == correct_sequence[:-1] 145 | sequence = correct_sequence[:-1] 146 | # r = graph.train(self.embed(sequence), self.embed([correct_repr])[0], self.embed([noise_repr])[0], self.parameters.score_biases[correct_repr], self.parameters.score_biases[noise_repr]) 147 | r = graph.train(self.embed(sequence), self.embed(correct_repr)[0], self.embed(noise_repr)[0], self.parameters.score_biases[correct_repr], self.parameters.score_biases[noise_repr], learning_rate * weight) 148 | assert len(noise_repr) == 1 149 | assert len(correct_repr) == 1 150 | noise_repr = noise_repr[0] 151 | correct_repr = correct_repr[0] 152 | (loss, predictrepr, correct_score, noise_score, dsequence, dcorrect_repr, dnoise_repr, dcorrect_scorebias, dnoise_scorebias) = r 153 | # print 154 | # print "loss = ", loss 155 | # print "predictrepr = ", predictrepr 156 | # print "correct_repr = ", correct_repr, self.embed(correct_repr)[0] 157 | # print "noise_repr = ", noise_repr, self.embed(noise_repr)[0] 158 | # print "correct_score = ", correct_score 159 | # print "noise_score = ", noise_score 160 | else: 161 | r = graph.train(self.embeds(correct_sequences), self.embeds(noise_sequences), 
learning_rate * weights[0]) 162 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 163 | (dcorrect_inputss, dnoise_inputss, losss, unpenalized_losss, l1penaltys, correct_scores, noise_scores) = r 164 | else: 165 | (losss, unpenalized_losss, l1penaltys, correct_scores, noise_scores) = r 166 | # print [d.shape for d in dcorrect_inputss] 167 | # print [d.shape for d in dnoise_inputss] 168 | # print "losss", losss.shape, losss 169 | # print "unpenalized_losss", unpenalized_losss.shape, unpenalized_losss 170 | # print "l1penaltys", l1penaltys.shape, l1penaltys 171 | # print "correct_scores", correct_scores.shape, correct_scores 172 | # print "noise_scores", noise_scores.shape, noise_scores 173 | 174 | import sets 175 | to_normalize = sets.Set() 176 | for ecnt in range(len(correct_sequences)): 177 | (loss, unpenalized_loss, correct_score, noise_score) = \ 178 | (losss[ecnt], unpenalized_losss[ecnt], correct_scores[ecnt], noise_scores[ecnt]) 179 | if l1penaltys.shape == (): 180 | assert l1penaltys == 0 181 | l1penalty = 0 182 | else: 183 | l1penalty = l1penaltys[ecnt] 184 | correct_sequence = correct_sequences[ecnt] 185 | noise_sequence = noise_sequences[ecnt] 186 | 187 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 188 | dcorrect_inputs = [d[ecnt] for d in dcorrect_inputss] 189 | dnoise_inputs = [d[ecnt] for d in dnoise_inputss] 190 | 191 | # print [d.shape for d in dcorrect_inputs] 192 | # print [d.shape for d in dnoise_inputs] 193 | # print "loss", loss.shape, loss 194 | # print "unpenalized_loss", unpenalized_loss.shape, unpenalized_loss 195 | # print "l1penalty", l1penalty.shape, l1penalty 196 | # print "correct_score", correct_score.shape, correct_score 197 | # print "noise_score", noise_score.shape, noise_score 198 | 199 | 200 | self.train_loss.add(loss) 201 | self.train_err.add(correct_score <= noise_score) 202 | self.train_lossnonzero.add(loss > 0) 203 | squashloss = 1./(1.+math.exp(-loss)) 204 | self.train_squashloss.add(squashloss) 205 | if not LBL: 206 | self.train_unpenalized_loss.add(unpenalized_loss) 207 | self.train_l1penalty.add(l1penalty) 208 | self.train_unpenalized_lossnonzero.add(unpenalized_loss > 0) 209 | self.train_correct_score.add(correct_score) 210 | self.train_noise_score.add(noise_score) 211 | 212 | self.train_cnt += 1 213 | if self.train_cnt % 10000 == 0: 214 | # if self.train_cnt % 1000 == 0: 215 | # print self.train_cnt 216 | # graph.COMPILE_MODE.print_summary() 217 | logging.info(("After %d updates, pre-update train loss %s" % (self.train_cnt, self.train_loss.verbose_string()))) 218 | logging.info(("After %d updates, pre-update train error %s" % (self.train_cnt, self.train_err.verbose_string()))) 219 | logging.info(("After %d updates, pre-update train Pr(loss != 0) %s" % (self.train_cnt, self.train_lossnonzero.verbose_string()))) 220 | logging.info(("After %d updates, pre-update train squash(loss) %s" % (self.train_cnt, self.train_squashloss.verbose_string()))) 221 | if not LBL: 222 | logging.info(("After %d updates, pre-update train unpenalized loss %s" % (self.train_cnt, self.train_unpenalized_loss.verbose_string()))) 223 | logging.info(("After %d updates, pre-update train l1penalty %s" % (self.train_cnt, self.train_l1penalty.verbose_string()))) 224 | logging.info(("After %d updates, pre-update train Pr(unpenalized loss != 0) %s" % (self.train_cnt, self.train_unpenalized_lossnonzero.verbose_string()))) 225 | logging.info(("After %d updates, pre-update train correct score %s" % (self.train_cnt, self.train_correct_score.verbose_string()))) 226 | 
logging.info(("After %d updates, pre-update train noise score %s" % (self.train_cnt, self.train_noise_score.verbose_string()))) 227 | 228 | self.debug_prehidden_values(correct_sequences) 229 | 230 | if LBL: 231 | i = 1. 232 | while i < wordmap.len: 233 | inti = int(i) 234 | str = "word %s, rank %d, score %f" % (wordmap.str(inti), inti, self.parameters.score_biases[inti]) 235 | logging.info("After %d updates, score biases: %s" % (self.train_cnt, str)) 236 | i *= 3.2 237 | 238 | # print(("After %d updates, pre-update train loss %s" % (self.train_cnt, self.train_loss.verbose_string()))) 239 | # print(("After %d updates, pre-update train error %s" % (self.train_cnt, self.train_err.verbose_string()))) 240 | 241 | 242 | # All weights must be the same, because of how we use a scalar learning rate 243 | assert HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"] 244 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 245 | for w in weights: assert w == weights[0] 246 | embedding_learning_rate = HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] * weights[0] 247 | if loss == 0: 248 | if LBL: 249 | for di in dsequence + [dcorrect_repr, dnoise_repr]: 250 | # This tends to trigger if training diverges (NaN) 251 | assert (di == 0).all() 252 | # if not (di == 0).all(): 253 | # print "WARNING:", di 254 | # print "WARNING in ", dsequence + [dcorrect_repr, dnoise_repr] 255 | # print "loss = ", loss 256 | # print "predictrepr = ", predictrepr 257 | # print "correct_repr = ", correct_repr, self.embed(correct_repr)[0] 258 | # print "noise_repr = ", noise_repr, self.embed(noise_repr)[0] 259 | # print "correct_score = ", correct_score 260 | # print "noise_score = ", noise_score 261 | else: 262 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 263 | for di in dcorrect_inputs + dnoise_inputs: 264 | assert (di == 0).all() 265 | 266 | if loss != 0: 267 | if LBL: 268 | val = sequence + [correct_repr, noise_repr] 269 | dval = dsequence + [dcorrect_repr, dnoise_repr] 270 | # print val 271 | for (i, di) in zip(val, dval): 272 | # for (i, di) in zip(tuple(sequence + [correct_repr, noise_repr]), tuple(dsequence + [dcorrect_repr, dnoise_repr])): 273 | assert di.shape[0] == 1 274 | di.resize(di.size) 275 | # print i, di 276 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 277 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 278 | to_normalize.add(i) 279 | 280 | for (i, di) in zip([correct_repr, noise_repr], [dcorrect_scorebias, dnoise_scorebias]): 281 | self.parameters.score_biases[i] -= 1.0 * embedding_learning_rate * di 282 | # print "REMOVEME", i, self.parameters.score_biases[i] 283 | else: 284 | if HYPERPARAMETERS["EMBEDDING_LEARNING_RATE"] != 0: 285 | for (i, di) in zip(correct_sequence, dcorrect_inputs): 286 | # assert di.shape[0] == 1 287 | # di.resize(di.size) 288 | # print i, di 289 | assert di.shape == (self.parameters.embedding_size,) 290 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 291 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 292 | to_normalize.add(i) 293 | for (i, di) in zip(noise_sequence, dnoise_inputs): 294 | # assert di.shape[0] == 1 295 | # di.resize(di.size) 296 | # print i, di 297 | assert di.shape == (self.parameters.embedding_size,) 298 | self.parameters.embeddings[i] -= 1.0 * embedding_learning_rate * di 299 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: 300 | to_normalize.add(i) 301 | # print to_normalize 302 | 303 | if len(to_normalize) > 0: 304 | to_normalize = [i for i in to_normalize] 305 | # print "NORMALIZING", to_normalize 306 | 
self.parameters.normalize(to_normalize) 307 | 308 | 309 | 310 | def predict(self, sequence): 311 | if LBL: 312 | targetrepr = sequence[-1:] 313 | sequence = sequence[:-1] 314 | (predictrepr, score) = graph.predict(self.embed(sequence), self.embed(targetrepr)[0], self.parameters.score_biases[targetrepr], self.parameters) 315 | return score 316 | else: 317 | (score) = graph.predict(self.embed(sequence), self.parameters) 318 | return score 319 | 320 | def verbose_predict(self, sequence): 321 | if LBL: 322 | assert 0 323 | else: 324 | (score, prehidden) = graph.verbose_predict(self.embed(sequence)) 325 | return score, prehidden 326 | 327 | def debug_prehidden_values(self, sequences): 328 | """ 329 | Give debug output on pre-squash hidden values. 330 | """ 331 | import numpy 332 | for (i, ve) in enumerate(sequences): 333 | (score, prehidden) = self.verbose_predict(ve) 334 | abs_prehidden = numpy.abs(prehidden) 335 | med = numpy.median(abs_prehidden) 336 | abs_prehidden = abs_prehidden.tolist() 337 | assert len(abs_prehidden) == 1 338 | abs_prehidden = abs_prehidden[0] 339 | abs_prehidden.sort() 340 | abs_prehidden.reverse() 341 | 342 | logging.info("model %s, %s %s %s %s %s" % (self.modelname, self.train_cnt, "abs(pre-squash hidden) median =", med, "max =", abs_prehidden[:3])) 343 | if i+1 >= 3: break 344 | 345 | def validate(self, sequence): 346 | """ 347 | Get the rank of this final word, as opposed to all other words in the vocabulary. 348 | """ 349 | import random 350 | r = random.Random() 351 | r.seed(0) 352 | from hyperparameters import HYPERPARAMETERS 353 | 354 | import copy 355 | corrupt_sequence = copy.copy(sequence) 356 | rank = 1 357 | correct_score = self.predict(sequence) 358 | # print "CORRECT", correct_score, [wordmap.str(id) for id in sequence] 359 | for i in range(self.parameters.vocab_size): 360 | if r.random() > HYPERPARAMETERS["PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK"]: continue 361 | if i == sequence[-1]: continue 362 | corrupt_sequence[-1] = i 363 | corrupt_score = self.predict(corrupt_sequence) 364 | if correct_score <= corrupt_score: 365 | # print " CORRUPT", corrupt_score, [wordmap.str(id) for id in corrupt_sequence] 366 | rank += 1 367 | return rank 368 | 369 | def validate_errors(self, correct_sequences, noise_sequences): 370 | """ 371 | Count the errors in this validation batch. 
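        The returned value is a boolean array with one entry per minibatch
        example, True where the correct sequence outscores its noise
        counterpart, so the validation error rate is the fraction of False
        entries. A hypothetical usage sketch (names are illustrative):
            oks = m.validate_errors(correct_sequences, noise_sequences)
            error_rate = 1.0 - numpy.mean(oks)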
372 | """ 373 | 374 | # r = graph.train(self.embeds(correct_sequences), self.embeds(noise_sequences), learning_rate * weights[0]) 375 | correct_scores = graph.predict(self.embeds(correct_sequences)) 376 | noise_scores = graph.predict(self.embeds(noise_sequences)) 377 | 378 | # print correct_scores 379 | # print noise_scores 380 | return correct_scores > noise_scores 381 | ## print "CORRECT", correct_score, [wordmap.str(id) for id in sequence] 382 | # for i in range(self.parameters.vocab_size): 383 | # if r.random() > HYPERPARAMETERS["PERCENT OF NOISE EXAMPLES FOR VALIDATION LOGRANK"]: continue 384 | # if i == sequence[-1]: continue 385 | # corrupt_sequence[-1] = i 386 | # corrupt_score = self.predict(corrupt_sequence) 387 | # if correct_score <= corrupt_score: 388 | ## print " CORRUPT", corrupt_score, [wordmap.str(id) for id in corrupt_sequence] 389 | # rank += 1 390 | # return rank 391 | -------------------------------------------------------------------------------- /scripts/model/parameters.py: -------------------------------------------------------------------------------- 1 | """ 2 | @todo: WRITEME 3 | """ 4 | 5 | from theano import config 6 | from theano.compile.sandbox import shared 7 | 8 | import copy 9 | 10 | floatX = config.floatX 11 | 12 | from hyperparameters import HYPERPARAMETERS 13 | LBL = HYPERPARAMETERS["LOG BILINEAR MODEL"] 14 | 15 | class Parameters: 16 | """ 17 | Parameters used by the L{Model}. 18 | @todo: Document these 19 | """ 20 | 21 | def __init__(self, window_size, vocab_size, embedding_size, hidden_size, seed, initial_embeddings, two_hidden_layers): 22 | """ 23 | Initialize L{Model} parameters. 24 | """ 25 | 26 | self.vocab_size = vocab_size 27 | self.window_size = window_size 28 | self.embedding_size = embedding_size 29 | self.two_hidden_layers = two_hidden_layers 30 | if LBL: 31 | self.hidden_size = hidden_size 32 | self.output_size = self.embedding_size 33 | else: 34 | self.hidden_size = hidden_size 35 | self.output_size = 1 36 | 37 | import numpy 38 | import hyperparameters 39 | 40 | from pylearn.algorithms.weights import random_weights 41 | numpy.random.seed(seed) 42 | if initial_embeddings is None: 43 | self.embeddings = numpy.asarray((numpy.random.rand(self.vocab_size, HYPERPARAMETERS["EMBEDDING_SIZE"]) - 0.5)*2 * HYPERPARAMETERS["INITIAL_EMBEDDING_RANGE"], dtype=floatX) 44 | else: 45 | assert initial_embeddings.shape == (self.vocab_size, HYPERPARAMETERS["EMBEDDING_SIZE"]) 46 | self.embeddings = copy.copy(initial_embeddings) 47 | if HYPERPARAMETERS["NORMALIZE_EMBEDDINGS"]: self.normalize(range(self.vocab_size)) 48 | if LBL: 49 | self.output_weights = shared(numpy.asarray(random_weights(self.input_size, self.output_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 50 | self.output_biases = shared(numpy.asarray(numpy.zeros((1, self.output_size)), dtype=floatX)) 51 | self.score_biases = shared(numpy.asarray(numpy.zeros(self.vocab_size), dtype=floatX)) 52 | assert not self.two_hidden_layers 53 | else: 54 | self.hidden_weights = shared(numpy.asarray(random_weights(self.input_size, self.hidden_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 55 | self.hidden_biases = shared(numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX)) 56 | if self.two_hidden_layers: 57 | self.hidden2_weights = shared(numpy.asarray(random_weights(self.hidden_size, self.hidden_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 58 | self.hidden2_biases = 
shared(numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX)) 59 | self.output_weights = shared(numpy.asarray(random_weights(self.hidden_size, self.output_size, scale_by=HYPERPARAMETERS["SCALE_INITIAL_WEIGHTS_BY"]), dtype=floatX)) 60 | self.output_biases = shared(numpy.asarray(numpy.zeros((self.output_size,)), dtype=floatX)) 61 | 62 | input_size = property(lambda self: 63 | LBL*((self.window_size-1) * self.embedding_size) + (1-LBL)*(self.window_size * self.embedding_size)) 64 | 65 | def normalize(self, indices): 66 | """ 67 | Normalize such that the l2 norm of the embeddings indices passed in. 68 | @todo: l1 norm? 69 | @return: The normalized embeddings 70 | """ 71 | import numpy 72 | l2norm = numpy.square(self.embeddings[indices]).sum(axis=1) 73 | l2norm = numpy.sqrt(l2norm.reshape((len(indices), 1))) 74 | 75 | self.embeddings[indices] /= l2norm 76 | import math 77 | self.embeddings[indices] *= math.sqrt(self.embeddings.shape[1]) 78 | 79 | # TODO: Assert that norm is correct 80 | # l2norm = (embeddings * embeddings).sum(axis=1) 81 | # print l2norm.shape 82 | # print (l2norm == numpy.ones((vocabsize)) * HYPERPARAMETERS["EMBEDDING_SIZE"]) 83 | # print (l2norm == numpy.ones((vocabsize)) * HYPERPARAMETERS["EMBEDDING_SIZE"]).all() 84 | -------------------------------------------------------------------------------- /scripts/monolingual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/monolingual/__init__.py -------------------------------------------------------------------------------- /scripts/monolingual/build-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == "__main__": 4 | import common.hyperparameters, common.options 5 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 6 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 7 | import hyperparameters 8 | 9 | import vocabulary 10 | import common.idmap 11 | 12 | words = [] 13 | 14 | import string 15 | for i, l in enumerate(common.file.myopen(HYPERPARAMETERS["MONOLINGUAL_VOCABULARY"])): 16 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] and i+1 >= HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"]: 17 | break 18 | if not HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"] and i >= HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"]: 19 | break 20 | (cnt, w) = string.split(l) 21 | words.append(w) 22 | 23 | v = common.idmap.IDmap(words, allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]) 24 | assert v.len == HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 25 | vocabulary.write(v) 26 | -------------------------------------------------------------------------------- /scripts/monolingual/corrupt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for corrupting examples. 3 | """ 4 | 5 | def corrupt_example(model, e): 6 | """ 7 | Return a corrupted version of example e, plus the weight of this example. 
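    The weight is an importance weight, 1/Pr(noise word) under the chosen
    noise distribution: uniform (0-gram) noise gives every example the same
    weight, while unigram noise up-weights examples whose noise word was
    unlikely to be sampled. A minimal usage sketch:
        noise_e, weight = corrupt_example(model, e)
        # noise_e differs from e only in its final (focus) position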
8 | """ 9 | from hyperparameters import HYPERPARAMETERS 10 | import random 11 | import copy 12 | e = copy.copy(e) 13 | last = e[-1] 14 | cnt = 0 15 | while e[-1] == last: 16 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 17 | e[-1] = random.randint(0, model.parameters.vocab_size-1) 18 | pr = 1./model.parameters.vocab_size 19 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 20 | import noise 21 | from common.myrandom import weighted_sample 22 | e[-1], pr = weighted_sample(noise.indexed_weights()) 23 | # from vocabulary import wordmap 24 | # print wordmap.str(e[-1]), pr 25 | else: 26 | assert 0 27 | cnt += 1 28 | # Backoff to 0gram smoothing if we fail 10 times to get noise. 29 | if cnt > 10: e[-1] = random.randint(0, model.parameters.vocab_size-1) 30 | weight = 1./pr 31 | return e, weight 32 | 33 | def corrupt_examples(model, correct_sequences): 34 | noise_sequences = [] 35 | weights = [] 36 | for e in correct_sequences: 37 | noise_sequence, weight = model.corrupt_example(e) 38 | noise_sequences.append(noise_sequence) 39 | weights.append(weight) 40 | return noise_sequences, weights 41 | -------------------------------------------------------------------------------- /scripts/monolingual/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for getting examples. 3 | """ 4 | 5 | from common.stats import stats 6 | from common.file import myopen 7 | import string 8 | 9 | import common.hyperparameters 10 | import sys 11 | 12 | class TrainingExampleStream(object): 13 | def __init__(self): 14 | self.count = 0 15 | pass 16 | 17 | def __iter__(self): 18 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 19 | from vocabulary import wordmap 20 | self.filename = HYPERPARAMETERS["TRAIN_SENTENCES"] 21 | self.count = 0 22 | for l in myopen(self.filename): 23 | prevwords = [] 24 | for w in string.split(l): 25 | w = string.strip(w) 26 | id = None 27 | if wordmap.exists(w): 28 | prevwords.append(wordmap.id(w)) 29 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 30 | self.count += 1 31 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 32 | else: 33 | # If we can learn an unknown word token, we should 34 | # delexicalize the word, not discard the example! 35 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]: assert 0 36 | prevwords = [] 37 | 38 | def __getstate__(self): 39 | return self.filename, self.count 40 | 41 | def __setstate__(self, state): 42 | """ 43 | @warning: We ignore the filename. If we wanted 44 | to be really fastidious, we would assume that 45 | HYPERPARAMETERS["TRAIN_SENTENCES"] might change. The only 46 | problem is that if we change filesystems, the filename 47 | might change just because the base file is in a different 48 | path. So we issue a warning if the filename is different from 49 | """ 50 | filename, count = state 51 | print >> sys.stderr, ("__setstate__(%s)..." 
% `state`) 52 | print >> sys.stderr, (stats()) 53 | iter = self.__iter__() 54 | while count != self.count: 55 | # print count, self.count 56 | iter.next() 57 | if self.filename != filename: 58 | assert self.filename == HYPERPARAMETERS["TRAIN_SENTENCES"] 59 | print >> sys.stderr, ("self.filename %s != filename given to __setstate__ %s" % (self.filename, filename)) 60 | print >> sys.stderr, ("...__setstate__(%s)" % `state`) 61 | print >> sys.stderr, (stats()) 62 | 63 | class TrainingMinibatchStream(object): 64 | def __init__(self): 65 | pass 66 | 67 | def __iter__(self): 68 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 69 | minibatch = [] 70 | self.get_train_example = TrainingExampleStream() 71 | for e in self.get_train_example: 72 | # print self.get_train_example.__getstate__() 73 | minibatch.append(e) 74 | if len(minibatch) >= HYPERPARAMETERS["MINIBATCH SIZE"]: 75 | assert len(minibatch) == HYPERPARAMETERS["MINIBATCH SIZE"] 76 | yield minibatch 77 | minibatch = [] 78 | 79 | def __getstate__(self): 80 | return (self.get_train_example.__getstate__(),) 81 | 82 | def __setstate__(self, state): 83 | """ 84 | @warning: We ignore the filename. 85 | """ 86 | self.get_train_example = TrainingExampleStream() 87 | self.get_train_example.__setstate__(state[0]) 88 | 89 | def get_validation_example(): 90 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 91 | 92 | from vocabulary import wordmap 93 | for l in myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"]): 94 | prevwords = [] 95 | for w in string.split(l): 96 | w = string.strip(w) 97 | if wordmap.exists(w): 98 | prevwords.append(wordmap.id(w)) 99 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 100 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 101 | else: 102 | # If we can learn an unknown word token, we should 103 | # delexicalize the word, not discard the example! 104 | if HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"]: assert 0 105 | prevwords = [] 106 | -------------------------------------------------------------------------------- /scripts/monolingual/noise.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sophisticated training noise. 3 | """ 4 | 5 | from vocabulary import wordmap 6 | 7 | from common.myrandom import build 8 | import sys 9 | 10 | _indexed_weights = None 11 | def indexed_weights(): 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | global _indexed_weights 15 | if _indexed_weights is not None: 16 | return _indexed_weights 17 | print >> sys.stderr, wordmap.len, "=?=", HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 18 | assert wordmap.len == HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"] 19 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 20 | _indexed_weights = [1 for id in range(wordmap.len)] 21 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 22 | from common.json import load 23 | from common.file import myopen 24 | ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"], HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_SIZE"])] 25 | print >> sys.stderr, "Reading ngrams from", ngrams_file, "..." 
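        # A sketch of the intent of the block below: read the unigram
        # (ngram, count) pairs from the JSON ngrams file, smooth each count by
        # TRAINING_NOISE_SMOOTHING_ADDITION, index the smoothed counts by word
        # id, and hand the list to common.myrandom.build so that
        # corrupt.corrupt_example can draw noise words via weighted_sample.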
26 | from collections import defaultdict 27 | ngramcnt = defaultdict(int) 28 | for (ngram, cnt) in load(myopen(ngrams_file)): 29 | assert len(ngram) == 1 30 | ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"] 31 | _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(wordmap.len)] 32 | _indexed_weights = build(_indexed_weights) 33 | else: assert 0 34 | return _indexed_weights 35 | -------------------------------------------------------------------------------- /scripts/monolingual/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save and load training state. 3 | @todo: Training state variables (cnt, epoch, trainstate) should all be combined into one object. 4 | """ 5 | 6 | import logging 7 | import os.path 8 | import cPickle 9 | 10 | from common.stats import stats 11 | from common.file import myopen 12 | import sys 13 | 14 | _lastfilename = None 15 | def save(model, cnt, epoch, trainstate, rundir, newkeystr): 16 | global _lastfilename 17 | 18 | filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr)) 19 | logging.info("Writing model to %s..." % filename) 20 | logging.info(stats()) 21 | cPickle.dump(model, myopen(filename, "wb"), protocol=-1) 22 | logging.info("...done writing model to %s" % filename) 23 | logging.info(stats()) 24 | 25 | if _lastfilename is not None: 26 | logging.info("Removing old model %s..." % _lastfilename) 27 | try: 28 | os.remove(_lastfilename) 29 | logging.info("...removed %s" % _lastfilename) 30 | except: 31 | logging.info("Could NOT remove %s" % _lastfilename) 32 | _lastfilename = filename 33 | 34 | filename = os.path.join(rundir, "trainstate.pkl") 35 | cPickle.dump((trainstate, cnt, epoch), myopen(filename, "wb"), protocol=-1) 36 | 37 | filename = os.path.join(rundir, "newkeystr.txt") 38 | myopen(filename, "wt").write(newkeystr) 39 | 40 | def load(rundir, newkeystr): 41 | """ 42 | Read the directory and load the model, the training count, the training epoch, and the training state. 43 | """ 44 | global _lastfilename 45 | 46 | filename = os.path.join(rundir, "newkeystr.txt") 47 | assert newkeystr == myopen(filename).read() 48 | 49 | filename = os.path.join(rundir, "trainstate.pkl") 50 | (trainstate, cnt, epoch) = cPickle.load(myopen(filename)) 51 | 52 | filename = os.path.join(rundir, "model-%d%s.pkl" % (cnt, newkeystr)) 53 | print >> sys.stderr, ("Reading model from %s..." 
% filename) 54 | print >> sys.stderr, (stats()) 55 | model = cPickle.load(myopen(filename)) 56 | print >> sys.stderr, ("...done reading model from %s" % filename) 57 | print >> sys.stderr, (stats()) 58 | _lastfilename = filename 59 | 60 | return (model, cnt, epoch, trainstate) 61 | -------------------------------------------------------------------------------- /scripts/monolingual/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import string 5 | import common.dump 6 | from common.file import myopen 7 | from common.stats import stats 8 | 9 | import miscglobals 10 | import logging 11 | 12 | import examples 13 | import diagnostics 14 | import state 15 | 16 | def validate(cnt): 17 | import math 18 | logranks = [] 19 | logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt) 20 | logging.info(stats()) 21 | i = 0 22 | for (i, ve) in enumerate(examples.get_validation_example()): 23 | # logging.info([wordmap.str(id) for id in ve]) 24 | logranks.append(math.log(m.validate(ve))) 25 | if (i+1) % 10 == 0: 26 | logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)))) 27 | logging.info(stats()) 28 | logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1)) 29 | logging.info(stats()) 30 | # print "FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1) 31 | # print stats() 32 | 33 | if __name__ == "__main__": 34 | import common.hyperparameters, common.options 35 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 36 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 37 | import hyperparameters 38 | 39 | from common import myyaml 40 | import sys 41 | print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals])) 42 | 43 | import noise 44 | indexed_weights = noise.indexed_weights() 45 | 46 | from rundir import rundir 47 | rundir = rundir() 48 | 49 | import os.path, os 50 | logfile = os.path.join(rundir, "log") 51 | if newkeystr != "": 52 | verboselogfile = os.path.join(rundir, "log%s" % newkeystr) 53 | print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile) 54 | os.system("ln -s log %s " % (verboselogfile)) 55 | else: 56 | print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile 57 | 58 | import random, numpy 59 | random.seed(miscglobals.RANDOMSEED) 60 | numpy.random.seed(miscglobals.RANDOMSEED) 61 | 62 | import vocabulary 63 | # logging.info("Reading vocab") 64 | # vocabulary.read() 65 | 66 | import model 67 | try: 68 | print >> sys.stderr, ("Trying to read training state for %s %s..." 
% (newkeystr, rundir)) 69 | (m, cnt, epoch, get_train_minibatch) = state.load(rundir, newkeystr) 70 | print >> sys.stderr, ("...success reading training state for %s %s" % (newkeystr, rundir)) 71 | print >> sys.stderr, logfile 72 | logging.basicConfig(filename=logfile, level=logging.DEBUG) 73 | # logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG) 74 | logging.info("CONTINUING FROM TRAINING STATE") 75 | except IOError: 76 | print >> sys.stderr, ("...FAILURE reading training state for %s %s" % (newkeystr, rundir)) 77 | print >> sys.stderr, ("INITIALIZING") 78 | 79 | m = model.Model() 80 | cnt = 0 81 | epoch = 1 82 | get_train_minibatch = examples.TrainingMinibatchStream() 83 | logging.basicConfig(filename=logfile, filemode="w", level=logging.DEBUG) 84 | logging.info("INITIALIZING TRAINING STATE") 85 | 86 | logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals]))) 87 | 88 | #validate(0) 89 | diagnostics.diagnostics(cnt, m) 90 | # diagnostics.visualizedebug(cnt, m, rundir) 91 | while 1: 92 | logging.info("STARTING EPOCH #%d" % epoch) 93 | for ebatch in get_train_minibatch: 94 | cnt += len(ebatch) 95 | # print [wordmap.str(id) for id in e] 96 | 97 | noise_sequences, weights = corrupt.corrupt_examples(m, ebatch) 98 | m.train(ebatch, noise_sequences, weights) 99 | 100 | #validate(cnt) 101 | if cnt % (int(1000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 102 | logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) 103 | # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) 104 | if cnt % (int(100000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 105 | diagnostics.diagnostics(cnt, m) 106 | if os.path.exists(os.path.join(rundir, "BAD")): 107 | logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) 108 | sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) 109 | sys.exit(0) 110 | if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"]*1./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0: 111 | state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr) 112 | diagnostics.visualizedebug(cnt, m, rundir, newkeystr) 113 | # validate(cnt) 114 | get_train_minibatch = examples.TrainingMinibatchStream() 115 | epoch += 1 116 | -------------------------------------------------------------------------------- /scripts/monolingual/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Automatically load the wordmap, if available. 3 | """ 4 | 5 | import cPickle 6 | from common.file import myopen 7 | import sys 8 | 9 | def _wordmap_filename(name): 10 | import common.hyperparameters, common.options 11 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 12 | return HYPERPARAMETERS["MONOLINGUAL_VOCABULARY_IDMAP_FILE"] 13 | 14 | wordmap = None 15 | try: 16 | wordmap = cPickle.load(myopen(_wordmap_filename())) 17 | wordmap.str = wordmap.key 18 | except: pass 19 | 20 | def write(wordmap, name=""): 21 | """ 22 | Write the word ID map, passed as a parameter. 23 | """ 24 | print >> sys.stderr, "Writing word map to %s..." 
% _wordmap_filename(name) 25 | cPickle.dump(wordmap, myopen(_wordmap_filename(name), "w")) 26 | -------------------------------------------------------------------------------- /scripts/ngrams.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump n-gram counts over entire training data as YAML. 4 | """ 5 | 6 | import sys 7 | from common.stats import stats 8 | 9 | from collections import defaultdict 10 | cnt = defaultdict(int) 11 | if __name__ == "__main__": 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | HYPERPARAMETERS, options, args = common.options.reparse(HYPERPARAMETERS) 15 | import hyperparameters 16 | 17 | import vocabulary 18 | print >> sys.stderr, "Reading vocab" 19 | vocabulary.read() 20 | from vocabulary import wordmap 21 | 22 | import train 23 | for (i, e) in enumerate(train.get_train_example()): 24 | cnt[tuple([wordmap.str(t) for t in e])] += 1 25 | if i % 10000 == 0: 26 | print >> sys.stderr, "Read %d examples" % i 27 | print >> sys.stderr, stats() 28 | if i > 100000000: 29 | break 30 | cnt = [(t, cnt[t]) for t in cnt] 31 | import common.json 32 | common.json.dump(cnt, sys.stdout) 33 | -------------------------------------------------------------------------------- /scripts/preprocess/filter-sentences-by-lemma.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | For the N files given as command line arguments, filter the sentences 4 | to be only those in which the first file contains a word that lemmatizes 5 | to one of the W2W FOCUS LEMMAS. 6 | We write files that are prefixed by "filtered-" 7 | """ 8 | 9 | from common.str import percent 10 | import string 11 | import sys 12 | 13 | import common.hyperparameters, common.options 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 16 | 17 | if HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0: 18 | print >> sys.stderr, "There are no focus lemmas, hence we have nothing to filter" 19 | sys.exit(0) 20 | 21 | assert len(args) >= 1 22 | 23 | from common.stats import stats 24 | from lemmatizer import lemmatize 25 | 26 | print >> sys.stderr, "Loaded Morphological analyizer" 27 | print >> sys.stderr, stats() 28 | 29 | from itertools import izip 30 | import os.path, os 31 | 32 | filenames = args 33 | outfilenames = [os.path.join(os.path.dirname(f), "filtered-%s" % os.path.basename(f)) for f in filenames] 34 | 35 | print >> sys.stderr, "Reading from %s" % `filenames` 36 | print >> sys.stderr, "Writing to %s" % `outfilenames` 37 | 38 | for f in filenames: assert os.path.exists(f) 39 | for f in outfilenames: 40 | if os.path.exists(f): 41 | print >> sys.stderr, "Warning, going to overwrite %s" % f 42 | 43 | #print "Sleeping for 10 seconds..." 
44 | #import time 45 | #time.sleep(10) 46 | 47 | inf = [open(f) for f in filenames] 48 | outf = [open(f, "wt") for f in outfilenames] 49 | 50 | tot = 0 51 | cnt = 0 52 | for lines in izip(*inf): 53 | tot += 1 54 | keep = False 55 | for w in string.split(lines[0]): 56 | if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 57 | keep = True 58 | break 59 | if keep: 60 | cnt += 1 61 | for l, f in izip(lines, outf): 62 | f.write(l) 63 | if tot % 10000 == 0: 64 | print >> sys.stderr, "%s lines kept" % percent(cnt, tot) 65 | print >> sys.stderr, stats() 66 | -------------------------------------------------------------------------------- /scripts/preprocess/lemmatizer.py: -------------------------------------------------------------------------------- 1 | ../lemmatizer.py -------------------------------------------------------------------------------- /scripts/preprocess/lowercase.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Lowercase <> 4 | # 5 | 6 | while(<>) { 7 | print lc $_; 8 | } 9 | -------------------------------------------------------------------------------- /scripts/preprocess/preprocess-validation.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Transform the validation data into a form that it can be used by the system. 4 | # 5 | 6 | $VDIR = "/u/turian/data/SemEval-2-2010/Task 3 - Cross-Lingual Word Sense Disambiguation"; 7 | 8 | foreach $f (`find '$VDIR' -name \*.data`) { 9 | open(F, "<$f") or die $!; 10 | while () { 11 | $lemma = $1 if //; 12 | if (/(.*)<\/context>/) { 13 | $l = $1; 14 | $l =~ s/[^<>]*<\/head>/$lemma/g; 15 | open(O, "| ~/data/europarl-v5/europarl/tools/tokenizer.perl -l en | ~/data/europarl-v5/preprocessed/lowercase.perl | ~/utils/src/treetagger-3.2/l.py en > /tmp/removeme.txt"); 16 | print O $l; 17 | $l = `cat /tmp/removeme.txt`; 18 | print $l; 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /scripts/preprocess/reverse-alignment.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # USAGE: 4 | # ./reverse-alignment.pl filename.align.l1-l2 [...] 5 | # 6 | # Create a file filename.align.l2-l1 with the alignments reversed. 7 | # 8 | 9 | die $! unless scalar @ARGV >= 1; 10 | 11 | foreach $f (@ARGV) { 12 | if ($f =~ m/(.*\.align\.)(..)-(..)$/) { 13 | $fnew = "$1$3-$2"; 14 | } else { 15 | die $!; 16 | } 17 | 18 | if (-e $fnew) { 19 | print "$fnew already exists"; 20 | next; 21 | } 22 | 23 | $cmd = "cat $f | perl -ne 's/(\\d+)-(\\d+)/\$2-\$1/g; print' > $fnew"; 24 | print "$cmd\n"; 25 | system("$cmd"); 26 | 27 | print "SANITY CHECK... (shouldn't see any output after this command)\n"; 28 | $cmd = "cat $fnew | perl -ne 's/(\\d+)-(\\d+)/\$2-\$1/g; print' | diff - $f"; 29 | print "$cmd\n"; 30 | system("$cmd"); 31 | } 32 | -------------------------------------------------------------------------------- /scripts/random-validation-examples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Print out validation examples, disregarding vocabulary. 
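# Summary of the code below: read ORIGINAL VALIDATION_SENTENCES, slide a
# WINDOW_SIZE window over every sentence without any vocabulary filtering,
# shuffle the resulting examples, and write the first VALIDATION EXAMPLES of
# them, one per line, to VALIDATION_SENTENCES.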
4 | # 5 | # @TODO: Don't duplicate get_example code here and twice in train.py 6 | # 7 | 8 | from common.file import myopen 9 | import string 10 | import sys 11 | 12 | def get_example(f): 13 | import common.hyperparameters 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | for l in myopen(f): 16 | prevwords = [] 17 | for w in string.split(l): 18 | w = string.strip(w) 19 | prevwords.append(w) 20 | if len(prevwords) >= HYPERPARAMETERS["WINDOW_SIZE"]: 21 | yield prevwords[-HYPERPARAMETERS["WINDOW_SIZE"]:] 22 | 23 | if __name__ == "__main__": 24 | import common.hyperparameters, common.options 25 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 26 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 27 | import hyperparameters 28 | 29 | print >> sys.stderr, "Reading examples from %s" % HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"] 30 | ves = [e for e in get_example(HYPERPARAMETERS["ORIGINAL VALIDATION_SENTENCES"])] 31 | import random 32 | random.shuffle(ves) 33 | print >> sys.stderr, "Reading %d examples to %s" % (HYPERPARAMETERS["VALIDATION EXAMPLES"], HYPERPARAMETERS["VALIDATION_SENTENCES"]) 34 | o = myopen(HYPERPARAMETERS["VALIDATION_SENTENCES"], "w") 35 | for e in ves[:HYPERPARAMETERS["VALIDATION EXAMPLES"]]: 36 | o.write(string.join(e) + "\n") 37 | -------------------------------------------------------------------------------- /scripts/rundir.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run directory 3 | """ 4 | 5 | import common.hyperparameters, common.options, common.dump 6 | 7 | _rundir = None 8 | def rundir(): 9 | global _rundir 10 | if _rundir is None: 11 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 12 | _rundir = common.dump.create_canonical_directory(HYPERPARAMETERS) 13 | return _rundir 14 | -------------------------------------------------------------------------------- /scripts/w2w/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/turian/neural-language-model/f7559a6cc4e9f4c34a553fbda974762f2d3f781b/scripts/w2w/__init__.py -------------------------------------------------------------------------------- /scripts/w2w/build-example-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Extract all training examples, and cache them. 4 | """ 5 | 6 | if __name__ == "__main__": 7 | import common.hyperparameters, common.options 8 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 9 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 10 | import hyperparameters 11 | 12 | import logging 13 | logging.basicConfig(level=logging.INFO) 14 | 15 | import w2w.examples 16 | w2w.examples.all_training_examples_cached() 17 | -------------------------------------------------------------------------------- /scripts/w2w/build-initial-embeddings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Given embeddings in one language, initialize embeddings in all languages 4 | using these monolingual embeddings. We do this as a weighted average 5 | of the translations of the target word in the embedding language. 6 | (However, we only do the weighted average over words that have 7 | embeddings. 
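In symbols, for a word w outside the embedding language the initialization is
roughly
    embedding[w] = sum over translations t of (count(w -> t) / total) * source_embedding[t],
where the sum runs only over aligned translations t that already have an
embedding.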
By comparison, we could do the weighted average and treat 8 | words without embeddings as *UNKNOWN* in the embedding language, and 9 | include these embeddings. But we don't.) 10 | """ 11 | 12 | def visualize(embeddings, idxs, name, PERPLEXITY=30): 13 | idxs = [w % embeddings.shape[0] for w in idxs] 14 | titles = [wordform(w) for w in idxs] 15 | import os.path 16 | filename = HYPERPARAMETERS["INITIAL_EMBEDDINGS"] + ".visualize-%s.png" % name 17 | try: 18 | from textSNE.calc_tsne import tsne 19 | # from textSNE.tsne import tsne 20 | out = tsne(embeddings[idxs], perplexity=PERPLEXITY) 21 | from textSNE.render import render 22 | render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename) 23 | except IOError: 24 | logging.info("ERROR visualizing", filename, ". Continuing...") 25 | 26 | 27 | if __name__ == "__main__": 28 | import common.hyperparameters, common.options 29 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 30 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 31 | import hyperparameters 32 | 33 | import sys 34 | from common.stats import stats 35 | from common.str import percent 36 | import common.file 37 | import numpy 38 | import string 39 | import copy 40 | import cPickle 41 | 42 | import logging 43 | logging.basicConfig(level=logging.DEBUG) 44 | 45 | from w2w.vocabulary import wordmap, language, wordform 46 | from w2w.targetvocabulary import targetmap 47 | 48 | # Read in the embeddings 49 | print >> sys.stderr, "Reading embeddings from %s..." % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"] 50 | print >> sys.stderr, stats() 51 | original_embeddings = {} 52 | tot = 0 53 | for l in common.file.myopen(HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]): 54 | vals = string.split(l) 55 | word = vals[0] 56 | if HYPERPARAMETERS["W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION"] and word != "*UNKNOWN*": 57 | if (word[0] == '*' and word[-1] == '*' and len(word) > 1): 58 | print >> sys.stderr, "WEIRD WORD: %s" % word 59 | word = string.lower(word) 60 | assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"] 61 | tot += 1 62 | if tot % 10000 == 0: 63 | print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]) 64 | if word in original_embeddings: 65 | # print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0]) 66 | continue 67 | else: 68 | original_embeddings[word] = numpy.array([float(v) for v in vals[1:]]) 69 | print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"] 70 | print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot) 71 | print >> sys.stderr, stats() 72 | 73 | reversemap = targetmap(name="reverse") 74 | 75 | embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])) 76 | assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]) 77 | 78 | ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"] 79 | for w in range(wordmap().len): 80 | embedding = None 81 | # If this word is in a different language than the embeddings. 82 | if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]: 83 | if w not in reversemap: 84 | print >> sys.stderr, "Word %s is not even in target map! 
Using *UNKNOWN*" % `wordmap().str(w)` 85 | embedding = original_embeddings["*UNKNOWN*"] 86 | elif ELANG not in reversemap[w]: 87 | print >> sys.stderr, "Have no %s translations for word %s, only have %s, using *UNKNOWN*" % (ELANG, wordmap().str(w), reversemap[w].keys()) 88 | embedding = original_embeddings["*UNKNOWN*"] 89 | else: 90 | # Mix the target word embedding over the weighted translation into the source language 91 | 92 | mixcnt = {} 93 | for w2 in reversemap[w][ELANG]: 94 | if language(w2) is None: 95 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 96 | continue 97 | assert language(w2) == ELANG 98 | if wordform(w2) not in original_embeddings: 99 | print >> sys.stderr, "%s is NOT mixed by %s %d (no embedding)" % (wordmap().str(w), wordmap().str(w2), reversemap[w][ELANG][w2]) 100 | continue 101 | mixcnt[w2] = reversemap[w][ELANG][w2] 102 | 103 | tot = 0 104 | for w2 in mixcnt: tot += mixcnt[w2] 105 | 106 | if tot == 0: 107 | print >> sys.stderr, "Unable to mix ANY translations for %s, using *UNKNOWN*" % `wordmap().str(w)` 108 | embedding = original_embeddings["*UNKNOWN*"] 109 | else: 110 | embedding = numpy.zeros((HYPERPARAMETERS["EMBEDDING_SIZE"])) 111 | for w2 in mixcnt: 112 | embedding += 1. * mixcnt[w2] / tot * (original_embeddings[wordform(w2)]) 113 | # print >> sys.stderr, "%s is mixed %s by %s" % (wordmap().str(w), percent(mixcnt[w2], tot), wordmap().str(w2)) 114 | else: 115 | if wordform(w) not in original_embeddings: 116 | print >> sys.stderr, "Word %s has no embedding, using *UNKNOWN*" % `wordmap().str(w)` 117 | embedding = original_embeddings["*UNKNOWN*"] 118 | else: 119 | embedding = original_embeddings[wordform(w)] 120 | embeddings[w] = copy.copy(embedding) 121 | 122 | # print wordform(w), language(w), 123 | # for v in embeddings[w]: 124 | # print v, 125 | # print 126 | 127 | print >> sys.stderr, "Dumping initial embeddings to %s" % HYPERPARAMETERS["INITIAL_EMBEDDINGS"] 128 | cPickle.dump(embeddings, common.file.myopen(HYPERPARAMETERS["INITIAL_EMBEDDINGS"], "w")) 129 | 130 | import random 131 | WORDCNT = 500 132 | idxs = range(wordmap().len) 133 | random.shuffle(idxs) 134 | idxs = idxs[:WORDCNT] 135 | 136 | visualize(embeddings, idxs, "randomized") 137 | visualize(embeddings, range(WORDCNT), "mostcommon") 138 | visualize(embeddings, range(-1, -WORDCNT*50, -50), "leastcommon") 139 | visualize(embeddings, range(wordmap().len/2-WORDCNT*20/2,wordmap().len/2+WORDCNT*20/2, 20), "midcommon") 140 | -------------------------------------------------------------------------------- /scripts/w2w/build-target-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Read in the w2w corpora (bi + monolingual), and build the translation 4 | vocabulary (for each source word, what target words it can translate to). 5 | Note: Each corpus is weighted in proportion to its length. (i.e. all 6 | words are equally weighted.) 
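The result is a nested map of the form
    cnt[source_word_id][target_language][target_word_id] -> alignment count,
written with w2w.targetvocabulary.write(), plus a second map written under the
name "reverse" with the source and target roles swapped.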
7 | """ 8 | 9 | import sys 10 | 11 | if __name__ == "__main__": 12 | import common.hyperparameters, common.options 13 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 14 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 15 | import hyperparameters 16 | 17 | import logging 18 | logging.basicConfig(level=logging.DEBUG) 19 | 20 | import w2w.corpora 21 | from w2w.vocabulary import wordmap, language, wordform 22 | from collections import defaultdict 23 | from common.mydict import sort as dictsort 24 | 25 | cnt = {} 26 | reversecnt = {} 27 | for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): 28 | for ws1, ws2, links in w2w.corpora.bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 29 | for i1, i2 in links: 30 | if len(ws1) <= i1 or len(ws2) <= i2: 31 | print >> sys.stderr, "This is going to break on link (%d, %d) because lens = (%d, %d)" % (i1,i2, len(ws1), len(ws2)) 32 | print >> sys.stderr, [wordform(w) for w in ws1] 33 | print >> sys.stderr, [wordform(w) for w in ws2] 34 | print >> sys.stderr, links 35 | w1 = ws1[i1] 36 | w2 = ws2[i2] 37 | # print wordmap.str(w1)[1], wordmap.str(w2)[1] 38 | 39 | l2new = language(w2) 40 | 41 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 42 | # Skip translations to unknown words 43 | if wordform(w2) == "*UNKNOWN*": continue 44 | 45 | assert l2new == l2 46 | 47 | 48 | # We don't filter here, otherwise we will get a reversemap that only maps to focus lemmas. 49 | # # If we are filtering examples by lemma 50 | # if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): 51 | # assert language(w1) == "en" 52 | # from lemmatizer import lemmatize 53 | # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 54 | ## logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1)))) 55 | # continue 56 | 57 | if w1 not in cnt: cnt[w1] = {} 58 | if l2 not in cnt[w1]: cnt[w1][l2] = defaultdict(int) 59 | cnt[w1][l2][w2] += 1 60 | 61 | if w2 not in reversecnt: reversecnt[w2] = {} 62 | if l1 not in reversecnt[w2]: reversecnt[w2][l1] = defaultdict(int) 63 | reversecnt[w2][l1][w1] += 1 64 | 65 | # for w1 in cnt: 66 | # for l2 in cnt[w1]: 67 | # print wordmap().str(w1), l2, [(n, wordmap().str(w2)) for n, w2 in dictsort(cnt[w1][l2])] 68 | 69 | # words = {} 70 | # for (l, w) in wordfreq: 71 | # if l not in words: words[l] = [] 72 | # if wordfreq[(l, w)] >= HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]: 73 | # words[l].append(w) 74 | 75 | import w2w.targetvocabulary 76 | w2w.targetvocabulary.write(cnt) 77 | w2w.targetvocabulary.write(reversecnt, name="reverse") 78 | -------------------------------------------------------------------------------- /scripts/w2w/build-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Read in the w2w corpora (bi + monolingual), and build the vocabulary as 4 | all words per language that occur at least HYPERPARAMETERS["W2W MINIMUM 5 | WORD FREQUENCY"] times. 6 | Each corpus is weighted in proportion to its length. (i.e. all words are equally weighted.) 7 | """ 8 | 9 | import sys 10 | from common.stats import stats 11 | 12 | def readwords(filename): 13 | print >> sys.stderr, "Processing %s" % filename 14 | i = 0 15 | for line in open(filename): 16 | i += 1 17 | if i % 100000 == 0: 18 | print >> sys.stderr, "Read line %d of %s..." 
% (i, filename) 19 | print >> sys.stderr, stats() 20 | for w in string.split(line): 21 | yield w 22 | 23 | if __name__ == "__main__": 24 | import common.hyperparameters, common.options 25 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 26 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 27 | import hyperparameters 28 | 29 | import logging 30 | logging.basicConfig(level=logging.DEBUG) 31 | 32 | import w2w.corpora 33 | import string 34 | 35 | from common.mydict import sort as dictsort 36 | 37 | from collections import defaultdict 38 | wordfreq = defaultdict(int) 39 | for l1, l2, f1, f2, falign in w2w.corpora.bicorpora_filenames(): 40 | for w in readwords(f1): wordfreq[(l1,w)] += 1 41 | for w in readwords(f2): wordfreq[(l2,w)] += 1 42 | 43 | for l, f in w2w.corpora.monocorpora_filenames(): 44 | assert 0 45 | 46 | for (l, w) in wordfreq.keys(): 47 | if wordfreq[(l, w)] < HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"]: 48 | del wordfreq[(l, w)] 49 | if w == "*UNKNOWN*": 50 | del wordfreq[(l, w)] 51 | 52 | import w2w.vocabulary 53 | import common.idmap 54 | 55 | wordfreqkeys = [key for cnt, key in dictsort(wordfreq)] 56 | 57 | # for k in wordfreq.keys(): 58 | # print k 59 | v = common.idmap.IDmap([(None, "*LBOUNDARY*"), (None, "*RBOUNDARY*")] + wordfreqkeys, allow_unknown=HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"], unknown_key=(None, "*UNKNOWN*")) 60 | w2w.vocabulary.write(v) 61 | -------------------------------------------------------------------------------- /scripts/w2w/corpora.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for reading corpora. 3 | """ 4 | 5 | from os.path import join, isdir, exists 6 | import sys 7 | import os 8 | import re 9 | import itertools 10 | import string 11 | import logging 12 | 13 | from common.stats import stats 14 | from common.str import percent 15 | 16 | def bicorpora_filenames(): 17 | """ 18 | For each bicorpora language pair in "W2W BICORPORA", traverse that 19 | language pair's subdirectory of DATA_DIR. Find all corpora files in 20 | that directory. 21 | Generator yields: tuples of type (l1, l2, f1, f2, falign), where l1 = 22 | source language, l2 = target language, f1 = source filename, f2 = 23 | target filename, falign = alignment file. 
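    For illustration (these filenames are hypothetical), the pair ("en", "fr")
    with a corpus file foo.en under DATA_DIR/en-fr would yield
        ("en", "fr", ".../en-fr/foo.en", ".../en-fr/foo.fr", ".../en-fr/foo.align.en-fr"),
    assuming the .fr file and the alignment file exist alongside it.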
24 | """ 25 | import common.hyperparameters, hyperparameters 26 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 27 | 28 | for (l1, l2) in HYPERPARAMETERS["W2W BICORPORA"]: 29 | d = join(HYPERPARAMETERS["DATA_DIR"], "%s-%s" % (l1, l2)) 30 | assert isdir(d) 31 | l1re = re.compile("%s$" % l1) 32 | alignre = re.compile("align.*-%s$" % l1) 33 | for f1 in os.listdir(d): 34 | f1 = join(d, f1) 35 | if not l1re.search(f1) or alignre.search(f1): continue 36 | f2 = l1re.sub(l2, f1) 37 | assert exists(f2) 38 | falign = l1re.sub("align.%s-%s" % (l1, l2), f1) 39 | assert exists(falign) 40 | yield l1, l2, f1, f2, falign 41 | 42 | def monocorpora_filenames(): 43 | import common.hyperparameters, hyperparameters 44 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 45 | # Not yet implemented 46 | assert len(HYPERPARAMETERS["W2W MONOCORPORA"]) == 0 47 | return [] 48 | 49 | def bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 50 | """ 51 | Given languages l1 and l2 and their bicorpus filenames f1, f2, and falign, 52 | yield tuples of the former (ws1, ws2, links), 53 | where ws1 are the word ids in the sentence from f1, 54 | where ws2 are the word ids in the sentence from f2, 55 | and links is a list of (i1, i2) word indexes that are linked. 56 | """ 57 | from w2w.vocabulary import wordmap 58 | 59 | i = 0 60 | emptycnt = 0 61 | logging.info("Reading %s,%s sentences and alignments from %s, %s, %s" % (l1, l2, f1, f2, falign)) 62 | fil1, fil2, filalign = open(f1), open(f2), open(falign) 63 | for (s1, s2, salign) in itertools.izip(fil1, fil2, filalign): 64 | # print s1, s2, salign, 65 | i += 1 66 | if i % 100000 == 0: 67 | logging.info("\tRead line %d of %s, %s, %s..." % (i, f1, f2, falign)) 68 | logging.info("\tEmpty sentences are %s..." % (percent(emptycnt, i))) 69 | logging.info("\t%s" % stats()) 70 | 71 | ws1 = [(l1, w1) for w1 in string.split(s1)] 72 | ws2 = [(l2, w2) for w2 in string.split(s2)] 73 | ws1 = [wordmap().id(tok) for tok in ws1] 74 | ws2 = [wordmap().id(tok) for tok in ws2] 75 | 76 | if len(ws1) == 0 or len(ws2) == 0: 77 | emptycnt += 1 78 | continue 79 | 80 | # print ws2, [w2w.vocabulary.wordmap.str(w2) for w2 in ws2] 81 | links = [string.split(link, sep="-") for link in string.split(salign)] 82 | links = [(int(i1), int(i2)) for i1, i2 in links] 83 | 84 | yield ws1, ws2, links 85 | 86 | # Make sure all iterators are exhausted 87 | alldone = 0 88 | try: value = fil1.next() 89 | except StopIteration: alldone += 1 90 | try: value = fil2.next() 91 | except StopIteration: alldone += 1 92 | try: value = filalign.next() 93 | except StopIteration: alldone += 1 94 | assert alldone == 3 95 | 96 | logging.info("DONE. Read line %d of %s, %s, %s..." % (i, f1, f2, falign)) 97 | logging.info("Empty sentences are %s..." % (percent(emptycnt, i))) 98 | logging.info(stats()) 99 | 100 | if __name__ == "__main__": 101 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 102 | print l1, l2, f1, f2, falign 103 | print monocorpora_filenames() 104 | -------------------------------------------------------------------------------- /scripts/w2w/dump-example-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w target vocabulary. 
4 | """ 5 | 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | import common.hyperparameters, common.options 10 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 11 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 12 | 13 | import logging 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | import w2w.examples 17 | for e in w2w.examples.get_all_training_examples_cached(): 18 | print e 19 | -------------------------------------------------------------------------------- /scripts/w2w/dump-target-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w target vocabulary. 4 | """ 5 | 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | import common.hyperparameters, common.options 10 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 11 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 12 | import hyperparameters 13 | 14 | from common.mydict import sort as dictsort 15 | from common.str import percent 16 | 17 | from vocabulary import wordmap, wordform, language 18 | from targetvocabulary import targetmap 19 | 20 | for w1 in wordmap().all: 21 | w1 = wordmap().id(w1) 22 | # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD 23 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 24 | if language(w1) is None: 25 | print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` 26 | continue 27 | if w1 not in targetmap(): 28 | print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` 29 | continue 30 | for l2 in targetmap()[w1]: 31 | totcnt = 0 32 | for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt 33 | print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])] 34 | 35 | print >> sys.stderr, "REVERSE MAP NOW" 36 | 37 | for w1 in wordmap().all: 38 | w1 = wordmap().id(w1) 39 | # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD 40 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 41 | if language(w1) is None: 42 | print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)` 43 | continue 44 | if w1 not in targetmap(name="reverse"): 45 | print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)` 46 | continue 47 | for l2 in targetmap(name="reverse")[w1]: 48 | totcnt = 0 49 | for cnt, w2 in dictsort(targetmap(name="reverse")[w1][l2]): totcnt += cnt 50 | print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap(name="reverse")[w1][l2])] 51 | -------------------------------------------------------------------------------- /scripts/w2w/dump-vocabulary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Dump the w2w vocaulary. 4 | """ 5 | 6 | if __name__ == "__main__": 7 | import common.hyperparameters, common.options 8 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 9 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 10 | import hyperparameters 11 | 12 | from vocabulary import wordmap 13 | for w in wordmap().all: 14 | print w 15 | -------------------------------------------------------------------------------- /scripts/w2w/examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streaming examples. 
3 | """ 4 | 5 | from w2w.corpora import bicorpora_filenames, monocorpora_filenames, bicorpus_sentences_and_alignments 6 | from common.file import myopen 7 | from common.stats import stats 8 | 9 | from w2w.targetvocabulary import targetmap 10 | from w2w.vocabulary import wordmap, language, wordform 11 | import string 12 | import logging 13 | 14 | import random 15 | from rundir import rundir 16 | import os.path 17 | import cPickle 18 | 19 | import murmur 20 | 21 | class MonolingualExample: 22 | def __init__(self, l1, l1seq, w1): 23 | """ 24 | l1 = source language 25 | l1seq = sequence of word IDs in source language 26 | w1 = focus word ID in source language 27 | """ 28 | self.l1 = l1 29 | self.l1seq = l1seq 30 | self.w1 = w1 31 | 32 | if wordform(self.w1) != "*UNKNOWN*": 33 | assert self.l1 == language(self.w1) 34 | 35 | def __str__(self): 36 | return "%s" % `(self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq])` 37 | 38 | class BilingualExample(MonolingualExample): 39 | def __init__(self, l1, l1seq, w1, w2): 40 | """ 41 | l1 = source language 42 | l1seq = sequence of word IDs in source language 43 | w1 = focus word ID in source language 44 | w2 = focus word ID in target language 45 | """ 46 | MonolingualExample.__init__(self, l1, l1seq, w1) 47 | self.w2 = w2 48 | 49 | @property 50 | def l2(self): 51 | return language(self.w2) 52 | 53 | @property 54 | def corrupt(self): 55 | """ 56 | Return a (notw2, weight), a corrupt target word and its weight. 57 | Note: This will return a different random value every call. 58 | """ 59 | from hyperparameters import HYPERPARAMETERS 60 | import random 61 | possible_targets = targetmap()[self.w1][self.l2] 62 | assert len(possible_targets) > 1 63 | assert self.w2 in possible_targets 64 | notw2 = self.w2 65 | cnt = 0 66 | while self.w2 == notw2: 67 | if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0: 68 | notw2 = random.choice(possible_targets) 69 | pr = 1./len(possible_targets) 70 | elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1: 71 | assert 0 72 | # import noise 73 | # from common.myrandom import weighted_sample 74 | # e[-1], pr = weighted_sample(noise.indexed_weights()) 75 | ## from vocabulary import wordmap 76 | ## print wordmap.str(e[-1]), pr 77 | else: 78 | assert 0 79 | cnt += 1 80 | # Backoff to 0gram smoothing if we fail 10 times to get noise. 81 | if cnt > 10: notw2 = random.choice(possible_targets) 82 | 83 | if HYPERPARAMETERS["UNIFORM EXAMPLE WEIGHTS"]: 84 | weight = 1. 85 | else: 86 | weight = 1./pr 87 | return notw2, weight 88 | 89 | def __str__(self): 90 | return "%s" % `(wordmap().str(self.w2), self.l1, wordform(self.w1), [wordmap().str(w)[1] for w in self.l1seq])` 91 | 92 | def get_training_biexample(l1, l2, f1, f2, falign): 93 | """ 94 | Generator of bilingual training examples from this bicorpus. 
95 | """ 96 | import common.hyperparameters 97 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 98 | WINDOW = HYPERPARAMETERS["WINDOW_SIZE"] 99 | 100 | for ws1, ws2, links in bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign): 101 | for i1, i2 in links: 102 | w1 = ws1[i1] 103 | w2 = ws2[i2] 104 | 105 | l2new = language(w2) 106 | assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"] 107 | # Skip translations to unknown words 108 | if wordform(w2) == "*UNKNOWN*": continue 109 | assert l2new == l2 110 | 111 | # Skip translations from unknown words 112 | if wordform(w1) == "*UNKNOWN*": continue 113 | 114 | # If we are filtering examples by lemma 115 | if not(HYPERPARAMETERS["W2W FOCUS LEMMAS"] is None or len (HYPERPARAMETERS["W2W FOCUS LEMMAS"]) == 0): 116 | # print wordmap().str(w1), wordmap().str(w2) 117 | assert language(w1) == "en" 118 | # from lemmatizer import lemmatize 119 | # if lemmatize(language(w1), wordform(w1)) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 120 | # logging.debug("Focus word %s (lemma %s) not in our list of focus lemmas" % (`wordmap().str(w1)`, lemmatize(language(w1), wordform(w1)))) 121 | if wordform(w1) not in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: 122 | logging.debug("Focus word %s not in our list of focus lemmas" % (`wordmap().str(w1)`)) 123 | continue 124 | 125 | if w1 not in targetmap(): 126 | logging.warning("No translations for word %s, skipping" % (`wordmap().str(w1)`)) 127 | continue 128 | 129 | if l2new not in targetmap()[w1]: 130 | logging.warning("Word %s has no translations for language %s, skipping" % (`wordmap().str(w1)`, l2new)) 131 | continue 132 | 133 | if w2 not in targetmap()[w1][l2new]: 134 | logging.error("Word %s cannot translate to word %s, skipping" % (`wordmap().str(w1)`, `wordmap().str(w2)`)) 135 | continue 136 | 137 | if len(targetmap()[w1][l2new]) == 1: 138 | logging.debug("Word %s has only one translation in language %s, skipping" % (`wordmap().str(w1)`, l2new)) 139 | continue 140 | 141 | # Extract the window of tokens around index i1. Pad with *LBOUNDARY* and *RBOUNDARY* as necessary. 142 | min = i1 - (WINDOW-1)/2 143 | max = i1 + (WINDOW-1)/2 144 | lpad = 0 145 | rpad = 0 146 | if min < 0: 147 | lpad = -min 148 | min = 0 149 | if max >= len(ws1): 150 | rpad = max - (len(ws1)-1) 151 | max = len(ws1)-1 152 | assert lpad + (max - min + 1) + rpad == WINDOW 153 | 154 | # print i1 - (WINDOW-1)/2, i1 + (WINDOW-1)/2 155 | # print "min=%d, max=%d, lpad=%d, rpad=%d" % (min, max, lpad, rpad) 156 | seq = [wordmap().id((None, "*LBOUNDARY*"))]*lpad + ws1[min:max+1] + [wordmap().id((None, "*RBOUNDARY*"))]*rpad 157 | # print [wordmap.str(w) for w in seq] 158 | assert len(seq) == WINDOW 159 | # print ws1[i1 - (WINDOW-1)/2:i1 + (WINDOW-1)/2] 160 | 161 | assert seq[(WINDOW-1)/2] == w1 162 | yield BilingualExample(l1, seq, w1, w2) 163 | 164 | def is_validation_example(e): 165 | import common.hyperparameters 166 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 167 | examples_per_validation = int(1/HYPERPARAMETERS["PERCENT_OF_TRAINING_EXAMPLES_FOR_VALIDATION"]) 168 | return murmur.string_hash(`e`) % examples_per_validation == 0 169 | 170 | def get_training_minibatch_online(): 171 | """ 172 | Warning: The approach has the weird property that if one language 173 | pair's corpus is way longer than others, it will be the only examples 174 | for a while after the other corpora are exhausted. 
175 | """ 176 | 177 | assert 0 # We need to filter validation examples 178 | 179 | import common.hyperparameters 180 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 181 | MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"] 182 | 183 | generators = [] 184 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 185 | # print l1, l2, f1, f2, falign 186 | generators.append(get_training_biexample(l1, l2, f1, f2, falign)) 187 | for l, f in monocorpora_filenames(): assert 0 188 | 189 | # Cycles over generators. 190 | idx = 0 191 | last_minibatch = None 192 | while 1: 193 | minibatch = [] 194 | for e in generators[idx]: 195 | minibatch.append(e) 196 | if len(minibatch) >= MINIBATCH_SIZE: 197 | break 198 | if len(minibatch) > 0: 199 | last_minibatch = idx 200 | yield minibatch 201 | elif last_minibatch == idx: 202 | # We haven't had any minibatch in the last cycle over the generators. 203 | # So we are done will all corpora. 204 | break 205 | 206 | # Go to the next corpus 207 | idx = (idx + 1) % len(generators) 208 | 209 | def training_examples_cache_filename(): 210 | import common.hyperparameters, hyperparameters 211 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 212 | return os.path.join(HYPERPARAMETERS["DATA_DIR"], "examples-cache.minfreq=%d.include_unknown=%s.window-%d.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"], HYPERPARAMETERS["WINDOW_SIZE"])) 213 | 214 | _all_examples = None 215 | def all_training_examples_cached(): 216 | global _all_examples 217 | if _all_examples is None: 218 | try: 219 | _all_examples, cnt = cPickle.load(myopen(training_examples_cache_filename())) 220 | assert len(_all_examples) == cnt 221 | logging.info("Successfully read %d training examples from %s" % (cnt, training_examples_cache_filename())) 222 | logging.info(stats()) 223 | except: 224 | logging.info("(Couldn't read training examples from %s, sorry)" % (training_examples_cache_filename())) 225 | logging.info("Caching all training examples...") 226 | logging.info(stats()) 227 | _all_examples = [] 228 | for l1, l2, f1, f2, falign in bicorpora_filenames(): 229 | for e in get_training_biexample(l1, l2, f1, f2, falign): 230 | _all_examples.append(e) 231 | if len(_all_examples) % 10000 == 0: 232 | logging.info("\tcurrently have read %d training examples" % len(_all_examples)) 233 | logging.info(stats()) 234 | random.shuffle(_all_examples) 235 | logging.info("...done caching all %d training examples" % len(_all_examples)) 236 | logging.info(stats()) 237 | 238 | cnt = len(_all_examples) 239 | cPickle.dump((_all_examples, cnt), myopen(training_examples_cache_filename(), "wb"), protocol=-1) 240 | assert len(_all_examples) == cnt 241 | logging.info("Wrote %d training examples to %s" % (cnt, training_examples_cache_filename())) 242 | logging.info(stats()) 243 | assert _all_examples is not None 244 | return _all_examples 245 | 246 | def get_all_training_examples_cached(): 247 | for e in all_training_examples_cached(): 248 | if is_validation_example(e): continue 249 | yield e 250 | 251 | def get_all_validation_examples_cached(): 252 | for e in all_training_examples_cached(): 253 | if not is_validation_example(e): continue 254 | yield e 255 | 256 | def get_training_minibatch_cached(): 257 | import common.hyperparameters 258 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 259 | MINIBATCH_SIZE = HYPERPARAMETERS["MINIBATCH SIZE"] 260 | 261 | minibatch = [] 262 | for e in get_all_training_examples_cached(): 263 | 
minibatch.append(e) 264 | if len(minibatch) >= MINIBATCH_SIZE: 265 | yield minibatch 266 | minibatch = [] 267 | if len(minibatch) > 0: 268 | yield minibatch 269 | minibatch = [] 270 | 271 | if __name__ == "__main__": 272 | for minibatch in get_training_minibatch_cached(): 273 | # print len(minibatch) 274 | for e in minibatch: 275 | print e 276 | -------------------------------------------------------------------------------- /scripts/w2w/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save and load training state. 3 | @todo: Training state variables (cnt, epoch) should all be combined into one object. 4 | """ 5 | 6 | import logging 7 | import os.path 8 | import cPickle 9 | 10 | from common.stats import stats 11 | from common.file import myopen 12 | import common.json 13 | import sys 14 | 15 | _lastfilename = None 16 | def save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr): 17 | global _lastfilename 18 | 19 | filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr)) 20 | logging.info("Writing translation_model to %s..." % filename) 21 | logging.info(stats()) 22 | cPickle.dump(translation_model, myopen(filename, "wb"), protocol=-1) 23 | logging.info("...done writing translation_model to %s" % filename) 24 | logging.info(stats()) 25 | 26 | # if _lastfilename is not None: 27 | # logging.info("Removing old translation_model %s..." % _lastfilename) 28 | # try: 29 | # os.remove(_lastfilename) 30 | # logging.info("...removed %s" % _lastfilename) 31 | # except: 32 | # logging.info("Could NOT remove %s" % _lastfilename) 33 | _lastfilename = filename 34 | 35 | common.json.dumpfile((cnt, lastcnt, epoch, filename), os.path.join(rundir, "trainstate.json")) 36 | 37 | filename = os.path.join(rundir, "newkeystr.txt") 38 | myopen(filename, "wt").write(newkeystr) 39 | 40 | def load(rundir, newkeystr): 41 | """ 42 | Read the directory and load the translation_model, the training count, the training epoch, and the training state. 43 | """ 44 | global _lastfilename 45 | 46 | filename = os.path.join(rundir, "newkeystr.txt") 47 | assert newkeystr == myopen(filename).read() 48 | 49 | (cnt, lastcnt, epoch, filename) = common.json.loadfile(os.path.join(rundir, "trainstate.json")) 50 | 51 | # filename = os.path.join(rundir, "translation_model-%d%s.pkl" % (cnt, newkeystr)) 52 | print >> sys.stderr, ("Reading translation_model from %s..." % filename) 53 | print >> sys.stderr, (stats()) 54 | translation_model = cPickle.load(myopen(filename)) 55 | print >> sys.stderr, ("...done reading translation_model from %s" % filename) 56 | print >> sys.stderr, (stats()) 57 | _lastfilename = filename 58 | 59 | return (translation_model, cnt, lastcnt, epoch) 60 | -------------------------------------------------------------------------------- /scripts/w2w/targetvocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | targetmap[w1][l2][w2] = c means that source word ID w1 mapped to target 3 | language l2 and target word ID w2 with count c. 
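For example (the word IDs here are illustrative, not real vocabulary entries):

    targetmap()[1234]           # dict of target languages seen for source word 1234
    targetmap()[1234]["fr"]     # dict of {target word ID: count} for its French translations
    targetmap(name="reverse")   # a second map stored under the name "reverse"
                                # (see dump-target-vocabulary.py)

The map is built by w2w/build-target-vocabulary.py, read lazily by targetmap()
below, and written to disk by write().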
4 | """ 5 | 6 | import cPickle 7 | from common.file import myopen 8 | from common.stats import stats 9 | import sys 10 | from os.path import join 11 | 12 | def _targetmap_filename(name=""): 13 | import common.hyperparameters, common.options, hyperparameters 14 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 15 | return join(HYPERPARAMETERS["DATA_DIR"], "%stargetmap.minfreq=%d.include_unknown=%s.pkl.gz" % (name, HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 16 | 17 | _targetmap = {} 18 | def targetmap(name=""): 19 | global _targetmap 20 | if name not in _targetmap: 21 | f = _targetmap_filename(name=name) 22 | print >> sys.stderr, "Reading target map from %s..." % f 23 | print >> sys.stderr, stats() 24 | _targetmap[name] = cPickle.load(myopen(f)) 25 | print >> sys.stderr, "...done reading target map from %s" % f 26 | print >> sys.stderr, stats() 27 | return _targetmap[name] 28 | 29 | def write(_targetmap_new, name=""): 30 | """ 31 | Write the word ID map, passed as a parameter. 32 | """ 33 | global _targetmap 34 | assert name not in _targetmap 35 | _targetmap[name] = _targetmap_new 36 | f = _targetmap_filename(name=name) 37 | print >> sys.stderr, "Writing target map to %s..." % f 38 | cPickle.dump(_targetmap[name], myopen(f, "w")) 39 | -------------------------------------------------------------------------------- /scripts/w2w/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import string 5 | import common.dump 6 | from common.file import myopen 7 | from common.stats import stats 8 | from common.str import percent 9 | 10 | import miscglobals 11 | import logging 12 | 13 | import w2w.examples 14 | import diagnostics 15 | import state 16 | 17 | import cPickle 18 | 19 | def validate(translation_model, cnt): 20 | import math 21 | # logranks = [] 22 | # logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt) 23 | # logging.info(stats()) 24 | i = 0 25 | tot = 0 26 | correct = 0 27 | for (i, ve) in enumerate(w2w.examples.get_all_validation_examples_cached()): 28 | correct_sequences, noise_sequences, weights = ebatch_to_sequences([ve]) 29 | source_language = ve.l1 30 | is_correct = translation_model[source_language].validate_errors(correct_sequences, noise_sequences) 31 | # print r 32 | for w in weights: assert w == 1.0 33 | 34 | tot += 1 35 | if is_correct: correct += 1 36 | 37 | if i % 1000 == 0: logging.info("\tvalidating %d examples done..." 
% i) 38 | # logging.info("Validation of model %s at cnt %d: validation err %s" % (translation_model[source_language].modelname, cnt, percent(correct, tot))) 39 | logging.info("VALIDATION of model at cnt %d: validation accuracy %s" % (cnt, percent(correct, tot))) 40 | ## logging.info([wordmap.str(id) for id in ve]) 41 | # logranks.append(math.log(m.validate(ve))) 42 | # if (i+1) % 10 == 0: 43 | # logging.info("Training step %d, validating example %d, mean(logrank) = %.2f, stddev(logrank) = %.2f" % (cnt, i+1, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)))) 44 | # logging.info(stats()) 45 | # logging.info("FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1)) 46 | # logging.info(stats()) 47 | ## print "FINAL VALIDATION AT TRAINING STEP %d: mean(logrank) = %.2f, stddev(logrank) = %.2f, cnt = %d" % (cnt, numpy.mean(numpy.array(logranks)), numpy.std(numpy.array(logranks)), i+1) 48 | ## print stats() 49 | 50 | def ebatch_to_sequences(ebatch): 51 | """ 52 | Convert example batch to sequences. 53 | """ 54 | correct_sequences = [] 55 | noise_sequences = [] 56 | weights = [] 57 | for e in ebatch: 58 | notw2, weight = e.corrupt 59 | correct_sequences.append(e.l1seq + [e.w2]) 60 | noise_sequences.append(e.l1seq + [notw2]) 61 | weights.append(weight) 62 | assert len(ebatch) == len(correct_sequences) 63 | assert len(ebatch) == len(noise_sequences) 64 | assert len(ebatch) == len(weights) 65 | return correct_sequences, noise_sequences, weights 66 | 67 | if __name__ == "__main__": 68 | import common.hyperparameters, common.options 69 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 70 | HYPERPARAMETERS, options, args, newkeystr = common.options.reparse(HYPERPARAMETERS) 71 | import hyperparameters 72 | 73 | from common import myyaml 74 | import sys 75 | print >> sys.stderr, myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals])) 76 | 77 | # We do not allow sophisticated training noise 78 | assert HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0 79 | 80 | from rundir import rundir 81 | rundir = rundir() 82 | 83 | import os.path, os 84 | logfile = os.path.join(rundir, "log") 85 | if newkeystr != "": 86 | verboselogfile = os.path.join(rundir, "log%s" % newkeystr) 87 | print >> sys.stderr, "Logging to %s, and creating link %s" % (logfile, verboselogfile) 88 | os.system("ln -s log %s " % (verboselogfile)) 89 | else: 90 | print >> sys.stderr, "Logging to %s, not creating any link because of default settings" % logfile 91 | 92 | import random, numpy 93 | random.seed(miscglobals.RANDOMSEED) 94 | numpy.random.seed(miscglobals.RANDOMSEED) 95 | 96 | # Random wait if we are a batch job 97 | import time 98 | if not HYPERPARAMETERS["console"]: 99 | wait = 100 * random.random() 100 | print >> sys.stderr, "Waiting %f seconds..." % wait 101 | time.sleep(wait) 102 | 103 | # import vocabulary 104 | ## logging.info("Reading vocab") 105 | ## vocabulary.read() 106 | # 107 | import model 108 | try: 109 | print >> sys.stderr, ("Trying to read training state for %s %s..." 
% (newkeystr, rundir)) 110 | (translation_model, cnt, lastcnt, epoch) = state.load(rundir, newkeystr) 111 | print >> sys.stderr, ("...success reading training state for %s %s" % (newkeystr, rundir)) 112 | print >> sys.stderr, logfile 113 | print >> sys.stderr, "CONTINUING FROM TRAINING STATE" 114 | except IOError: 115 | print >> sys.stderr, ("...FAILURE reading training state for %s %s" % (newkeystr, rundir)) 116 | print >> sys.stderr, ("INITIALIZING") 117 | 118 | translation_model = {} 119 | print >> sys.stderr, "Loading initial embeddings from %s" % HYPERPARAMETERS["INITIAL_EMBEDDINGS"] 120 | # TODO: If we want more than one model, we should SHARE the embeddings parameters 121 | embeddings = cPickle.load(common.file.myopen(HYPERPARAMETERS["INITIAL_EMBEDDINGS"])) 122 | 123 | print >> sys.stderr, "INITIALIZING TRAINING STATE" 124 | 125 | all_l1 = {} 126 | for l1, l2 in HYPERPARAMETERS["W2W BICORPORA"]: all_l1[l1] = True 127 | for l1 in all_l1: 128 | translation_model[l1] = model.Model(modelname="translate-from-%s" % l1, window_size=HYPERPARAMETERS["WINDOW_SIZE"]+1, initial_embeddings=embeddings) 129 | # TODO: I'd like to free this memory, but translation_model doesn't make a copy. 130 | # embeddings = None 131 | cnt = 0 132 | lastcnt = 0 133 | epoch = 1 134 | # get_train_minibatch = examples.TrainingMinibatchStream() 135 | 136 | if HYPERPARAMETERS["console"]: 137 | print >> sys.stderr, "Console mode (not batch mode)." 138 | logging.basicConfig(level=logging.INFO) 139 | else: 140 | print >> sys.stderr, "YOU ARE RUNNING IN BATCH, NOT CONSOLE MODE. THIS WILL BE THE LAST MESSAGE TO STDERR." 141 | logging.basicConfig(filename=logfile, filemode="w", level=logging.INFO) 142 | 143 | assert len(translation_model) == 1 144 | for l1 in HYPERPARAMETERS["W2W MONOCORPORA"]: 145 | assert 0 146 | 147 | # get_train_minibatch = w2w.examples.get_training_minibatch_online() 148 | get_train_minibatch = w2w.examples.get_training_minibatch_cached() 149 | 150 | logging.info(myyaml.dump(common.dump.vars_seq([hyperparameters, miscglobals]))) 151 | 152 | validate(translation_model, 0) 153 | # diagnostics.diagnostics(cnt, m) 154 | ## diagnostics.visualizedebug(cnt, m, rundir) 155 | # state.save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr) 156 | while 1: 157 | logging.info("STARTING EPOCH #%d" % epoch) 158 | for ebatch in get_train_minibatch: 159 | lastcnt = cnt 160 | cnt += len(ebatch) 161 | # # print [wordmap.str(id) for id in e] 162 | 163 | source_language = ebatch[0].l1 164 | for e in ebatch: 165 | # Make sure all examples have the same source language 166 | assert e.l1 == source_language 167 | 168 | # The following is code for training on bilingual examples. 169 | # TODO: Monolingual examples? 
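        # ebatch_to_sequences() turns each BilingualExample into a correct
        # sequence (the source window followed by the true target word) and a
        # noise sequence (the same window followed by a corrupted target word
        # drawn from the example's translation candidates), plus an example
        # weight. The model for this source language is then trained to
        # prefer the correct sequence over the noise sequence.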
170 | 171 | correct_sequences, noise_sequences, weights = ebatch_to_sequences(ebatch) 172 | translation_model[source_language].train(correct_sequences, noise_sequences, weights) 173 | 174 | #validate(translation_model, cnt) 175 | if int(cnt/1000) > int(lastcnt/1000): 176 | logging.info("Finished training step %d (epoch %d)" % (cnt, epoch)) 177 | # print ("Finished training step %d (epoch %d)" % (cnt, epoch)) 178 | if int(cnt/10000) > int(lastcnt/10000): 179 | for l1 in translation_model: 180 | diagnostics.diagnostics(cnt, translation_model[l1]) 181 | if os.path.exists(os.path.join(rundir, "BAD")): 182 | logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD")) 183 | sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD")) 184 | sys.exit(0) 185 | if int(cnt/HYPERPARAMETERS["VALIDATE_EVERY"]) > int(lastcnt/HYPERPARAMETERS["VALIDATE_EVERY"]): 186 | validate(translation_model, cnt) 187 | pass 188 | # for l1 in translation_model: 189 | # diagnostics.visualizedebug(cnt, translation_model[l1], rundir, newkeystr) 190 | 191 | validate(translation_model, cnt) 192 | # get_train_minibatch = w2w.examples.get_training_minibatch_online() 193 | get_train_minibatch = w2w.examples.get_training_minibatch_cached() 194 | epoch += 1 195 | 196 | state.save(translation_model, cnt, lastcnt, epoch, rundir, newkeystr) 197 | # validate(cnt) 198 | -------------------------------------------------------------------------------- /scripts/w2w/vocabulary.py: -------------------------------------------------------------------------------- 1 | """ 2 | wordmap is a map from id to (language, wordform) 3 | """ 4 | 5 | import cPickle 6 | from common.file import myopen 7 | import sys 8 | from os.path import join 9 | 10 | def _wordmap_filename(): 11 | import common.hyperparameters, common.options, hyperparameters 12 | HYPERPARAMETERS = common.hyperparameters.read("language-model") 13 | return join(HYPERPARAMETERS["DATA_DIR"], "idmap.minfreq=%d.include_unknown=%s.pkl.gz" % (HYPERPARAMETERS["W2W MINIMUM WORD FREQUENCY"], HYPERPARAMETERS["INCLUDE_UNKNOWN_WORD"])) 14 | 15 | _wordmap = None 16 | def wordmap(): 17 | global _wordmap 18 | if _wordmap is None: 19 | _wordmap = cPickle.load(myopen(_wordmap_filename())) 20 | _wordmap.str = _wordmap.key 21 | return _wordmap 22 | 23 | def language(id): 24 | """ 25 | Get the language of this word id. 26 | """ 27 | return wordmap().str(id)[0] 28 | 29 | def wordform(id): 30 | """ 31 | Get the word form of this word id. 32 | """ 33 | return wordmap().str(id)[1] 34 | 35 | def write(_wordmap_new): 36 | """ 37 | Write the word ID map, passed as a parameter. 38 | """ 39 | global _wordmap 40 | assert _wordmap is None 41 | _wordmap = _wordmap_new 42 | print >> sys.stderr, "Writing word map with %d words to %s..." 
% (_wordmap.len, _wordmap_filename()) 43 | cPickle.dump(_wordmap, myopen(_wordmap_filename(), "w")) 44 | -------------------------------------------------------------------------------- /scripts/weight-histogram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Plot a histogram of the absolute values of model embeddings 4 | # 5 | # 6 | 7 | PERCENT = 0.01 8 | import random 9 | 10 | import sys 11 | import matplotlib 12 | matplotlib.use( 'Agg' ) # Use non-GUI backend 13 | import pylab 14 | 15 | from optparse import OptionParser 16 | parser = OptionParser() 17 | parser.add_option("-m", "--modelfile", dest="modelfile") 18 | (options, args) = parser.parse_args() 19 | assert options.modelfile is not None 20 | 21 | histfile = "%s.weight-histogram.png" % options.modelfile 22 | 23 | import cPickle 24 | m = cPickle.load(open(options.modelfile)) 25 | #print m.parameters.embeddings.shape 26 | 27 | values = [] 28 | 29 | from vocabulary import wordmap 30 | for i in range(m.parameters.vocab_size): 31 | for v in m.parameters.embeddings[i]: 32 | if random.random() < PERCENT: 33 | values.append(abs(v)) 34 | values.sort() 35 | 36 | print >> sys.stderr, "%d values read (at %f percent) of %d embeddings, %d/%f/%d = %f" % (len(values), PERCENT, m.parameters.vocab_size, len(values), PERCENT, m.parameters.vocab_size, len(values)/PERCENT/m.parameters.vocab_size) 37 | 38 | x = [] 39 | for i, v in enumerate(values): 40 | x.append(1./(len(values)-1) * i) 41 | 42 | print >> sys.stderr, 'Writing weight histogram to %s' % histfile 43 | 44 | pylab.ylim(ymin=0, ymax=1.) 45 | pylab.plot(x, values) 46 | pylab.ylim(ymin=0, ymax=1.) 47 | pylab.savefig(histfile) 48 | pylab.show() 49 | --------------------------------------------------------------------------------