├── gtd
├── textmorph
│   ├── __init__.py
│   ├── edit_model
│   │   ├── __init__.py
│   │   ├── agenda.py
│   │   ├── main.py
│   │   ├── edit_noiser.py
│   │   ├── edit_encoder.py
│   │   └── attention_decoder.py
│   ├── language_model
│   │   ├── __init__.py
│   │   └── main.py
│   ├── turk
│   │   ├── similarity
│   │   │   └── config.txt
│   │   ├── coherence
│   │   │   └── config.txt
│   │   └── turk.py
│   └── data.py
├── .gitignore
├── third-party
│   └── gtd
│       ├── gtd
│       │   ├── __init__.py
│       │   ├── ml
│       │   │   ├── __init__.py
│       │   │   ├── tf
│       │   │   │   ├── __init__.py
│       │   │   │   ├── tests
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── test_framework.py
│       │   │   │   │   └── test_utils.py
│       │   │   │   ├── training_run.py
│       │   │   │   └── profile.py
│       │   │   ├── tests
│       │   │   │   ├── __init__.py
│       │   │   │   ├── test_vocab.py
│       │   │   │   └── test_utils.py
│       │   │   ├── torch
│       │   │   │   ├── __init__.py
│       │   │   │   ├── tests
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── test_recurrent.py
│       │   │   │   │   ├── test_utils.py
│       │   │   │   │   ├── test_token_embedder.py
│       │   │   │   │   ├── test_alignments.py
│       │   │   │   │   ├── test_source_encoder.py
│       │   │   │   │   ├── test_attention.py
│       │   │   │   │   └── test_seq_batch.py
│       │   │   │   ├── recurrent.py
│       │   │   │   ├── feed_forward.py
│       │   │   │   ├── simple_decoder_cell.py
│       │   │   │   ├── training_run.py
│       │   │   │   ├── decoder_cell.py
│       │   │   │   ├── multilayered_decoder_cell.py
│       │   │   │   ├── alignments.py
│       │   │   │   ├── token_embedder.py
│       │   │   │   ├── checkpoints.py
│       │   │   │   ├── attention.py
│       │   │   │   ├── utils.py
│       │   │   │   └── source_encoder.py
│       │   │   ├── utils.py
│       │   │   ├── training_run.py
│       │   │   ├── training_run_viewer.py
│       │   │   └── vocab.py
│       │   ├── tests
│       │   │   ├── __init__.py
│       │   │   ├── test_graph.py
│       │   │   ├── test_io.py
│       │   │   ├── test_log.py
│       │   │   ├── test_lm.py
│       │   │   └── test_utils.py
│       │   ├── git_utils.py
│       │   ├── plot.py
│       │   ├── text.py
│       │   ├── profile_imports.py
│       │   ├── log.py
│       │   ├── codalab.py
│       │   └── graph.py
│       ├── .gitignore
│       ├── requirements.txt
│       ├── setup.py
│       └── scripts
│           ├── git_logs.py
│           ├── run_nlpsub.py
│           └── run_docker.py
├── config.json
├── configs
│   ├── optim
│   │   ├── debug.txt
│   │   └── default.txt
│   ├── eval
│   │   ├── debug.txt
│   │   ├── short.txt
│   │   └── default.txt
│   ├── language_model
│   │   ├── default.txt
│   │   └── onebil.txt
│   └── edit_model
│       ├── autogen.sh
│       ├── edit_test.txt
│       ├── edit_onebil.txt
│       ├── edit_baseline.txt
│       └── edit_logp.txt
├── README.md
├── Dockerfile
└── run_docker.py
/gtd: -------------------------------------------------------------------------------- 1 | third-party/gtd/gtd --------------------------------------------------------------------------------
/textmorph/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea --------------------------------------------------------------------------------
/textmorph/edit_model/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/textmorph/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/tf/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.ipynb 3 | *.pyc 4 | .cache 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | {"docker_image": "kelvinguu/textmorph:1.2", "data_env_var": "TEXTMORPH_DATA"} 2 | -------------------------------------------------------------------------------- /configs/optim/debug.txt: -------------------------------------------------------------------------------- 1 | optim { 2 | seed = 0 # random seed 3 | learning_rate = 0.001 4 | batch_size = 32 # examples per batch 5 | max_iters = 1500 # max number of mini-batch steps to take 6 | } -------------------------------------------------------------------------------- /configs/optim/default.txt: -------------------------------------------------------------------------------- 1 | optim { 2 | seed = 0 # random seed 3 | learning_rate = 0.001 4 | batch_size = 128 # examples per batch 5 | max_iters = 400000 # max number of mini-batch steps to take 6 | } -------------------------------------------------------------------------------- /configs/eval/debug.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 4 # number of examples to periodically evaluate on 3 | big_num_examples = 8 4 | eval_steps = 10 5 | big_eval_steps = 20 6 | save_steps = 50 7 | alive_steps = 5 8 | } -------------------------------------------------------------------------------- /configs/eval/short.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 32 # number of examples to periodically evaluate on 3 | big_num_examples = 512 4 | eval_steps = 500 5 | big_eval_steps = 5000 6 | save_steps = 5000 7 | alive_steps = 30 8 | } 9 | -------------------------------------------------------------------------------- /configs/eval/default.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 32 # number of examples to periodically evaluate on 3 | big_num_examples = 128 4 | eval_steps = 500 5 | big_eval_steps = 5000 6 | save_steps = 5000 7 | alive_steps = 30 8 | } 9 
| -------------------------------------------------------------------------------- /textmorph/turk/similarity/config.txt: -------------------------------------------------------------------------------- 1 | title = "Similarity Task" 2 | description = "Determine similarity of sentences" 3 | keywords = "sentence, similarity" 4 | price = 0.20 # default 5 | duration = 60 * 60 # 60 minutes per HIT 6 | approval_delay = 3600 * 24 * 7 # 7 days for auto-approval 7 | form_json = "form.json" 8 | -------------------------------------------------------------------------------- /third-party/gtd/requirements.txt: -------------------------------------------------------------------------------- 1 | line_profiler==1.0 2 | matplotlib==1.4.3 3 | numpy==1.11.0 4 | psycopg2==2.6.1 5 | pytest==2.9.2 6 | spacy==0.99 7 | SQLAlchemy==1.1.0b3 8 | tensorflow==0.8.0 9 | ipython==5.1.0 10 | scipy==0.18.0 11 | faulthandler==2.4 12 | futures==3.0.5 13 | jsonpickle==0.9.2 14 | fabric==1.12.0 15 | -------------------------------------------------------------------------------- /textmorph/turk/coherence/config.txt: -------------------------------------------------------------------------------- 1 | title = "Coherence Task" 2 | description = "Determine the coherence and grammaticality of sentences" 3 | keywords = "sentence, coherence, grammar" 4 | price = 0.40 # default 5 | duration = 60 * 60 # 60 minutes per HIT 6 | approval_delay = 3600 * 24 * 7 # 7 days for auto-approval 7 | form_json = "form.json" 8 | -------------------------------------------------------------------------------- /textmorph/language_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from gtd.utils import Config 4 | from textmorph.language_model.training_run import LMTrainingRuns 5 | 6 | 7 | arg_parser = argparse.ArgumentParser() 8 | arg_parser.add_argument('config_path') 9 | args = arg_parser.parse_args() 10 | 11 | runs = LMTrainingRuns() 12 | config = Config.from_file(args.config_path) 13 | run = runs.new(config) 14 | 15 | run.train() -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/training_run.py: -------------------------------------------------------------------------------- 1 | from gtd.ml.training_run import TrainingRun 2 | from gtd.utils import cached_property 3 | 4 | 5 | class TFTrainingRun(TrainingRun): 6 | def __init__(self, config, save_dir): 7 | super(TFTrainingRun, self).__init__(config, save_dir) 8 | 9 | @cached_property 10 | def saver(self): 11 | from gtd.ml.tf.utils import Saver 12 | return Saver(self.workspace.checkpoints, keep_checkpoint_every_n_hours=5) -------------------------------------------------------------------------------- /configs/language_model/default.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | model { 5 | vocab_size = 10000 6 | word_dim = 300 7 | agenda_dim = 100 8 | hidden_dim = 100 9 | num_layers = 3 10 | kl_weight_steps = 50000 11 | kl_weight_rate = 8 12 | kl_weight_cap = 1.0 13 | dci_keep_rate = 0.8 14 | wvec_path = glove.6B.300d_yelp.txt 15 | type = 0 # 0 = language model, 1 = SVAE 16 | } 17 | 18 | dataset { 19 | path = yelp_dataset_static 20 | } 21 | -------------------------------------------------------------------------------- /configs/language_model/onebil.txt: -------------------------------------------------------------------------------- 1 | include 
"../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | model { 5 | vocab_size = 10000 6 | word_dim = 300 7 | agenda_dim = 100 8 | hidden_dim = 100 9 | num_layers = 3 10 | kl_weight_steps = 50000 11 | kl_weight_rate = 8 12 | kl_weight_cap = 1.0 13 | dci_keep_rate = 0.8 14 | wvec_path = glove.6B.300d_onebil.txt 15 | type = 0 # 0 = language model, 1 = SVAE 16 | } 17 | 18 | dataset { 19 | path = onebil_dataset_static 20 | } 21 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/git_utils.py: -------------------------------------------------------------------------------- 1 | import git 2 | 3 | def commit_diff(c): 4 | """Return the set of changed files. 5 | 6 | Args: 7 | c (git.Commit) 8 | 9 | Returns: 10 | set[str]: a set of file paths (relative to the git repo's root directory). 11 | """ 12 | changed = set() 13 | 14 | def add_path(blob): 15 | if blob is not None: 16 | changed.add(blob.path) 17 | 18 | prev_c = c.parents[0] 19 | for x in c.diff(prev_c): 20 | add_path(x.a_blob) 21 | add_path(x.b_blob) 22 | return changed -------------------------------------------------------------------------------- /configs/edit_model/autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | options=("attend_pr = 1.0" "lamb_reg = 10.0" "lamb_reg = 25.0" "lamb_reg = 100.0" "edit_dim = 128" "edit_dim = 512" "norm_eps = 0.01" "norm_eps = 0.5" "norm_eps = 1.0" "kill_edit = True") 4 | 5 | 6 | arraylen=${#options[@]} 7 | #for opt in "${options[@]}" 8 | for (( i=0; i<${arraylen}; i++ )); 9 | do 10 | echo $i 11 | echo "include \"edit_baseline.txt\" 12 | editor{ 13 | "${options[$i]}" 14 | }" > configs/edit_model/tmp$i 15 | ./nlpsub.py -g 1 -n testruns 'python textmorph/edit_model/main.py configs/edit_model/tmp'$i 16 | done 17 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_graph.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from gtd.graph import Graph 4 | 5 | 6 | class TestGraph(TestCase): 7 | 8 | def test_shortest_path(self): 9 | 10 | triples = [ 11 | ('1', '2', '3'), 12 | ('3', '4', '5'), 13 | ('1', '0', '5'), 14 | ] 15 | self.assertEqual( 16 | Graph(triples).shortest_path('1', '5'), 17 | ['1', '0', '5'] 18 | ) 19 | self.assertEqual( 20 | Graph(triples[:2]).shortest_path('1', '5'), 21 | ['1', '2', '3', '4', '5'] 22 | ) 23 | -------------------------------------------------------------------------------- /third-party/gtd/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from distutils.core import setup, Command 3 | 4 | 5 | class Test(Command): 6 | user_options = [] 7 | 8 | def initialize_options(self): 9 | pass 10 | 11 | def finalize_options(self): 12 | pass 13 | 14 | def run(self): 15 | import subprocess 16 | errno = subprocess.call(['py.test', '-v', '--doctest-modules', 'gtd']) 17 | raise SystemExit(errno) 18 | 19 | 20 | setup(name='gtd', 21 | version='1.0', 22 | packages=['gtd'], 23 | description='Get things done.', 24 | cmdclass={'test': Test}, 25 | ) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/test_vocab.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from gtd.ml.vocab import SimpleVocab, SimpleEmbeddings 5 | 6 | 7 | 
@pytest.fixture 8 | def vocab(): 9 | return SimpleVocab(['a', 'b', 'c']) 10 | 11 | 12 | @pytest.fixture 13 | def embeds(vocab): 14 | array = np.eye(len(vocab)) 15 | return SimpleEmbeddings(array, vocab) 16 | 17 | 18 | class TestSimpleVocab(object): 19 | def test_save_load(self, vocab, tmpdir): 20 | path = str(tmpdir.join('vocab.txt')) 21 | vocab.save(path) 22 | new_vocab = SimpleVocab.load(path) 23 | assert vocab == new_vocab --------------------------------------------------------------------------------
/textmorph/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gtd.io import Workspace 3 | 4 | 5 | # Set location of local data directory from environment variable 6 | env_var = 'TEXTMORPH_DATA' 7 | if env_var not in os.environ: 8 | assert False, env_var + ' environment variable must be set.' 9 | root = os.environ[env_var] 10 | 11 | # define workspace 12 | workspace = Workspace(root) 13 | 14 | # config 15 | workspace.add_file('config', 'config.txt') 16 | 17 | # Training runs 18 | workspace.add_dir('edit_runs', 'edit_runs') 19 | workspace.add_dir('lm_runs', 'lm_runs') 20 | workspace.add_dir('retriever_runs', 'retriever_runs') 21 | 22 | # user IDs 23 | workspace.add_file('user_ids', 'user_ids.json') 24 | 25 | # word vectors 26 | workspace.add_dir('word_vectors', 'word_vectors') 27 | 28 | # nearest neighbors 29 | workspace.add_dir('nearest_sentences', 'nearest_sentences') --------------------------------------------------------------------------------
/textmorph/edit_model/agenda.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear 3 | 4 | 5 | class AgendaMaker(Module): 6 | def __init__(self, source_dim, edit_dim, agenda_dim): 7 | super(AgendaMaker, self).__init__() 8 | self.linear = Linear(source_dim + edit_dim, agenda_dim) 9 | 10 | def forward(self, source_embed, edit_embed): 11 | """Create agenda vector from source text embedding and edit embedding.
12 | 13 | Args: 14 | source_embed (Variable): of shape (batch_size, source_dim) 15 | edit_embed (Variable): of shape (batch_size, edit_dim) 16 | 17 | Returns: 18 | agenda (Variable): of shape (batch_size, agenda_dim) 19 | """ 20 | inp = torch.cat([source_embed, edit_embed], 1) # (batch_size, source_dim + edit_dim) 21 | return self.linear(inp) --------------------------------------------------------------------------------
/third-party/gtd/scripts/git_logs.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | from os.path import join 3 | 4 | import sys 5 | print sys.path 6 | 7 | from gtd.git_utils import commit_diff 8 | from gtd.chrono import verboserate 9 | 10 | 11 | repo_path = sys.argv[1] 12 | max_count = int(sys.argv[2]) 13 | files = set(sys.argv[3:]) 14 | 15 | def format_commit(c): 16 | msg = c.message.split('\n')[0] 17 | return '{}\t{}'.format(c.hexsha, msg) 18 | 19 | repo = Repo(repo_path) 20 | commits = list(repo.iter_commits('master', max_count=max_count)) 21 | lines = [] 22 | for c in verboserate(commits, desc='Scanning commits', total=max_count): 23 | if len(files & commit_diff(c)) == 0: 24 | continue 25 | lines.append(format_commit(c)) 26 | 27 | log_path = join(repo_path, 'git-logs.tsv') 28 | with open(log_path, 'w') as f: 29 | for line in lines: 30 | f.write(line) 31 | f.write('\n') --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Neural editor 2 | 3 | Source code accompanying our paper, "Generating Sentences by Editing Prototypes" ([paper](https://arxiv.org/abs/1709.08878), [slides](http://kelvinguu.com/posts/generating-sentences-by-editing-prototypes/)). 4 | 5 | **Authors:** Kelvin Guu\*, Tatsunori B. Hashimoto\*, Yonatan Oren, Percy Liang (\* equal contribution) 6 | 7 | 8 | - A detailed description of the training algorithm is now [available here](http://kelvinguu.com/public/projects/neural_editor_training.pdf). 9 | - We are drafting a more detailed README in the 10 | [README](https://github.com/kelvinguu/neural-editor/tree/readme) branch (see that branch for dataset links). 11 | - This is research code meant to serve as a reference implementation. We do not 12 | recommend heavily extending or modifying this codebase for other purposes. 13 | 14 | If you have questions, please email Kelvin at `guu.kelvin` at `gmail.com`. 
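As a rough usage sketch (not an official recipe): set the `TEXTMORPH_DATA` environment variable to your data directory (see `textmorph/data.py`), then pass one or more config files to an entry point, e.g. `python textmorph/edit_model/main.py configs/edit_model/edit_baseline.txt`.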
15 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_recurrent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | 4 | from gtd.ml.torch.recurrent import tile_state, gated_update 5 | from gtd.ml.torch.utils import assert_tensor_equal 6 | 7 | 8 | def test_tile_state(): 9 | h = GPUVariable(torch.FloatTensor([1, 2, 3])) 10 | h_tiled = tile_state(h, 3) 11 | assert_tensor_equal(h_tiled, [[1, 2, 3], [1, 2, 3], [1, 2, 3]]) 12 | 13 | 14 | def test_gated_update(): 15 | h = GPUVariable(torch.FloatTensor([ 16 | [1, 2, 3], 17 | [4, 5, 6], 18 | ])) 19 | h_new = GPUVariable(torch.FloatTensor([ 20 | [-1, 2, 3], 21 | [4, 8, 0], 22 | ])) 23 | update = GPUVariable(torch.FloatTensor([[0], [1]])) # only update the second row 24 | 25 | out = gated_update(h, h_new, update) 26 | 27 | assert_tensor_equal(out, [ 28 | [1, 2, 3], 29 | [4, 8, 0] 30 | ]) --------------------------------------------------------------------------------
/configs/edit_model/edit_test.txt: -------------------------------------------------------------------------------- 1 | include "../optim/debug.txt" 2 | include "../eval/debug.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 256 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.1 19 | attend_pr = 0.5 20 | enable_vae = True 21 | lamb_reg = 50.0 22 | norm_eps = 0.1 23 | norm_max = 7.5 24 | kill_edit = True 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_yelp.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = yelp_dataset_small 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/configs/edit_model/edit_onebil.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 256 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.01 19 | attend_pr = 0.0 20 | enable_vae = True 21 | lamb_reg = 50.0 22 | norm_eps = 0.1 23 | norm_max = 7.5 24 | kill_edit = True 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_onebil.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = onebillion_split 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/configs/edit_model/edit_baseline.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/short.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | 
word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 128 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.1 19 | attend_pr = 0.0 20 | enable_vae = True 21 | lamb_reg = 100.0 22 | norm_eps = 0.1 23 | norm_max = 14.0 24 | kill_edit = False 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_yelp.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = yelp_dataset_large_split 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from gtd.ml.torch.utils import expand_dims_for_broadcast, assert_tensor_equal, is_binary 5 | 6 | 7 | def test_expand_dims_for_broadcast(): 8 | low_tensor = torch.FloatTensor([[1, 2, 3], [4, 5, 6]]) # (2, 3) 9 | high_tensor = torch.zeros(2, 3, 8, 1) 10 | 11 | new_tensor = expand_dims_for_broadcast(low_tensor, high_tensor) 12 | 13 | assert new_tensor.size() == (2, 3, 1, 1) 14 | 15 | assert_tensor_equal(new_tensor.squeeze(), low_tensor) 16 | 17 | with pytest.raises(AssertionError): 18 | bad_tensor = torch.zeros(2, 4, 8, 1) # prefix doesn't match 19 | expand_dims_for_broadcast(low_tensor, bad_tensor) 20 | 21 | 22 | def test_is_binary(): 23 | t1 = torch.FloatTensor([0, 1, 0, 0]) 24 | t2 = torch.FloatTensor([0, -1, 0, 0]) 25 | t3 = torch.FloatTensor([0, 0.1, 0.2, 0]) 26 | assert is_binary(t1) 27 | assert not is_binary(t2) 28 | assert not is_binary(t3) --------------------------------------------------------------------------------
/configs/edit_model/edit_logp.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | optim { 5 | learning_rate = 0.001 6 | } 7 | 8 | seed = 0 9 | 10 | editor { 11 | decoder_cell = AttentionDecoderCell 12 | vocab_size = 10000 # a proper size would be >20000 13 | word_dim = 300 14 | hidden_dim = 256 # hidden state dim of encoder and decoder 15 | agenda_dim = 256 # agenda vector dim 16 | edit_dim = 128 # edit vector dimension 17 | attention_dim = 128 18 | encoder_layers = 3 19 | decoder_layers = 3 20 | no_insert_delete_attn = False 21 | edit_dropout = True 22 | ident_pr = 0.1 23 | attend_pr = 0.0 24 | enable_vae = True 25 | lamb_reg = 15.0 26 | norm_eps = 1.0 27 | norm_max = 10.0 28 | kill_edit = False 29 | embed_sentence = False 30 | wvec_path = glove.6B.300d_yelp.txt 31 | } 32 | 33 | dataset { 34 | # this path should be relative to $TEXTMORPH_DATA 35 | path = yelp_dataset_large_split 36 | use_diff = True 37 | } 38 | 39 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/tf/profile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tensorflow as tf 3 | from tensorflow.python.client import timeline 4 | 5 | 6 | class ProfiledSession(tf.Session): 7 | def __init__(self, *args, **kwargs): 8 | super(ProfiledSession, self).__init__(*args, **kwargs) 9 | 10 | def run(self, fetches, feed_dict=None): 11 | """like Session.run, but return a Timeline object in Chrome trace format (JSON).
12 | 13 | Save the json to a file, go to chrome://tracing, and open the file. 14 | 15 | Args: 16 | fetches 17 | feed_dict 18 | 19 | Returns: 20 | dict: a JSON dict 21 | """ 22 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 23 | run_metadata = tf.RunMetadata() 24 | super(ProfiledSession, self).run(fetches, feed_dict, options=options, run_metadata=run_metadata) 25 | 26 | # Create the Timeline object, and write it to a json 27 | tl = timeline.Timeline(run_metadata.step_stats) 28 | ctf = tl.generate_chrome_trace_format() 29 | return json.loads(ctf) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def temperature_smooth(sampling_probs, temperature): 5 | """Smooth a discrete distribution by raising/lowering temperature. 6 | 7 | Args: 8 | sampling_probs (np.ndarray): 1D numpy array 9 | temperature (float) 10 | 11 | Returns: 12 | np.ndarray: 1D array of same shape as sampling_probs 13 | """ 14 | if not isinstance(sampling_probs, np.ndarray): 15 | raise TypeError("sampling_probs must be numpy array.") 16 | 17 | if temperature <= 0: 18 | raise ValueError("Temperature must be positive.") 19 | 20 | if not np.isfinite(temperature): 21 | raise ValueError("Temperature must be finite.") 22 | 23 | if abs(np.sum(sampling_probs) - 1.0) > 0.001: 24 | raise ValueError("sampling_probs must sum to 1.") 25 | 26 | if not np.all(sampling_probs >= 0): 27 | raise ValueError("sampling_probs must all be non-negative.") 28 | 29 | logits = np.log(sampling_probs) # should be in range [-inf, 0] 30 | unnormalized = np.exp(logits / temperature) 31 | probs = unnormalized / np.sum(unnormalized) 32 | return probs -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_io.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gtd.io import IntegerDirectories, split_path 4 | 5 | 6 | class TestIntegerDirectories(object): 7 | @pytest.fixture 8 | def int_dirs(self, tmpdir): 9 | tmpdir.mkdir('152_blah') 10 | tmpdir.mkdir('153_woo') 11 | tmpdir.mkdir('1_') # no suffix, should still match 12 | tmpdir.mkdir('-1') # no suffix, should still match 13 | tmpdir.mkdir('_10') # prefix is not integer, ignore 14 | tmpdir.mkdir('.DS_Store') 15 | tmpdir.mkdir('other') 16 | return IntegerDirectories(str(tmpdir)) 17 | 18 | def test_keys(self, int_dirs): 19 | assert int_dirs.keys() == [-1, 1, 152, 153] 20 | assert len(int_dirs) == 4 21 | 22 | def test_largest_int(self, int_dirs): 23 | assert int_dirs.largest_int == 153 24 | 25 | def test_new_dir(self, tmpdir, int_dirs): 26 | correct = str(tmpdir.join('154')) 27 | assert int_dirs.new_dir() == correct 28 | 29 | def test_new_dir_named(self, tmpdir, int_dirs): 30 | correct = str(tmpdir.join('154')) + '_foobar' 31 | assert int_dirs.new_dir('foobar') == correct 32 | 33 | 34 | def test_split_path(): 35 | path = '/Users/Joe/Documents/file.txt' 36 | assert split_path(path) == ['Users', 'Joe', 'Documents', 'file.txt'] -------------------------------------------------------------------------------- /textmorph/edit_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from gtd.io import save_stdout 4 | from gtd.log import set_log_level 5 | from gtd.utils import Config 6 | from textmorph.edit_model.training_run import 
EditTrainingRuns 7 | 8 | set_log_level('DEBUG') 9 | 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument('exp_id', nargs='+') 13 | arg_parser.add_argument('-c', '--check_commit', default='strict') 14 | arg_parser.add_argument('-p', '--profile', action='store_true') 15 | args = arg_parser.parse_args() 16 | 17 | # create experiment 18 | experiments = EditTrainingRuns(check_commit=(args.check_commit=='strict')) 19 | 20 | exp_id = args.exp_id 21 | if exp_id == ['default']: 22 | # new default experiment 23 | exp = experiments.new() 24 | elif len(exp_id) == 1 and exp_id[0].isdigit(): 25 | # reload old experiment 26 | exp = experiments[int(exp_id[0])] 27 | else: 28 | # new experiment according to configs 29 | config = Config.from_file(exp_id[0]) 30 | for filename in exp_id[1:]: 31 | config = Config.merge(config, Config.from_file(filename)) 32 | exp = experiments.new(config) # new experiment from config 33 | 34 | # start training 35 | exp.workspace.add_file('stdout', 'stdout.txt') 36 | exp.workspace.add_file('stderr', 'stderr.txt') 37 | 38 | 39 | with save_stdout(exp.workspace.root): 40 | exp.train() 41 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_log.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gtd.log import Metadata, SyncedMetadata 4 | 5 | 6 | class TestMetadata(object): 7 | @pytest.fixture 8 | def m(self): 9 | m = Metadata() 10 | m['a'] = 10 # this is overwritten 11 | m['b'] = 'test' 12 | 13 | # namescope setitem 14 | with m.name_scope('c'): 15 | m['foo'] = 140 16 | 17 | # nested setitem 18 | m['a.foo'] = 120 19 | m['c.bar'] = 'what' 20 | 21 | return m 22 | 23 | def test_getitem(self, m): 24 | assert m['b'] == 'test' 25 | 26 | def test_nested_getitem(self, m): 27 | assert m['a.foo'] == 120 28 | assert m['c.foo'] == 140 29 | 30 | def test_namescope_getitem(self, m): 31 | with m.name_scope('c'): 32 | assert m['bar'] == 'what' 33 | 34 | def test_nested_metadata(self, m): 35 | m_sub = m['a'] 36 | assert isinstance(m_sub, Metadata) 37 | assert m_sub['foo'] == 120 38 | 39 | def test_contains(self, m): 40 | assert 'b' in m 41 | assert 'bar' not in m 42 | assert 'c.bar' in m 43 | 44 | 45 | class TestSyncedMetadata(TestMetadata): # run all the metadata tests 46 | def test_syncing(self, tmpdir): 47 | meta_path = str(tmpdir.join('meta.txt')) 48 | s = SyncedMetadata(meta_path) 49 | 50 | with s.name_scope('job'): 51 | s['memory'] = 128 52 | 53 | s2 = SyncedMetadata(meta_path) # reload the file 54 | 55 | assert s2['job.memory'] == 128 -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from numpy.testing import assert_almost_equal 4 | 5 | from gtd.ml.utils import temperature_smooth 6 | 7 | 8 | def test_temperature_smooth(): 9 | smooth = lambda probs, temp: temperature_smooth(np.array(probs, dtype=np.float32), temp) 10 | same = lambda x1, x2: assert_almost_equal(x1, x2, decimal=4) 11 | 12 | probs = [0., 0.2, 0.4, 0.4] 13 | third = 1./3 14 | correct = [0., third, third, third] 15 | same(smooth(probs, 100000), correct) 16 | 17 | # doesn't sum to 1 18 | with pytest.raises(ValueError): 19 | smooth([1, 2, 0], 1) 20 | 21 | # contains negative numbers 22 | with pytest.raises(ValueError): 23 | smooth([1, -1, 1], 1) 24 | 25 | # temperature = 0 26 | with 
pytest.raises(ValueError): 27 | probs = [0, 0.25, 0.75, 0] 28 | smooth(probs, 0) 29 | 30 | # temperature = inf 31 | with pytest.raises(ValueError): 32 | probs = [0, 0.25, 0.75, 0] 33 | smooth(probs, float('inf')) 34 | 35 | # temperature = 1 36 | probs = [0, 0.25, 0.75, 0] 37 | same(smooth(probs, 1), probs) # shouldn't alter probs 38 | 39 | # contains 1 40 | probs = [1, 0, 0] 41 | same(smooth(probs, 10), probs) 42 | same(smooth(probs, 0.1), probs) 43 | 44 | a = np.exp(2) 45 | b = np.exp(3) 46 | 47 | probs = [0, a/(a+b), b/(a+b)] 48 | smoothed = smooth(probs, 11) 49 | 50 | a2 = np.exp(2. / 11) 51 | b2 = np.exp(3. / 11) 52 | correct = [0, a2/(a2+b2), b2/(a2+b2)] 53 | same(smoothed, correct) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/recurrent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | from torch.nn import Module 4 | 5 | from gtd.ml.torch.utils import conditional 6 | 7 | 8 | def tile_state(h, batch_size): 9 | """Tile a given hidden state batch_size times. 10 | 11 | Args: 12 | h (Variable): a single hidden state of shape (hidden_dim,) 13 | batch_size (int) 14 | 15 | Returns: 16 | a Variable of shape (batch_size, hidden_dim) 17 | """ 18 | tiler = GPUVariable(torch.ones(batch_size, 1)) 19 | return torch.mm(tiler, h.unsqueeze(0)) # (batch_size, hidden_size) 20 | 21 | 22 | def gated_update(h, h_new, update): 23 | """If update == 1.0, return h_new; if update == 0.0, return h. 24 | 25 | Applies this logic to each element in a batch. 26 | 27 | Args: 28 | h (Variable): of shape (batch_size, hidden_dim) 29 | h_new (Variable): of shape (batch_size, hidden_dim) 30 | update (Variable): of shape (batch_size, 1). 31 | 32 | Returns: 33 | Variable: of shape (batch_size, hidden_dim) 34 | 35 | """ 36 | batch_size, hidden_dim = h.size() 37 | gate = update.expand(batch_size, hidden_dim) 38 | return conditional(gate, h_new, h) 39 | 40 | 41 | class AdditionCell(Module): 42 | """Just add the input vector to the hidden state vector.""" 43 | 44 | def __init__(self, input_dim, hidden_dim): 45 | super(AdditionCell, self).__init__() 46 | self.W = GPUVariable(torch.eye(input_dim, hidden_dim)) 47 | # truncates input if input_dim > hidden_dim 48 | # pads with zeros if input_dim < hidden_dim 49 | self.hidden_size = hidden_dim 50 | 51 | def forward(self, x, hc): 52 | h, c = hc 53 | h = x.mm(self.W) + h 54 | return h, c -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/feed_forward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear 3 | 4 | 5 | class FeedForwardNetwork(Module): 6 | """A standard feedforward network, with residual connections for equal-sized layers.""" 7 | def __init__(self, layer_dims): 8 | """Construct network. 9 | 10 | For len(layer_dims) == 3: 11 | 12 | y = f(x * W1 + b1) * W2 + b2 13 | 14 | x: (batch_size, layer_dims[0]) 15 | W1: (layer_dims[0], layer_dims[1]) 16 | W2: (layer_dims[1], layer_dims[2]) 17 | 18 | Note that there is no nonlinearity after final linear transform. 
19 | 20 | Args: 21 | layer_dims (list[int]): 22 | layer_dims[0] = input dimension 23 | layer_dims[-1] = output dimension 24 | """ 25 | if len(layer_dims) < 3: 26 | raise ValueError("len(layer_dims) == 2 is just linear, and fewer layers does not make sense.") 27 | 28 | super(FeedForwardNetwork, self).__init__() 29 | self.nonlinearity = torch.nn.Tanh() # same for all layers 30 | self.layers = [] 31 | for i in range(len(layer_dims) - 1): 32 | # these layers include a bias term 33 | layer = Linear(layer_dims[i], layer_dims[i + 1]) 34 | # make sure to register sub-module 35 | self.add_module('linear_{}'.format(i), layer) 36 | self.layers.append(layer) 37 | 38 | def forward(self, x): 39 | for i, layer in enumerate(self.layers): 40 | x_prev = x 41 | x = layer(x) 42 | if i != len(self.layers) - 1: 43 | x = self.nonlinearity(x) # apply nonlinearity if it is not the final layer 44 | 45 | if x.size() == x_prev.size(): 46 | x = x + x_prev # residual connection 47 | 48 | return x --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_token_embedder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from torch.nn import Embedding 5 | 6 | from gtd.ml.torch.token_embedder import TokenEmbedder 7 | from gtd.ml.torch.utils import GPUVariable 8 | from gtd.ml.torch.utils import assert_tensor_equal 9 | from gtd.ml.vocab import SimpleVocab 10 | from gtd.utils import Bunch 11 | 12 | 13 | class TestTokenEmbedder(object): 14 | @pytest.fixture 15 | def embedder(self): 16 | vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c']) 17 | arr = np.eye(len(vocab), dtype=np.float32) 18 | word_embeddings = Bunch(vocab=vocab, array=arr) 19 | return TokenEmbedder(word_embeddings) 20 | 21 | def test_embedding_from_array(self): 22 | emb = TokenEmbedder._embedding_from_array(np.array([[9, 9], [8, 7]], dtype=np.float32)) 23 | assert isinstance(emb, Embedding) 24 | values = emb(GPUVariable(torch.LongTensor([[0, 0], [1, 0]]))) 25 | 26 | assert_tensor_equal(values, 27 | [ 28 | [[9, 9], [9, 9]], 29 | [[8, 7], [9, 9]], 30 | ]) 31 | 32 | def test_embed_indices(self, embedder): 33 | indices = GPUVariable(torch.LongTensor([ 34 | [0, 1], 35 | [2, 2], 36 | [4, 5], 37 | ])) 38 | 39 | embeds = embedder.embed_indices(indices) 40 | 41 | assert_tensor_equal(embeds, [ 42 | [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]], 43 | [[0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0]], 44 | [[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]], 45 | ]) 46 | 47 | def test_embed_tokens(self, embedder): 48 | tokens = ['b', 'c', 'c'] 49 | embeds = embedder.embed_tokens(tokens) 50 | 51 | assert_tensor_equal(embeds, [ 52 | [0, 0, 0, 0, 1, 0], 53 | [0, 0, 0, 0, 0, 1], 54 | [0, 0, 0, 0, 0, 1], 55 | ]) --------------------------------------------------------------------------------
/textmorph/edit_model/edit_noiser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from textmorph.edit_model.editor import EditExample 4 | 5 | 6 | class EditNoiser(object): 7 | 8 | def __init__(self, ident_pr = 0.1, attend_pr = 0.5): 9 | self.ident_pr = ident_pr 10 | self.attend_pr = attend_pr 11 | 12 | def __call__(self, examples): 13 | """Return a batch of noisy EditExamples. 14 | 15 | Does not modify the original EditExamples.
16 | """ 17 | return [self._noise(ex) for ex in examples] 18 | 19 | def dropout_split(self, word_list): 20 | pr_list = [1.0-self.attend_pr, self.attend_pr] 21 | if len(word_list)>0: 22 | num_sampled = np.random.choice(np.arange(len(pr_list)), 1, p=pr_list) 23 | num_sampled = min(num_sampled, len(word_list)) 24 | choice_index = np.random.choice(np.arange(len(word_list)), num_sampled, replace=False) 25 | mask = np.zeros(len(word_list), dtype=bool) 26 | mask[choice_index] = True 27 | warray = np.array(word_list) 28 | return (warray[mask]).tolist(), (warray[np.invert(mask)]).tolist() 29 | else: 30 | return [], [] 31 | 32 | def _noise(self, ex): 33 | """Return a noisy EditExample. 34 | 35 | Note: this strategy is only appropriate for diff-style EditExamples. 36 | 37 | Args: 38 | ex (EditExample) 39 | 40 | Returns: 41 | EditExample: a new example. Does not modify the original example. 42 | """ 43 | ident_map = np.random.binomial(1,self.ident_pr) 44 | if ident_map: 45 | return EditExample(ex.source_words, [], [], [], [], ex.source_words) 46 | else: 47 | insert_exact, insert_approx= self.dropout_split(ex.insert_exact_words) 48 | delete_exact, delete_approx = self.dropout_split(ex.delete_exact_words) 49 | return EditExample(ex.source_words, insert_approx, insert_exact, delete_approx, delete_exact, ex.target_words) 50 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_lm.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from numpy.testing import assert_approx_equal 3 | from gtd.lm import last_k, CountLM, LMSampler, normalize_counts 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def lm(): 9 | return CountLM(3) 10 | 11 | 12 | @pytest.fixture 13 | def lm_sampler(lm): 14 | return LMSampler(lm) 15 | 16 | 17 | def test_last_k(): 18 | tokens = [1, 2, 3, 4] 19 | assert last_k(tokens, 2) == (3, 4) 20 | assert last_k(tokens, 4) == (1, 2, 3, 4) 21 | assert last_k(tokens, 0) == tuple() 22 | 23 | 24 | def test_get_contexts(lm): 25 | tokens = [1, 2, 3, 4, 5] 26 | assert list(lm._get_contexts(tokens)) == [tuple(), (5,), (4, 5), (3, 4, 5)] 27 | 28 | assert list(lm._get_contexts([1, 2])) == [tuple(), (2,), (1, 2)] 29 | 30 | 31 | def test_largest_known_context(lm): 32 | contexts = {tuple(), (3,), (2, 3), (1, 2)} 33 | assert lm._largest_context([1, 2, 3], contexts) == (2, 3) 34 | assert lm._largest_context([2, 3, 0], contexts) == tuple() 35 | 36 | 37 | def test_normalize_counts(): 38 | c = Counter([1, 1, 2, 2, 3]) 39 | assert normalize_counts(c) == Counter({1: .4, 2: .4, 3: .2}) 40 | 41 | 42 | @pytest.mark.skip 43 | def test_sample_from_distribution(lm_sampler): 44 | distr = {'a': 0.3, 'b': 0.7} 45 | ctr = Counter() 46 | # law of large numbers test 47 | for k in range(100000): 48 | ctr[lm_sampler._sample_from_distribution(distr)] += 1 49 | empirical = normalize_counts(ctr) 50 | for key in distr.keys() + empirical.keys(): 51 | assert_approx_equal(empirical[key], distr[key], significant=2) 52 | 53 | def test_sequence_probability(lm): 54 | lm = CountLM(3) 55 | lines = ['apple pear banana', 'pear banana apple', 'banana pear banana'] 56 | for line in lines: 57 | tokens = line.split() 58 | lm.record_counts(tokens, append_end=True) 59 | 60 | probs = lm.sequence_probability(['pear', 'apple', 'pear']) 61 | assert probs == [('pear', 0.3333333333333333), ('apple', 0.0), ('pear', 0.5)] -------------------------------------------------------------------------------- 
/third-party/gtd/gtd/ml/torch/simple_decoder_cell.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import torch 4 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 5 | 6 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 7 | from gtd.ml.torch.recurrent import tile_state, gated_update 8 | 9 | 10 | class SimpleRNNState(namedtuple('SimpleRNNState', ['h', 'c']), RNNState): 11 | pass 12 | 13 | 14 | class SimpleRNNInput(namedtuple('SimpleRNNInput', ['x', 'agenda']), RNNInput): 15 | pass 16 | 17 | 18 | class SimpleDecoderCell(DecoderCell): 19 | def __init__(self, token_embedder, hidden_dim, input_dim, agenda_dim): 20 | super(SimpleDecoderCell, self).__init__() 21 | self.rnn_cell = LSTMCell(input_dim + agenda_dim, hidden_dim) 22 | self.linear = Linear(hidden_dim, input_dim) 23 | self.h0 = Parameter(torch.zeros(hidden_dim)) 24 | self.c0 = Parameter(torch.zeros(hidden_dim)) 25 | self.softmax = Softmax() 26 | self.token_embedder = token_embedder 27 | 28 | @property 29 | def rnn_state_type(self): 30 | return SimpleRNNState 31 | 32 | @property 33 | def rnn_input_type(self): 34 | return SimpleRNNInput 35 | 36 | def initialize(self, batch_size): 37 | h = tile_state(self.h0, batch_size) 38 | c = tile_state(self.c0, batch_size) 39 | return SimpleRNNState(h, c) 40 | 41 | def forward(self, rnn_state, rnn_input, advance): 42 | rnn_input_embed = torch.cat([rnn_input.x, rnn_input.agenda], 1) 43 | h, c = self.rnn_cell(rnn_input_embed, (rnn_state.h, rnn_state.c)) 44 | 45 | # don't update if sequence has terminated 46 | h = gated_update(rnn_state.h, h, advance) 47 | c = gated_update(rnn_state.c, c, advance) 48 | 49 | query = self.linear(h) 50 | word_vocab = self.token_embedder.vocab 51 | word_embeds = self.token_embedder.embeds 52 | vocab_logits = torch.mm(query, word_embeds.t()) # (batch_size, vocab_size) 53 | vocab_probs = self.softmax(vocab_logits) 54 | 55 | # no attention over source, insert and delete embeds 56 | rnn_state = SimpleRNNState(h, c) 57 | 58 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) -------------------------------------------------------------------------------- /third-party/gtd/gtd/plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for plotting 3 | """ 4 | import os 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from gtd.io import makedirs 9 | from gtd.log import in_ipython 10 | 11 | 12 | def hinton(matrix, max_weight=None, ax=None, xtick=None, ytick=None, inverted_color=False): 13 | """Draw Hinton diagram for visualizing a weight matrix. 
14 | 15 | Copied from: http://matplotlib.org/examples/specialty_plots/hinton_demo.html 16 | """ 17 | ax = ax if ax is not None else plt.gca() 18 | if not max_weight: 19 | max_weight = 2**np.ceil(np.log(np.abs(matrix).max())/np.log(2)) 20 | 21 | ax.patch.set_facecolor('gray') 22 | ax.set_aspect('equal', 'box') 23 | ax.xaxis.set_major_locator(plt.NullLocator()) 24 | ax.yaxis.set_major_locator(plt.NullLocator()) 25 | 26 | for (x, y), w in np.ndenumerate(matrix): 27 | if inverted_color: 28 | color = 'black' if w > 0 else 'white' 29 | else: 30 | color = 'white' if w > 0 else 'black' 31 | size = np.sqrt(np.abs(w)) 32 | rect = plt.Rectangle([x - size / 2, y - size / 2], size, size, 33 | facecolor=color, edgecolor=color) 34 | ax.add_patch(rect) 35 | 36 | ax.autoscale_view() 37 | ax.invert_yaxis() 38 | 39 | if xtick: 40 | ax.set_xticks(np.arange(matrix.shape[0])) 41 | ax.set_xticklabels(xtick) 42 | if ytick: 43 | ax.set_yticks(np.arange(matrix.shape[1])) 44 | ax.set_yticklabels(ytick) 45 | return ax 46 | 47 | 48 | def show(title, directory=''): 49 | """If in IPython, show, otherwise, save to file.""" 50 | import matplotlib.pyplot as plt 51 | if in_ipython(): 52 | plt.show() 53 | else: 54 | # ensure directory exists 55 | makedirs(directory) 56 | 57 | plt.savefig(os.path.join(directory, title) + '.png') 58 | # close all figures to conserve memory 59 | plt.close('all') 60 | 61 | 62 | def plot_pdf(x, cov_factor=None, *args, **kwargs): 63 | import matplotlib.pyplot as plt 64 | from scipy.stats import gaussian_kde 65 | density = gaussian_kde(x) 66 | xgrid = np.linspace(min(x), max(x), 200) 67 | if cov_factor is not None: 68 | density.covariance_factor = lambda: cov_factor 69 | density._compute_covariance() 70 | y = density(xgrid) 71 | plt.plot(xgrid, y, *args, **kwargs) 72 | 73 | 74 | def rgb_to_hex(rgb): 75 | return '#%02x%02x%02x' % rgb -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_alignments.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from gtd.ml.torch.utils import assert_tensor_equal 3 | 4 | from gtd.ml.torch.alignments import Alignments 5 | 6 | 7 | class TestAlignments(object): 8 | @pytest.fixture 9 | def source_words(self): 10 | return [ 11 | ['a', 'c', 'b', 'c'], 12 | ['1', '3', '2', '2', '2'], 13 | [], 14 | ] 15 | 16 | @pytest.fixture 17 | def target_words(self): 18 | return [ 19 | ['c', 'z', 'b', 'c'], 20 | ['1', 'c'], 21 | ['2', '4'], 22 | ] 23 | 24 | @pytest.fixture 25 | def aligns(self, source_words, target_words): 26 | return Alignments(source_words, target_words) 27 | 28 | def test(self, aligns): 29 | assert_tensor_equal(aligns.indices, 30 | [ 31 | [[1, 3], [0, 0], [2, 0], [1, 3]], 32 | [[0, 0], [0, 0], [0, 0], [0, 0]], 33 | [[0, 0], [0, 0], [0, 0], [0, 0]], 34 | ]) 35 | 36 | assert_tensor_equal(aligns.mask, 37 | [ 38 | [[1, 1], [0, 0], [1, 0], [1, 1]], 39 | [[1, 0], [0, 0], [0, 0], [0, 0]], 40 | [[0, 0], [0, 0], [0, 0], [0, 0]], 41 | ]) 42 | 43 | def test_split(self, aligns): 44 | items = aligns.split() 45 | assert len(items) == 4 46 | 47 | assert_tensor_equal(items[0].values, 48 | [ 49 | [1, 3], 50 | [0, 0], 51 | [0, 0] 52 | ]) 53 | 54 | assert_tensor_equal(items[0].mask, 55 | [ 56 | [1, 1], 57 | [1, 0], 58 | [0, 0] 59 | ]) 60 | 61 | assert_tensor_equal(items[2].values, 62 | [ 63 | [2, 0], 64 | [0, 0], 65 | [0, 0] 66 | ]) 67 | 68 | assert_tensor_equal(items[2].mask, 69 | [ 70 | [1, 0], 71 | [0, 0], 72 | [0, 0] 73 | ]) 
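# (Reading the fixtures above: target token 'c' in row 0 occurs at source positions 1 and 3,
# hence indices[0][0] == [1, 3]; a target token with no source occurrence, like 'z', gets
# zero-padded indices with a zero mask.)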
-------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/training_run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import numpy as np 3 | from torch.nn.utils import clip_grad_norm 4 | 5 | from gtd.ml.training_run import TrainingRun 6 | from gtd.ml.torch.checkpoints import Checkpoints 7 | from gtd.utils import cached_property 8 | 9 | 10 | class TorchTrainingRun(TrainingRun): 11 | def __init__(self, config, save_dir): 12 | super(TorchTrainingRun, self).__init__(config, save_dir) 13 | self.workspace.add_dir('checkpoints', 'checkpoints') 14 | 15 | @cached_property 16 | def checkpoints(self): 17 | return Checkpoints(self.workspace.checkpoints) 18 | 19 | @classmethod 20 | def _finite_grads(cls, parameters): 21 | """Check that all parameter gradients are finite. 22 | 23 | Args: 24 | parameters (List[Parameter]) 25 | 26 | Return: 27 | bool 28 | """ 29 | for param in parameters: 30 | if param.grad is None: continue 31 | if not np.isfinite(param.grad.data.sum()): 32 | return False 33 | return True 34 | 35 | @classmethod 36 | def _take_grad_step(cls, train_state, loss, max_grad_norm=float('inf')): 37 | """Try to take a gradient step w.r.t. loss. 38 | 39 | If the gradient is finite, takes a step. Otherwise, does nothing. 40 | 41 | Args: 42 | train_state (TrainState) 43 | loss (Variable): a differentiable scalar variable 44 | max_grad_norm (float): gradient norm is clipped to this value. 45 | 46 | Returns: 47 | bool: True if the gradient was finite. 48 | """ 49 | model, optimizer = train_state.model, train_state.optimizer 50 | optimizer.zero_grad() 51 | loss.backward() 52 | 53 | # clip according to the max allowed grad norm 54 | grad_norm = clip_grad_norm(model.parameters(), max_grad_norm, norm_type=2) 55 | # (this returns the gradient norm BEFORE clipping) 56 | 57 | # track the gradient norm over time 58 | train_state.track_grad_norms(grad_norm) 59 | 60 | finite_grads = cls._finite_grads(model.parameters()) 61 | 62 | # take a step if the grads are finite 63 | if finite_grads: 64 | optimizer.step() 65 | 66 | # increment step count 67 | train_state.increment_train_steps() 68 | 69 | return finite_grads 70 | 71 | def _update_metadata(self, train_state): 72 | self.metadata['last_seen'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 73 | self.metadata['steps'] = train_state.train_steps 74 | self.metadata['max_grad_norm'] = train_state.max_grad_norm -------------------------------------------------------------------------------- /third-party/gtd/scripts/run_nlpsub.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json and run_docker.py can also be found at the root of your repo. 
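# Example invocation (hypothetical, mirroring configs/edit_model/autogen.sh):
#   ./nlpsub.py -g 1 -n testruns 'python textmorph/edit_model/main.py configs/edit_model/edit_baseline.txt'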
5 | 6 | import argparse 7 | import json 8 | import os 9 | from datetime import datetime 10 | import subprocess 11 | 12 | from os.path import abspath, dirname, join 13 | 14 | arg_parser = argparse.ArgumentParser() 15 | arg_parser.add_argument('-n', '--name', default='unnamed', help='Job name.') 16 | arg_parser.add_argument('-t', '--tail', action='store_true', help='Tail the output.') 17 | arg_parser.add_argument('-x', '--debug', action='store_true', help='Print command instead of running.') 18 | arg_parser.add_argument('-m', '--host', action='append', help='Allowed hosts.') 19 | arg_parser.add_argument('-g', '--gpu', default='0', help='GPU to use.') 20 | arg_parser.add_argument('command', nargs='+', help='Command passed to run_docker.py') 21 | args = arg_parser.parse_args() 22 | 23 | repo_dir = abspath(dirname(__file__)) 24 | with open(join(repo_dir, 'config.json'), 'r') as f: 25 | config = json.load(f) 26 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 'TEXTMORPH_DATA' 27 | data_dir = os.environ[data_env_var] 28 | 29 | time_str = datetime.now().strftime('%Y-%m-%d_%H.%M.%S') 30 | log_dir = join(data_dir, 'nlpsub', '{}_{}'.format(args.name, time_str)) 31 | 32 | nlpsub_options = ['--queue=jag', 33 | '--cores=1', 34 | '--mem=2g', 35 | '--priority=high', 36 | '--name={}'.format(args.name), 37 | '--log-dir={}'.format(log_dir), 38 | '--mail=bea', 39 | '--clobber', 40 | '--verbose'] 41 | if args.tail: 42 | nlpsub_options.append('--tail') 43 | 44 | if args.host is not None: 45 | nlpsub_options.append('--hosts={}'.format(','.join(args.host))) 46 | 47 | def bash_string(s): 48 | s = s.replace('\\', '\\\\') # \ -> \\ 49 | s = s.replace('\"', '\\\"') # " -> \" 50 | return '\"{}\"'.format(s) # s -> "s" 51 | 52 | 53 | cmd = ' '.join(args.command) 54 | 55 | docker_cmd = '/u/nlp/packages/anaconda2/bin/python run_docker.py -r -g {gpu} {command}'.format(gpu=args.gpu, command=bash_string(cmd)) 56 | 57 | nlpsub_cmd = 'nlpsub {options} {command}'.format(options=' '.join(nlpsub_options), command=bash_string(docker_cmd)) 58 | 59 | print 'Logging to: {}'.format(log_dir) 60 | print 'Allowed hosts: {}'.format(args.host) 61 | print 'GPU to use: {}'.format(args.gpu) 62 | print 'Command inside Docker: {}'.format(cmd) 63 | print nlpsub_cmd 64 | print 65 | 66 | if not args.debug: 67 | subprocess.call(nlpsub_cmd, shell=True) 68 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_source_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from gtd.ml.torch.recurrent import AdditionCell 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | from gtd.ml.torch.source_encoder import BidirectionalSourceEncoder 7 | from gtd.ml.torch.token_embedder import TokenEmbedder 8 | from gtd.ml.torch.utils import assert_tensor_equal 9 | from gtd.ml.vocab import SimpleVocab 10 | from gtd.utils import Bunch 11 | 12 | 13 | class TestBidirectionalSourceEncoder(object): 14 | @pytest.fixture 15 | def encoder(self): 16 | return BidirectionalSourceEncoder(1, 2, AdditionCell) 17 | 18 | @pytest.fixture 19 | def input_embeds_list(self): 20 | sequences = [ 21 | [1, 2, 3], 22 | [8, 4, 2, 1, 1], 23 | [], 24 | ] 25 | 26 | # token 1 maps to embedding [1], 2 maps to [2] and so on... 
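# (so the forward direction cumulatively sums from the left, e.g. [1, 2, 3] reaches 6,
# and the backward direction sums from the right; the final-state assertions below check
# that both directions arrive at 6 and 16)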
27 | vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8]) 28 | array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1) 29 | token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array)) 30 | 31 | seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab)) 32 | return seq_embeds.split() 33 | 34 | def test_combined_states(self, encoder, input_embeds_list): 35 | states = encoder(input_embeds_list).combined_states 36 | 37 | # forward encoder is cumulatively summing from the left 38 | # backward encoder is cumulatively summing from the right 39 | 40 | # both encoders should ignore masked time steps 41 | assert_tensor_equal(states[0].values, [[1, 6], 42 | [8, 16], 43 | [0, 0], 44 | ]) 45 | assert_tensor_equal(states[0].mask, [[1], [1], [0]]) 46 | 47 | assert_tensor_equal(states[2].values, [[6, 3], 48 | [14, 4], 49 | [0, 0], 50 | ]) 51 | assert_tensor_equal(states[2].mask, [[1], [1], [0]]) 52 | 53 | assert_tensor_equal(states[3].values, [[6, 0], 54 | [15, 2], 55 | [0, 0], 56 | ]) 57 | assert_tensor_equal(states[3].mask, [[0], [1], [0]]) 58 | 59 | def test_final_states(self, encoder, input_embeds_list): 60 | forward, backward = encoder(input_embeds_list).final_states 61 | assert_tensor_equal(forward, [[6], [16], [0]]) 62 | assert_tensor_equal(backward, [[6], [16], [0]]) 63 | 64 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM kelvinguu/pytorch:1.2 2 | # FROM tensorflow/tensorflow:0.12.0 3 | # FROM continuumio/anaconda 4 | 5 | # Add the PostgreSQL PGP key to verify their Debian packages. 6 | # It should be the same key as https://www.postgresql.org/media/keys/ACCC4CF8.asc 7 | RUN apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys B97B0AFCAA1A47F044F244A07FCC7D46ACCC4CF8 8 | 9 | # Add PostgreSQL's repository. It contains the most recent stable release of PostgreSQL, ``9.3``. 10 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ precise-pgdg main" > /etc/apt/sources.list.d/pgdg.list 11 | 12 | # Install ``python-software-properties``, ``software-properties-common`` and PostgreSQL 9.3 13 | # There are some warnings (in red) that show up during the build. 
You can hide 14 | # them by prefixing each apt-get statement with DEBIAN_FRONTEND=noninteractive 15 | RUN apt-get update && apt-get install -y python-software-properties software-properties-common postgresql-9.3 postgresql-client-9.3 postgresql-contrib-9.3 16 | 17 | RUN apt-get update 18 | RUN apt-get --yes --force-yes install libffi6 libffi-dev libssl-dev libpq-dev git 19 | 20 | RUN pip install --upgrade pip 21 | RUN pip install jupyter 22 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension # add Jupyter notebook extension 23 | 24 | RUN pip install fabric 25 | RUN pip install pyOpenSSL==16.2.0 26 | RUN pip install psycopg2==2.6.1 27 | RUN pip install SQLAlchemy==1.1.0b3 28 | RUN pip install cherrypy==8.1.2 29 | RUN pip install bottle==0.12.10 30 | RUN pip install boto==2.43.0 31 | 32 | RUN pip install requests 33 | RUN pip install nltk==3.2.3 34 | RUN python -m nltk.downloader punkt # download tokenizer data 35 | 36 | RUN pip install keras==1.1.0 37 | RUN pip install pyhocon line_profiler pytest tqdm faulthandler python-Levenshtein gitpython futures jsonpickle prettytable tensorboard_logger click 38 | 39 | RUN apt-get update 40 | RUN apt-get install -y vim less tmux nmap 41 | COPY .tmux.conf /root 42 | 43 | # vim bindings for Jupyter 44 | # https://github.com/lambdalisue/jupyter-vim-binding 45 | RUN mkdir -p $(jupyter --data-dir)/nbextensions 46 | RUN git clone https://github.com/lambdalisue/jupyter-vim-binding $(jupyter --data-dir)/nbextensions/vim_binding 47 | RUN jupyter nbextension enable vim_binding/vim_binding 48 | 49 | # autoreload for Jupyter 50 | RUN ipython profile create 51 | RUN echo 'c.InteractiveShellApp.exec_lines = []' >> ~/.ipython/profile_default/ipython_config.py 52 | RUN echo 'c.InteractiveShellApp.exec_lines.append("%load_ext autoreload")' >> ~/.ipython/profile_default/ipython_config.py 53 | RUN echo 'c.InteractiveShellApp.exec_lines.append("%autoreload 2")' >> ~/.ipython/profile_default/ipython_config.py 54 | 55 | # just installing so we can get tensorboard 56 | RUN pip install tensorflow 57 | 58 | RUN pip install annoy pympler -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/decoder_cell.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractproperty 2 | from collections import namedtuple 3 | 4 | import torch 5 | from torch.nn import Module 6 | 7 | from gtd.ml.torch.utils import NamedTupleLike 8 | 9 | 10 | # marker class 11 | class RNNState(NamedTupleLike): 12 | __slots__ = [] 13 | 14 | 15 | # marker class 16 | class RNNInput(NamedTupleLike): 17 | __slots__ = [] 18 | 19 | 20 | PredictionBatch = namedtuple('PredictionBatch', ['probs', 'vocab']) 21 | """ 22 | Attributes: 23 | probs (np.ndarray): of shape (batch_size, vocab_size) 24 | vocab (Vocab) 25 | """ 26 | 27 | 28 | class DecoderCellOutput(namedtuple('DecoderCellOutput', ['rnn_state', 'vocab', 'vocab_probs'])): 29 | """ 30 | Attributes: 31 | rnn_state (RNNState) 32 | vocab (Vocab) 33 | vocab_probs (Variable): of shape (batch_size, vocab_size) 34 | """ 35 | 36 | def loss(self, target_word): 37 | """Compute loss for this time step. 
38 | 39 | Args: 40 | target_word (Variable): LongTensor of shape (batch_size,) 41 | 42 | Returns: 43 | loss (Variable): of shape (batch_size,) 44 | """ 45 | target_prob = torch.gather(self.vocab_probs, 1, target_word.unsqueeze(1)).squeeze(1) # (batch_size,) 46 | assert len(target_prob.size()) == 1 47 | 48 | loss = -torch.log(target_prob + 1e-45) # negative log-likelihood 49 | # added 1e-45 to prevent loss from being -infinity in the case where probs is close to 0 50 | return loss 51 | 52 | @property 53 | def predictions(self): 54 | """Return a PredictionBatch. 55 | 56 | Returns: 57 | PredictionBatch 58 | """ 59 | return PredictionBatch(self.vocab_probs.data.cpu().numpy(), self.vocab) 60 | 61 | 62 | class DecoderCell(Module): 63 | __metaclass__ = ABCMeta 64 | 65 | @abstractproperty 66 | def rnn_state_type(self): 67 | pass 68 | 69 | @abstractproperty 70 | def rnn_input_type(self): 71 | pass 72 | 73 | @abstractmethod 74 | def initialize(self, batch_size): 75 | """Return initial RNNState. 76 | 77 | Args: 78 | batch_size (int) 79 | 80 | Returns: 81 | RNNState 82 | """ 83 | raise NotImplementedError 84 | 85 | def forward(self, rnn_state, rnn_input, advance): 86 | """Advance the decoder by one step. 87 | 88 | Args: 89 | rnn_state (RNNState): the previous RNN state. 90 | rnn_input (RNNInput): any inputs at this time step. 91 | advance (Variable): of shape (batch_size, 1). The RNN should advance on example i iff mask[i] == 1. 92 | 93 | Returns: 94 | DecoderCellOutput 95 | """ 96 | raise NotImplementedError -------------------------------------------------------------------------------- /run_docker.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json can also be found at the root of your repo. 5 | 6 | import argparse 7 | import json 8 | import os 9 | 10 | from os.path import dirname, abspath, join 11 | import subprocess 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | arg_parser.add_argument('-r', '--root', action='store_true', help='Run as root in Docker.') 15 | arg_parser.add_argument('-g', '--gpu', default='', help='GPU to use.') 16 | arg_parser.add_argument('-d', '--debug', action='store_true', help='Print command instead of running.') 17 | arg_parser.add_argument('command', nargs='?', default=None, 18 | help='Command to execute in Docker. If no command is specified, ' \ 19 | 'you enter interactive mode. ' \ 20 | 'To execute a command with spaces, wrap ' \ 21 | 'the entire command in quotes.') 22 | args = arg_parser.parse_args() 23 | 24 | repo_dir = abspath(dirname(__file__)) 25 | 26 | with open(join(repo_dir, 'config.json'), 'r') as f: 27 | config = json.load(f) 28 | 29 | image = config['docker_image'] # name of the Docker image, e.g. 'kelvinguu/textmorph:1.0' 30 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 
'TEXTMORPH_DATA' 31 | data_dir = os.environ[data_env_var] 32 | 33 | my_uid = subprocess.check_output(['echo', '$UID']).strip() 34 | 35 | docker_args = ["--net host", # access to the Internet 36 | "--publish 8888:8888", # only certain ports are exposed 37 | "--publish 6006:6006", 38 | "--publish 8080:8080", 39 | "--ipc=host", 40 | "--rm", 41 | "--volume {}:/data".format(data_dir), 42 | "--volume {}:/code".format(repo_dir), 43 | "--env {}=/data".format(data_env_var), 44 | "--env PYTHONPATH=/code", 45 | "--env NLTK_DATA=/data/nltk", 46 | "--env CUDA_VISIBLE_DEVICES={}".format(args.gpu), 47 | "--workdir /code"] 48 | 49 | # interactive mode 50 | if args.command is None: 51 | docker_args.append('--interactive') 52 | docker_args.append('--tty') 53 | args.command = '/bin/bash' 54 | 55 | if not args.root: 56 | docker_args.append('--user={}'.format(my_uid)) 57 | 58 | if args.gpu == '': 59 | # run on CPU 60 | docker = 'docker' 61 | else: 62 | # run on GPU 63 | docker = 'nvidia-docker' 64 | 65 | pull_cmd = "docker pull {}".format(image) 66 | 67 | run_cmd = '{docker} run {options} {image} {command}'.format(docker=docker, 68 | options=' '.join(docker_args), 69 | image=image, 70 | command=args.command) 71 | print 'Data directory: {}'.format(data_dir) 72 | print 'Command to run inside Docker: {}'.format(args.command) 73 | 74 | print pull_cmd 75 | print run_cmd 76 | if not args.debug: 77 | subprocess.call(pull_cmd, shell=True) 78 | subprocess.call(run_cmd, shell=True) 79 | -------------------------------------------------------------------------------- /third-party/gtd/scripts/run_docker.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json can also be found at the root of your repo. 5 | 6 | import argparse 7 | import json 8 | import os 9 | 10 | from os.path import dirname, abspath, join 11 | import subprocess 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | arg_parser.add_argument('-r', '--root', action='store_true', help='Run as root in Docker.') 15 | arg_parser.add_argument('-g', '--gpu', default='', help='GPU to use.') 16 | arg_parser.add_argument('-d', '--debug', action='store_true', help='Print command instead of running.') 17 | arg_parser.add_argument('command', nargs='?', default=None, 18 | help='Command to execute in Docker. If no command is specified, ' \ 19 | 'you enter interactive mode. ' \ 20 | 'To execute a command with spaces, wrap ' \ 21 | 'the entire command in quotes.') 22 | args = arg_parser.parse_args() 23 | 24 | repo_dir = abspath(dirname(__file__)) 25 | 26 | with open(join(repo_dir, 'config.json'), 'r') as f: 27 | config = json.load(f) 28 | 29 | image = config['docker_image'] # name of the Docker image, e.g. 'kelvinguu/textmorph:1.0' 30 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 
'TEXTMORPH_DATA' 31 | data_dir = os.environ[data_env_var] 32 | 33 | my_uid = subprocess.check_output(['echo', '$UID']).strip() 34 | 35 | docker_args = ["--net host", # access to the Internet 36 | "--publish 8888:8888", # only certain ports are exposed 37 | "--publish 6006:6006", 38 | "--publish 8080:8080", 39 | "--ipc=host", 40 | "--rm", 41 | "--volume {}:/data".format(data_dir), 42 | "--volume {}:/code".format(repo_dir), 43 | "--env {}=/data".format(data_env_var), 44 | "--env PYTHONPATH=/code", 45 | "--env NLTK_DATA=/data/nltk", 46 | "--env CUDA_VISIBLE_DEVICES={}".format(args.gpu), 47 | "--workdir /code"] 48 | 49 | # interactive mode 50 | if args.command is None: 51 | docker_args.append('--interactive') 52 | docker_args.append('--tty') 53 | args.command = '/bin/bash' 54 | 55 | if not args.root: 56 | docker_args.append('--user={}'.format(my_uid)) 57 | 58 | if args.gpu == '': 59 | # run on CPU 60 | docker = 'docker' 61 | else: 62 | # run on GPU 63 | docker = 'nvidia-docker' 64 | 65 | pull_cmd = "docker pull {}".format(image) 66 | 67 | run_cmd = '{docker} run {options} {image} {command}'.format(docker=docker, 68 | options=' '.join(docker_args), 69 | image=image, 70 | command=args.command) 71 | print 'Data directory: {}'.format(data_dir) 72 | print 'Command to run inside Docker: {}'.format(args.command) 73 | 74 | print pull_cmd 75 | print run_cmd 76 | if not args.debug: 77 | subprocess.call(pull_cmd, shell=True) 78 | subprocess.call(run_cmd, shell=True) 79 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/multilayered_decoder_cell.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import torch 4 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 5 | 6 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 7 | from gtd.ml.torch.recurrent import tile_state, gated_update 8 | from gtd.ml.torch.utils import GPUVariable, try_gpu 9 | 10 | 11 | class MultilayeredRNNState(namedtuple('MultilayeredRNNState', ['hs', 'cs']), RNNState): 12 | pass 13 | 14 | 15 | class MultilayeredRNNInput(namedtuple('MultilayeredRNNInput', ['x', 'agenda']), RNNInput): 16 | pass 17 | 18 | 19 | class MultilayeredDecoderCell(DecoderCell): 20 | def __init__(self, token_embedder, hidden_dim, input_dim, agenda_dim, num_layers): 21 | super(MultilayeredDecoderCell, self).__init__() 22 | self.linear = Linear(hidden_dim, input_dim) 23 | self.h0 = Parameter(torch.zeros(hidden_dim)) 24 | self.c0 = Parameter(torch.zeros(hidden_dim)) 25 | self.softmax = Softmax() 26 | self.token_embedder = token_embedder 27 | self.num_layers = num_layers 28 | 29 | self.rnn_cells = [] 30 | for layer in range(num_layers): 31 | in_dim = (input_dim + agenda_dim) if layer == 0 else hidden_dim # inputs to first layer are word vectors 32 | out_dim = hidden_dim 33 | rnn_cell = LSTMCell(in_dim, out_dim) 34 | self.add_module('decoder_layer_{}'.format(layer), rnn_cell) 35 | self.rnn_cells.append(rnn_cell) 36 | 37 | @property 38 | def rnn_state_type(self): 39 | return MultilayeredRNNState 40 | 41 | @property 42 | def rnn_input_type(self): 43 | return MultilayeredRNNInput 44 | 45 | def initialize(self, batch_size): 46 | h = tile_state(self.h0, batch_size) 47 | c = tile_state(self.c0, batch_size) 48 | return MultilayeredRNNState([h] * self.num_layers, [c] * self.num_layers) 49 | 50 | def forward(self, rnn_state, rnn_input, advance): 51 | x = torch.cat([rnn_input.x, 
rnn_input.agenda], 1) 52 | hs, cs = [], [] 53 | for layer in range(self.num_layers): 54 | rnn_cell = self.rnn_cells[layer] 55 | 56 | # collect the h, c belonging to the previous time-step at the corresponding depth 57 | h_prev_t, c_prev_t = rnn_state.hs[layer], rnn_state.cs[layer] 58 | 59 | # forward pass and masking 60 | h, c = rnn_cell(x, (h_prev_t, c_prev_t)) 61 | h = gated_update(h_prev_t, h, advance) 62 | c = gated_update(c_prev_t, c, advance) 63 | hs.append(h) 64 | cs.append(c) 65 | 66 | if layer == 0: 67 | x = h # no skip connection on the first layer 68 | else: 69 | x = x + h 70 | 71 | query = self.linear(x) 72 | word_vocab = self.token_embedder.vocab 73 | word_embeds = self.token_embedder.embeds 74 | vocab_logits = torch.mm(query, word_embeds.t()) # (batch_size, vocab_size) 75 | vocab_probs = self.softmax(vocab_logits) 76 | 77 | rnn_state = MultilayeredRNNState(hs, cs) 78 | 79 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/alignments.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import izip 3 | 4 | import numpy as np 5 | import torch 6 | from gtd.ml.torch.utils import GPUVariable 7 | 8 | from gtd.ml.torch.seq_batch import SequenceBatch 9 | 10 | 11 | class Alignments(object): 12 | """ 13 | Attributes: 14 | indices (Variable): of shape (batch_size, max_target_seq_length, max_alignments) 15 | mask (Variable): of shape (batch_size, max_target_seq_length, max_alignments) 16 | 17 | max_alignments is always at least 1, so that indices and mask do not have a dimension of 0 (which confuses 18 | downstream Torch ops) 19 | """ 20 | def __init__(self, source_words, target_words): 21 | """Represent alignments as a Tensor. 22 | 23 | Args: 24 | source_words (list[list[unicode]]): batch of source sequences 25 | target_words (list[list[unicode]]): batch of target sequences 26 | """ 27 | assert len(source_words) == len(target_words) 28 | # compute alignments 29 | alignments_batch = [self._align(s, t) for s, t in izip(source_words, target_words)] 30 | 31 | # compute dimensions of alignment tensor 32 | batch_size = len(alignments_batch) 33 | max_target_seq_length = 0 34 | max_alignments = 1 # make this dimension at least 1, so that we don't get a tensor with no entries 35 | for alignments in alignments_batch: 36 | max_target_seq_length = max(max_target_seq_length, len(alignments)) 37 | for align in alignments: 38 | max_alignments = max(max_alignments, len(align)) 39 | 40 | indices = -1 * np.ones((batch_size, max_target_seq_length, max_alignments), dtype=np.int64) 41 | # filled with -1's for padding. 42 | # int64 gets converted into torch.LongTensor 43 | 44 | for i, alignments in enumerate(alignments_batch): 45 | for j, align in enumerate(alignments): 46 | for k, idx in enumerate(align): 47 | indices[i, j, k] = idx 48 | 49 | mask = (indices != -1).astype(np.float32) 50 | indices[indices == -1] = 0 51 | self._indices = GPUVariable(torch.from_numpy(indices)) 52 | self._mask = GPUVariable(torch.from_numpy(mask)) 53 | 54 | @property 55 | def indices(self): 56 | return self._indices 57 | 58 | @property 59 | def mask(self): 60 | return self._mask 61 | 62 | @classmethod 63 | def _align(self, source_seq, target_seq): 64 | """For each target word, give its positions in the source sequence. 
65 | 66 | Args: 67 | source_seq (list[unicode]) 68 | target_seq (list[unicode]) 69 | 70 | Returns: 71 | alignments (list[list[int]]): alignments[i] is an ordered list of the indices where target_seq[i] 72 | appears in source_seq. 73 | """ 74 | alignments_dict = defaultdict(list) 75 | for i, word in enumerate(source_seq): 76 | alignments_dict[word].append(i) 77 | 78 | alignments = [] 79 | for word in target_seq: 80 | alignments.append(alignments_dict[word]) 81 | 82 | return alignments 83 | 84 | def split(self): 85 | """Split alignments object into per-time-step alignments. 86 | 87 | Returns: 88 | list[SequenceBatch]: where each element has shape (batch_size, max_alignments) 89 | """ 90 | indices_list = [v.squeeze(1) for v in self.indices.split(1, dim=1)] 91 | mask_list = [v.squeeze(1) for v in self.mask.split(1, dim=1)] 92 | return [SequenceBatch(i, m) for i, m in izip(indices_list, mask_list)] -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from gtd.ml.torch.attention import Attention 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | from gtd.ml.torch.utils import GPUVariable 7 | from gtd.ml.torch.utils import assert_tensor_equal 8 | 9 | 10 | class TestAttention(object): 11 | 12 | def test_forward(self): 13 | float_tensor = lambda arr: torch.FloatTensor(arr) 14 | float_tensor_var = lambda arr: GPUVariable(torch.FloatTensor(arr)) 15 | 16 | batch_size, num_cells = 5, 2 17 | memory_dim, query_dim, attn_dim = 4, 3, 2 18 | 19 | memory_transform = np.array([ # Wh: (memory_dim x attn_dim) 20 | [.1, .5], 21 | [.2, .6], 22 | [.3, .7], 23 | [.4, .8], 24 | ]) 25 | query_transform = np.array([ # Ws: (query_dim x attn_dim) 26 | [.3, .4], 27 | [.2, .5], 28 | [.1, .6], 29 | ]) 30 | v_transform = np.array([ # v: (attn_dim x 1) 31 | [.1], 32 | [.8], 33 | ]) 34 | 35 | mem_values = np.array([ # (batch_size x num_cells x memory_dim) 36 | [ 37 | [.1, .2, .3, .4], 38 | [.4, .5, .6, .7], 39 | ], 40 | [ 41 | [.2, .3, .4, .5], 42 | [.6, .7, .8, .9], 43 | ], 44 | [ 45 | [.3, .4, .5, .6], 46 | [.7, .8, .9, .1], 47 | ], 48 | [ 49 | [-8, -9, -10, -11], 50 | [-12, -13, -14, -15], 51 | ], 52 | [ 53 | [8, 9, 10, 11], 54 | [12, 13, 14, 15], 55 | ] 56 | ]) 57 | mem_values = float_tensor_var(mem_values) 58 | mem_mask = np.array([ 59 | [1, 0], 60 | [1, 1], 61 | [1, 0], 62 | [0, 0], 63 | [0, 1], 64 | ]) 65 | mem_mask = float_tensor_var(mem_mask) 66 | memory_cells = SequenceBatch(values=mem_values, mask=mem_mask) 67 | query = np.array([ # (batch_size x query_dim) 68 | [.1, .2, .3], 69 | [.4, .5, .6], 70 | [.7, .8, .9], 71 | [10, 11, 12], 72 | [13, 14, 15], 73 | ]) 74 | query = float_tensor_var(query) 75 | 76 | # compute manually 77 | # Et = np.array([ [ 0.65388812, 0.81788159], 78 | # [ 0.81039669, 0.87306204], 79 | # [ 0.86236411, 0.86977563]]) 80 | manual_weights = np.array([[ 1., 0.], 81 | [ 0.4843, 0.5156], 82 | [ 1., 0.], 83 | [0, 0], 84 | [0, 1.], 85 | ]) 86 | manual_context = np.array([[ 0.1, 0.2, 0.3, 0.4], 87 | [ 0.4062, 0.5062, 0.6062, 0.7062], 88 | [ 0.3, 0.4, 0.5, 0.6], 89 | [0, 0, 0, 0], 90 | [12, 13, 14, 15], 91 | ]) 92 | 93 | # compute with module 94 | attn = Attention(memory_dim, query_dim, attn_dim) 95 | attn.memory_transform.data.set_(float_tensor(memory_transform)) 96 | attn.query_transform.data.set_(float_tensor(query_transform)) 97 | attn.v_transform.data.set_(float_tensor(v_transform)) 
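The hand-computed Et values in the comment above can be reproduced outside the module. A standalone numpy sketch for the second example in the batch (the only one with both memory cells unmasked):

    import numpy as np

    Wh = np.array([[.1, .5], [.2, .6], [.3, .7], [.4, .8]])
    Ws = np.array([[.3, .4], [.2, .5], [.1, .6]])
    v = np.array([[.1], [.8]])
    H = np.array([[.2, .3, .4, .5], [.6, .7, .8, .9]])  # memory cells of example 2
    s = np.array([.4, .5, .6])                          # query of example 2

    E = np.tanh(H.dot(Wh) + s.dot(Ws)).dot(v).ravel()   # [0.8104, 0.8731], matching Et above
    w = np.exp(E) / np.exp(E).sum()                     # softmax -> approx [0.4843, 0.5156]
    context = w.dot(H)                                  # [0.4062, 0.5062, 0.6062, 0.7062]
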
98 | 99 | attn_out = attn(memory_cells, query) 100 | assert_tensor_equal(attn_out.weights, manual_weights, decimal=4) 101 | assert_tensor_equal(attn_out.context, manual_context, decimal=4) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/token_embedder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | from torch.nn import Embedding, Module 4 | 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | 7 | 8 | class TokenEmbedder(Module): 9 | """ 10 | Attributes: 11 | vocab (WordVocab) 12 | embeds (Variable): of shape (vocab_size, embed_dim) 13 | embed_dim (int) 14 | """ 15 | 16 | def __init__(self, word_embeddings, trainable=True): 17 | """Create TokenEmbedder. 18 | 19 | Args: 20 | word_embeddings (WordEmbeddings) 21 | trainable (bool): if False, the embedding array will not see 22 | gradient steps 23 | """ 24 | super(TokenEmbedder, self).__init__() 25 | 26 | arr = word_embeddings.array # np.ndarray 27 | vocab_size, embed_dim = arr.shape 28 | 29 | assert vocab_size == len(word_embeddings.vocab) 30 | self.vocab = word_embeddings.vocab 31 | self.embed_dim = embed_dim 32 | 33 | # create Embedding Module 34 | vocab_size, embed_dim = arr.shape 35 | self._embedding = TrainFlagEmbedding( 36 | vocab_size, embed_dim, arr, trainable=trainable) 37 | 38 | @property 39 | def embeds(self): 40 | """Return Variable of shape (vocab_size, embed_dim).""" 41 | return self._embedding.weight 42 | 43 | def embed_indices(self, indices): 44 | """Embed array of indices. 45 | 46 | Args: 47 | indices (Variable[LongTensor]): of shape (X1, X2) or (X1) 48 | 49 | Returns: 50 | embeds (Variable[FloatTensor]): of shape (X1, X2, embed_dim) or (X1, embed_dim) 51 | """ 52 | return self._embedding(indices) 53 | 54 | def embed_seq_batch(self, seq_batch): 55 | """Embed elements of a SequenceBatch. 56 | 57 | Args: 58 | seq_batch (SequenceBatch) 59 | 60 | Returns: 61 | SequenceBatch 62 | """ 63 | return SequenceBatch(self._embedding(seq_batch.values), seq_batch.mask) 64 | 65 | def embed_tokens(self, tokens): 66 | """Embed list of tokens. 67 | 68 | Args: 69 | tokens (list[unicode]) 70 | 71 | Returns: 72 | embeds (Variable[FloatTensor]): of shape (len(tokens), embed_dim) 73 | """ 74 | vocab = self.vocab 75 | indices = GPUVariable(torch.LongTensor([vocab.word2index(t) for t in tokens])) 76 | return self._embedding(indices) 77 | 78 | 79 | class TrainFlagEmbedding(Module): 80 | """Small wrapper around PyTorch Embedding object. Exports a trainable 81 | flag, which allows you to fix the weights matrix. Obeys same interface as 82 | PyTorch Embedding object, except for extra constructor arguments. 83 | """ 84 | 85 | def __init__(self, num_embeddings, embedding_dim, 86 | initial_embeddings, **kwargs): 87 | """Constructs TrainFlagEmbedding with embeddings initialized with 88 | initial_embeddings. 89 | 90 | Args: 91 | num_embeddings (int) 92 | embedding_dim (int) 93 | initial_embeddings (np.array): (num_embeddings, embedding_dim) 94 | trainable (bool): if False, weights matrix will not change. 95 | (default True) 96 | kwargs: all other supported keywords in torch.nn.Embeddings. 
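A short usage sketch of the TokenEmbedder above, following the same Bunch(vocab=..., array=...) construction that the encoder tests use (vocabulary and embedding values here are illustrative):

    import numpy as np
    from gtd.ml.vocab import SimpleVocab
    from gtd.utils import Bunch

    vocab = SimpleVocab(['a', 'b', 'c'])
    array = np.eye(3, dtype=np.float32)              # one embedding row per vocab word
    embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))
    embeds = embedder.embed_tokens(['c', 'a'])       # Variable of shape (2, 3)
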
97 | """ 98 | super(TrainFlagEmbedding, self).__init__() 99 | trainable = kwargs.pop("trainable", True) 100 | self._trainable = trainable 101 | if trainable: 102 | embedding = Embedding( 103 | num_embeddings, embedding_dim, **kwargs) 104 | embedding.weight.data.set_( 105 | torch.from_numpy(initial_embeddings)) 106 | self._embedding = embedding 107 | self._weight = embedding.weight 108 | else: 109 | self._weight = GPUVariable( 110 | torch.from_numpy(initial_embeddings)) 111 | 112 | @property 113 | def weight(self): 114 | return self._weight 115 | 116 | def forward(self, index): 117 | """Looks up a batch of indices. 118 | 119 | Args: 120 | index (Variable[LongTensor]): (batch, indices per batch) 121 | 122 | Returns: 123 | Tensor: (batch, indices per batch, embedding_dim) 124 | """ 125 | if self._trainable: 126 | return self._embedding(index) 127 | else: 128 | batch, num_indices = index.size() 129 | flattened_index = index.view(batch * num_indices) 130 | embeddings = torch.index_select( 131 | self._weight, 0, flattened_index) 132 | return embeddings.view(batch, num_indices, -1) 133 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/checkpoints.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | from os import listdir 3 | import os 4 | from os.path import join 5 | 6 | import torch 7 | 8 | import gtd 9 | from gtd.ml.torch.utils import RandomState 10 | 11 | 12 | class TrainState(object): 13 | def __init__(self, model, optimizer, train_steps, random_state, max_grad_norm): 14 | """Construct a snapshot of training state. 15 | 16 | Args: 17 | model (Module) 18 | optimizer (Optimizer) 19 | train_steps (int) 20 | random_state (RandomState) 21 | max_grad_norm (float): used for gradient clipping 22 | """ 23 | self.model = model 24 | self.optimizer = optimizer 25 | self.train_steps = train_steps 26 | self.random_state = random_state 27 | self.max_grad_norm = max_grad_norm 28 | 29 | def increment_train_steps(self): 30 | self.train_steps += 1 31 | 32 | def track_grad_norms(self, grad_norm): 33 | # we will clip grad norm to be at most 2x the norm of anything we've tracked so far 34 | self.max_grad_norm = max(self.max_grad_norm, 2 * grad_norm) 35 | 36 | def save(self, path): 37 | gtd.io.makedirs(path) 38 | 39 | # Store the latest random state 40 | self.random_state = RandomState() 41 | 42 | # save model 43 | torch.save(self.model.state_dict(), join(path, 'model')) 44 | torch.save(self.optimizer.state_dict(), join(path, 'optimizer')) 45 | 46 | # pickle remaining attributes 47 | d = {attr: getattr(self, attr) for attr in ['train_steps', 'random_state', 'max_grad_norm']} 48 | with open(join(path, 'metadata.p'), 'w') as f: 49 | pickle.dump(d, f) 50 | 51 | @classmethod 52 | def load(cls, path, model, optimizer): 53 | with open(join(path, 'metadata.p'), 'r') as f: 54 | d = pickle.load(f) 55 | 56 | # load model 57 | optimizer.load_state_dict(torch.load(join(path, 'optimizer'))) 58 | model.load_state_dict(torch.load(join(path, 'model'))) 59 | train_state = TrainState(model=model, optimizer=optimizer, **d) 60 | return train_state 61 | 62 | @classmethod 63 | def initialize(cls, model, optimizer): 64 | train_steps = 0 65 | max_grad_norm = 0 66 | random_state = RandomState() 67 | return TrainState(model=model, optimizer=optimizer, train_steps=train_steps, 68 | random_state=random_state, max_grad_norm=max_grad_norm) 69 | 70 | 71 | class Checkpoints(object): 72 | def __init__(self, 
checkpoints_dir): 73 | self._path = checkpoints_dir 74 | 75 | @property 76 | def checkpoint_numbers(self): 77 | """Return the train steps at which checkpoints were saved (sorted ascending).""" 78 | dirs = [d for d in listdir(self._path) if d.endswith('.checkpoint')] 79 | return sorted([int(d[:-11]) for d in dirs]) # '.checkpoint' is 11 characters 80 | 81 | @property 82 | def latest_checkpoint_number(self): 83 | """Return the train_steps of the latest saved checkpoint. 84 | 85 | If no checkpoints, return None. 86 | """ 87 | nums = self.checkpoint_numbers 88 | if len(nums) == 0: 89 | return None 90 | else: 91 | return max(nums) 92 | 93 | def load(self, train_steps, model, optimizer): 94 | """Load the checkpoint for a particular training step. 95 | 96 | Args: 97 | model (Module) 98 | optimizer (Optimizer) 99 | 100 | Returns: 101 | TrainState 102 | """ 103 | ckpt_path = join(self._path, '{}.checkpoint'.format(train_steps)) 104 | if not os.path.exists(ckpt_path): 105 | raise ValueError('Checkpoint #{} does not exist.'.format(train_steps)) 106 | return TrainState.load(ckpt_path, model, optimizer) 107 | 108 | def save(self, train_state): 109 | """Save TrainState.""" 110 | ckpt_path = join(self._path, '{}.checkpoint'.format(train_state.train_steps)) 111 | train_state.save(ckpt_path) 112 | 113 | def load_latest(self, model, optimizer): 114 | """Load the latest checkpoint. 115 | 116 | If there are no checkpoints, return a freshly initialized Checkpoint. 117 | 118 | Args: 119 | model (Module) 120 | optimizer (Optimizer) 121 | 122 | Returns: 123 | TrainState 124 | """ 125 | ckpt_num = self.latest_checkpoint_number 126 | if ckpt_num is None: 127 | print 'No checkpoint to reload. Initializing fresh.' 128 | return TrainState.initialize(model, optimizer) 129 | else: 130 | train_state = self.load(self.latest_checkpoint_number, model, optimizer) 131 | print 'Reloaded checkpoint #{}'.format(ckpt_num) 132 | return train_state -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/attention.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import math 4 | import torch 5 | from gtd.ml.torch.utils import GPUVariable 6 | from torch.nn import Parameter 7 | from torch.nn import Softmax, Tanh, Module 8 | 9 | from gtd.ml.torch.utils import conditional, NamedTupleLike 10 | 11 | 12 | class AttentionOutput(namedtuple('AttentionOutput', ['weights', 'context']), NamedTupleLike): 13 | pass 14 | """ 15 | Attributes: 16 | weights (Variable): of shape (batch_size, num_cells) 17 | context (Variable): of shape (batch_size, memory_dim) 18 | """ 19 | 20 | class DummyAttention(Module): 21 | def __init__(self, memory_dim, query_dim, attn_dim): 22 | super(DummyAttention, self).__init__() 23 | self.memory_dim = memory_dim 24 | self.query_dim = query_dim 25 | self.attn_dim = attn_dim 26 | 27 | def forward(self, memory_cells, query): 28 | batch_size, num_cells = memory_cells.mask.size() 29 | weights = GPUVariable(torch.zeros(batch_size, num_cells)) 30 | context = GPUVariable(torch.zeros(batch_size, self.memory_dim)) 31 | return AttentionOutput(weights=weights, context=context) 32 | 33 | 34 | 35 | class Attention(Module): 36 | def __init__(self, memory_dim, query_dim, attn_dim): 37 | super(Attention, self).__init__() 38 | self.tanh = Tanh() 39 | self.softmax = Softmax() 40 | 41 | self.memory_dim = memory_dim 42 | self.query_dim = query_dim 43 | self.attn_dim = attn_dim 44 | 45 | self.memory_transform 
= Parameter(self._initialize_weight_matrix(memory_dim, attn_dim)) # Wh 46 | self.query_transform = Parameter(self._initialize_weight_matrix(query_dim, attn_dim)) # Ws 47 | self.v_transform = Parameter(self._initialize_weight_matrix(attn_dim, 1)) # v 48 | 49 | @classmethod 50 | def _initialize_weight_matrix(cls, in_dim, out_dim): 51 | stdv = 1. / math.sqrt(in_dim) 52 | m = torch.ones(in_dim, out_dim) 53 | m.uniform_(-stdv, stdv) 54 | return m 55 | 56 | def forward(self, memory_cells, query): 57 | """Generates a density over a set of elements w.r.t. the query vector. 58 | 59 | Et(i) = tanh(Hi * Wh + St * Ws) * v 60 | At = softmax(Et) 61 | 62 | Dimensions: 63 | Hi: (batch_size x memory_dim) 64 | St: (batch_size x query_dim) 65 | Wh: (memory_dim x attn_dim) 66 | Ws: (query_dim x attn_dim) 67 | v: (attn_dim x 1) 68 | -- 69 | tanh( Hi * Wh + St * Ws ): (batch_size x attn_dim) 70 | tanh( Hi * Wh + St * Ws ) * v: (batch_size x 1) 71 | At = softmax(Et): (batch_size x num_cells) 72 | 73 | Args: 74 | memory_cells (SequenceBatch): (batch_size x num_cells x memory_dim) 75 | query (torch.Variable): St (batch_size x query_dim) 76 | 77 | Returns: 78 | Variable: (batch_size x num_cells) array 79 | """ 80 | transformed_query = torch.mm(query, self.query_transform) # (batch_size, attn_dim) 81 | 82 | batch_size, num_cells = memory_cells.mask.size() 83 | memory_cells_ = torch.transpose(memory_cells.values, 0, 1) # (num_cells, batch_size, memory_dim) 84 | expanded_transformed_query = transformed_query.expand(num_cells, batch_size, self.attn_dim) 85 | expanded_memory_transform = self.memory_transform.expand(num_cells, self.memory_dim, self.attn_dim) 86 | expanded_v_transform = self.v_transform.expand(num_cells, self.attn_dim, 1) 87 | 88 | # (num_cells, batch_size, attn_dim) 89 | attn_embeds = torch.bmm(memory_cells_, expanded_memory_transform) + expanded_transformed_query 90 | attn_embeds = self.tanh(attn_embeds) 91 | attn_embeds = torch.bmm(attn_embeds, expanded_v_transform) # (num_cells, batch_size, 1) 92 | logits = torch.transpose(attn_embeds.squeeze(2), 0, 1) 93 | 94 | mask = memory_cells.mask 95 | 96 | # no_cells is a FloatTensor with shape (batch_size, num_cells) 97 | # no_cells[i, j] = 1 if example i has NO memory cells, 0 otherwise 98 | no_cells = (1 - mask).prod(1).expand_as(mask) 99 | # TODO(kelvin): check for numerical stability. 
Product of 1's does not necessarily equal 1 exactly, which we need 100 | 101 | suppress = GPUVariable(torch.zeros(*mask.size())) 102 | suppress[mask == 0] = float('-inf') # send the logit of non-cells to -infinity 103 | suppress[no_cells == 1] = 0.0 # but if an entire row has no cells, just leave the cells alone 104 | 105 | logits = logits + suppress 106 | # -inf + anything = -inf 107 | 108 | # compute normalized weights 109 | weights = self.softmax(logits) # (batch_size, num_cells) 110 | 111 | # if a given row has no memory cells, weights should be all zeros 112 | all_zeros = GPUVariable(torch.zeros(*mask.size())) 113 | weights = conditional(no_cells, all_zeros, weights) 114 | 115 | context = torch.bmm(weights.unsqueeze(1), memory_cells.values) # (batch_size, 1, memory_dim) 116 | context = context.squeeze(1) # (batch_size, memory_dim) 117 | return AttentionOutput(weights=weights, context=context) 118 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/training_run.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import socket 4 | from abc import ABCMeta, abstractmethod 5 | from collections import Mapping 6 | from os.path import join 7 | 8 | from git import Repo 9 | from tensorboard_logger import tensorboard_logger 10 | 11 | from gtd.io import IntegerDirectories, Workspace 12 | from gtd.log import SyncedMetadata 13 | from gtd.utils import Config, cached_property 14 | 15 | 16 | class TrainingRunWorkspace(Workspace): 17 | def __init__(self, root): 18 | super(TrainingRunWorkspace, self).__init__(root) 19 | for attr in ['config', 'metadata']: 20 | self.add_file(attr, '{}.txt'.format(attr)) 21 | for attr in ['git_patches', 'tensorboard']: 22 | self.add_dir(attr, attr) 23 | 24 | 25 | class TrainingRun(object): 26 | __metaclass__ = ABCMeta 27 | 28 | def __init__(self, config, save_dir): 29 | """Create TrainingRun. 
30 | 31 | Args: 32 | config (Config) 33 | save_dir (str) 34 | """ 35 | self._config = config 36 | self._workspace = TrainingRunWorkspace(save_dir) 37 | self.metadata['host'] = socket.gethostname() 38 | 39 | @abstractmethod 40 | def train(self): 41 | raise NotImplementedError 42 | 43 | @property 44 | def config(self): 45 | return self._config 46 | 47 | @property 48 | def workspace(self): 49 | return self._workspace 50 | 51 | @cached_property 52 | def metadata(self): 53 | return SyncedMetadata(self.workspace.metadata, fmt='json') 54 | 55 | @cached_property 56 | def tb_logger(self): 57 | return tensorboard_logger.Logger(self.workspace.tensorboard) 58 | 59 | def record_commit(self, src_dir): 60 | repo = Repo(src_dir) 61 | 62 | if 'dirty_repo' in self.metadata or 'commit' in self.metadata: 63 | raise RuntimeError('A commit has already been recorded.') 64 | 65 | self.metadata['dirty_repo'] = repo.is_dirty() 66 | self.metadata['commit'] = repo.head.object.hexsha.encode('utf-8') 67 | 68 | def dump_diff(self, src_dir): 69 | repo = Repo(src_dir) 70 | diffindex = repo.head.commit.diff(None, create_patch=True) 71 | if len(diffindex) > 0: 72 | print 'uncomitted changes being stored as patches' 73 | patch_strings = [unicode(diff) for diff in diffindex] 74 | patch_filenames = [unicode(diff.a_rawpath).replace(u'/', u'-').replace(u'.', u'-') + u'.patch' for diff in 75 | diffindex] 76 | for strin, filename in zip(patch_strings, patch_filenames): 77 | file_out = join(self.workspace.git_patches, filename) 78 | with io.open(file_out, 'w', encoding='utf-8') as fout: 79 | fout.writelines(strin) 80 | else: 81 | print 'no changes to diff. ignoring git diff.' 82 | 83 | def match_commit(self, src_dir): 84 | """Check that the current commit matches the recorded commit for this run. 85 | 86 | Raises an error if commits don't match, or if there is dirty state. 87 | 88 | Args: 89 | src_dir (str): path to the Git repository 90 | """ 91 | if self.metadata['dirty_repo']: 92 | raise EnvironmentError('Working directory was dirty when commit was recorded.') 93 | 94 | repo = Repo(src_dir) 95 | if repo.is_dirty(): 96 | raise EnvironmentError('Current working directory is dirty.') 97 | 98 | current_commit = repo.head.object.hexsha.encode('utf-8') 99 | run_commit = self.metadata['commit'] 100 | if current_commit != run_commit: 101 | raise EnvironmentError("Commits don't match.\nCurrent: {}\nRecorded: {}".format(current_commit, run_commit)) 102 | 103 | 104 | class TrainingRuns(Mapping): 105 | """A map from integers to TrainingRuns.""" 106 | 107 | def __init__(self, root_dir, src_dir, run_factory, check_commit=True): 108 | """Create TrainingRuns object. 109 | 110 | Args: 111 | root_dir (str): directory where all training run data will be stored 112 | src_dir (str): a Git repository path (used to check commits) 113 | run_factory (Callable[[Config, str], TrainingRun]): a Callable, which takes a Config and a save_dir 114 | as arguments, and creates a new TrainingRun. 115 | check_commit (bool): if True, checks that current working directory is on same commit as when the run 116 | was originally created. 
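record_commit, dump_diff and match_commit above implement a small reproducibility protocol on top of GitPython. A sketch using only the GitPython calls already appearing in this file (the repository path is hypothetical):

    from git import Repo

    repo = Repo('/path/to/my/repo')        # hypothetical checkout
    recorded = repo.head.object.hexsha     # record_commit() stores this hash...
    was_dirty = repo.is_dirty()            # ...and this flag
    # match_commit() later raises unless was_dirty is False, the current
    # checkout is clean, and the current hexsha equals `recorded`.
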
117 | """ 118 | self._int_dirs = IntegerDirectories(root_dir) 119 | self._src_dir = src_dir 120 | self._run_factory = run_factory 121 | self._check_commit = check_commit 122 | 123 | def _config_path(self, save_dir): 124 | return join(save_dir, 'config.txt') 125 | 126 | def __getitem__(self, i): 127 | """Reload an existing TrainingRun.""" 128 | save_dir = self._int_dirs[i] 129 | config = Config.from_file(self._config_path(save_dir)) 130 | run = self._run_factory(config, save_dir) 131 | if self._check_commit: 132 | run.match_commit(self._src_dir) 133 | 134 | logging.info('Reloaded TrainingRun #{}'.format(i)) 135 | return run 136 | 137 | def new(self, config, name=None): 138 | """Create a new TrainingRun.""" 139 | print 'TrainingRun configuration:\n{}'.format(config) 140 | 141 | save_dir = self._int_dirs.new_dir(name=name) 142 | cfg_path = self._config_path(save_dir) 143 | config.to_file(cfg_path) # save the config 144 | run = self._run_factory(config, save_dir) 145 | run.record_commit(self._src_dir) 146 | run.dump_diff(self._src_dir) 147 | run.metadata['config'] = config._config_tree # save config in metadata, for programmatic access 148 | 149 | print 'New TrainingRun created at: {}'.format(run.workspace.root) 150 | return run 151 | 152 | def __iter__(self): 153 | return iter(self._int_dirs) 154 | 155 | def __len__(self): 156 | return len(self._int_dirs) 157 | 158 | def paths(self): 159 | return self._int_dirs.values() 160 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/training_run_viewer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import ABCMeta, abstractmethod 3 | from collections import OrderedDict 4 | 5 | import pwd 6 | from IPython.core.display import display, HTML 7 | from os.path import join, basename 8 | 9 | import os 10 | from os import listdir 11 | from prettytable import PrettyTable 12 | 13 | from gtd.chrono import verboserate 14 | from gtd.log import in_ipython, jupyter_no_margins, Metadata 15 | 16 | 17 | class TrainingRunViewer(object): 18 | def __init__(self, runs): 19 | """Construct TrainingRunViewer. 20 | 21 | Args: 22 | runs (gtd.ml.TrainingRuns) 23 | """ 24 | self._runs = runs 25 | self._renderers = OrderedDict() 26 | 27 | def add(self, name, renderer, post_processor=None): 28 | """Add a renderer. 29 | 30 | Args: 31 | name (unicode): name for the attribute 32 | renderer (Callable[str, object]): takes a run dir (absolute path) and returns something to print. 33 | post_processor (Callable[object, unicode]): takes the output of the renderer and returns a modified output. 34 | 35 | Returns: 36 | 37 | """ 38 | if post_processor: 39 | r = lambda path: post_processor(renderer(path)) 40 | else: 41 | r = renderer 42 | self._renderers[name] = r 43 | 44 | def view(self, select=lambda path: True): 45 | """View runs. 46 | 47 | Args: 48 | select (Callable[str, bool]): given a path to a run, returns True if we want to display the 49 | run, False otherwise. 
50 | """ 51 | field_names = self._renderers.keys() 52 | table = PrettyTable(field_names=field_names) 53 | types = OrderedDict((n, set()) for n in field_names) 54 | 55 | for i, path in verboserate(self._runs._int_dirs.items(), desc='Scanning runs.'): 56 | if not select(path): 57 | continue 58 | 59 | row = [] 60 | for render in self._renderers.values(): 61 | try: 62 | s = render(path) 63 | except: 64 | s = u'' 65 | row.append(s) 66 | 67 | # record types 68 | for name, elem in zip(field_names, row): 69 | types[name].add(type(elem)) 70 | 71 | table.add_row(row) 72 | 73 | self._print_table(table) 74 | 75 | # display types for each attribute 76 | type_table = PrettyTable(['attribute', 'types']) 77 | for name, type_set in types.iteritems(): 78 | type_table.add_row([name, ', '.join(t.__name__ for t in type_set)]) 79 | self._print_table(type_table) 80 | 81 | @classmethod 82 | def _print_table(cls, table): 83 | if in_ipython(): 84 | jupyter_no_margins() 85 | display(HTML(table.get_html_string())) 86 | else: 87 | print table 88 | 89 | 90 | class Renderer(object): 91 | __metaclass__ = ABCMeta 92 | 93 | @abstractmethod 94 | def __call__(self, path): 95 | """Render. 96 | 97 | Args: 98 | path (str): absolute path to a run directory 99 | 100 | Returns: 101 | object: value to be displayed in a pretty-printed table. Should implement __str__ and __unicode__. 102 | """ 103 | raise NotImplementedError 104 | 105 | 106 | # Some renderers below are just functions, for simplicity. 107 | 108 | 109 | class JSONSelector(Renderer): 110 | def __init__(self, file_path, json_keys): 111 | """Select a value in a JSON file, or a HOCON file. 112 | 113 | Args: 114 | file_path (str): path to the JSON file, relative to run dir root. 115 | json_keys (list[str]): path from the root of the JSON tree to the target attribute 116 | """ 117 | self.file_path = file_path 118 | self.json_keys = json_keys 119 | 120 | def __call__(self, path): 121 | full_path = join(path, self.file_path) 122 | try: 123 | # try loading as JSON 124 | with open(full_path, 'r') as f: 125 | x = json.load(f) 126 | except ValueError: 127 | # try loading as HOCON 128 | x = Metadata.from_file(full_path, fmt='hocon') 129 | 130 | for key in self.json_keys: 131 | x = x[key] 132 | 133 | return x 134 | 135 | 136 | class Commit(Renderer): 137 | def __init__(self): 138 | self._commit = JSONSelector('metadata.txt', ['commit']) 139 | self._dirty = JSONSelector('metadata.txt', ['dirty_repo']) 140 | 141 | def __call__(self, path): 142 | c = self._commit(path)[:8] 143 | d = ' (dirty)' if self._dirty(path) else '' 144 | return '{}{}'.format(c, d) 145 | 146 | 147 | class NumSteps(Renderer): 148 | def __init__(self): 149 | self.json_selector = JSONSelector('metadata.txt', ['steps']) 150 | 151 | def __call__(self, path): 152 | try: 153 | steps = self.json_selector(path) # try looking in JSON 154 | except: 155 | # if that fails, look at the largest checkpoint 156 | ckpt_nums = checkpoint_numbers(join(path, 'checkpoints')) 157 | steps = max(ckpt_nums) if ckpt_nums else 0 158 | return steps 159 | 160 | 161 | class Owner(Renderer): 162 | def __init__(self, user_ids): 163 | self.user_ids = user_ids 164 | 165 | def __call__(self, path): 166 | stat_info = os.stat(path) 167 | uid = stat_info.st_uid 168 | try: 169 | user = pwd.getpwuid(uid)[0] 170 | except: 171 | # sometimes no name is associated with the ID 172 | user = self.user_ids.get(uid, uid) 173 | 174 | return str(user) 175 | 176 | 177 | def checkpoint_numbers(checkpoints_dir): 178 | """Return the train steps at which checkpoints 
were saved (sorted ascending).""" 179 | dirs = [d for d in listdir(checkpoints_dir) if d.endswith('.checkpoint')] 180 | return sorted([int(d[:-11]) for d in dirs]) 181 | 182 | 183 | def run_name(path): 184 | return basename(path) 185 | 186 | 187 | def num_checkpoints(path): 188 | ckpt_nums = checkpoint_numbers(join(path, 'checkpoints')) 189 | return len(ckpt_nums) 190 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/test_framework.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from collections import Sequence, Mapping 3 | from itertools import izip 4 | 5 | import numpy as np 6 | import pytest 7 | import tensorflow as tf 8 | from keras.engine import Input 9 | from keras.layers import Dense 10 | from numpy.testing import assert_array_almost_equal 11 | 12 | from gtd.ml.tf.framework import Feedable, KerasModel 13 | from gtd.ml.tf.utils import guarantee_initialized_variables, clean_session 14 | from gtd.utils import Bunch 15 | 16 | 17 | @pytest.yield_fixture 18 | def clean_test_session(): 19 | with clean_session() as sess: 20 | yield sess 21 | 22 | 23 | def assert_array_collections_equal(correct, test, decimal=7): 24 | """Assert that two collections of numpy arrays have the same values. 25 | 26 | Collections can be either a Sequence or a Mapping. 27 | """ 28 | if type(correct) != type(test): 29 | raise ValueError('correct ({}) and test ({}) must have the same type.'.format(type(correct), type(test))) 30 | 31 | assert_equal = lambda c, t: assert_array_almost_equal(c, t, decimal=decimal) 32 | 33 | if isinstance(correct, Sequence): 34 | assert len(correct) == len(test) 35 | for c, t in izip(correct, test): 36 | assert_equal(c, t) 37 | elif isinstance(correct, Mapping): 38 | # same keys 39 | assert set(test.keys()) == set(correct.keys()) 40 | # same values 41 | for key in test: 42 | assert_equal(correct[key], test[key]) 43 | else: 44 | raise TypeError('Inputs must be of type Mapping or Sequence, not {}.'.format(type(correct))) 45 | 46 | 47 | class FeedableTester(object): 48 | """A template for testing Feedable classes. 49 | 50 | Subclass this class and implement all of its abstractmethods. 51 | 52 | NOTE: 53 | You must decorate the implementation of each abstractmethod with a @pytest.fixture decorator. 54 | See the `TestFeedable` class below for an example. 55 | """ 56 | @abstractmethod 57 | def model(self): 58 | """The Model to be tested.""" 59 | pass 60 | 61 | @abstractmethod 62 | def inputs(self): 63 | """Inputs to the model. 64 | 65 | Returns: 66 | (list, dict): an args, kwargs pair 67 | """ 68 | pass 69 | 70 | @classmethod 71 | def as_args_kwargs(cls, *args, **kwargs): 72 | return args, kwargs 73 | 74 | @abstractmethod 75 | def feed_dict(self): 76 | """Return the correct result of the model's `feed_dict` method.""" 77 | pass 78 | 79 | @abstractmethod 80 | def output_tensors(self): 81 | """Output tensors to be fetched. 82 | 83 | Returns: 84 | list[np.array] 85 | """ 86 | pass 87 | 88 | @abstractmethod 89 | def outputs(self): 90 | """Return the correct results of running model.compute(fetch=output_tensors, ...) 
91 | 92 | Returns: 93 | list[np.array] 94 | """ 95 | pass 96 | 97 | @pytest.mark.usefixtures('clean_test_session') 98 | def test_inputs_to_feed_dict(self, model, inputs, feed_dict): 99 | """Test for correct feed_dict.""" 100 | args, kwargs = inputs 101 | test_feed_dict = model.inputs_to_feed_dict(*args, **kwargs) 102 | assert_array_collections_equal(feed_dict, test_feed_dict) 103 | 104 | @pytest.mark.usefixtures('clean_test_session') 105 | def test_outputs(self, model, inputs, output_tensors, outputs): 106 | """Test for correct output.""" 107 | sess = tf.get_default_session() 108 | guarantee_initialized_variables(sess) 109 | args, kwargs = inputs 110 | test_outputs = model.compute(output_tensors, *args, **kwargs) 111 | assert_array_collections_equal(outputs, test_outputs, decimal=4) 112 | 113 | 114 | class KerasModelTester(FeedableTester): 115 | @pytest.fixture 116 | def output_tensors(self, model): 117 | return model.output_tensors 118 | 119 | @pytest.mark.usefixtures('clean_test_session') 120 | def test_placeholders(self, model, feed_dict): 121 | """Test that model.placeholders matches the keys of feed_dict.""" 122 | assert set(model.placeholders) == set(feed_dict.keys()) 123 | 124 | 125 | class FeedableExample(Feedable): 126 | def __init__(self): 127 | x = tf.placeholder(tf.float32, shape=[], name='x') 128 | y = tf.get_variable('y', shape=[], initializer=tf.constant_initializer(2.0)) 129 | z = x * y 130 | 131 | self.x = x 132 | self.y = y 133 | self.z = z 134 | 135 | def inputs_to_feed_dict(self, batch): 136 | return {self.x: batch.x} 137 | 138 | 139 | class TestFeedableExample(FeedableTester): 140 | @pytest.fixture 141 | def model(self): 142 | return FeedableExample() 143 | 144 | @pytest.fixture 145 | def inputs(self): 146 | return self.as_args_kwargs(Bunch(x=5.0)) 147 | 148 | @pytest.fixture 149 | def feed_dict(self, model): 150 | return {model.x: 5.0} 151 | 152 | @pytest.fixture 153 | def output_tensors(self, model): 154 | return [model.z] 155 | 156 | @pytest.fixture 157 | def outputs(self): 158 | return [10.0] 159 | 160 | 161 | class KerasLayersModelExample(KerasModel): 162 | """A Model that is defined using Keras layers from beginning to end.""" 163 | def __init__(self): 164 | x = Input([1]) 165 | y = np.array([[2.0]]) 166 | b = np.array([0.0]) 167 | mult = Dense(1, weights=(y, b)) 168 | z = mult(x) 169 | 170 | self.x = x 171 | self.mult = mult 172 | self.z = z 173 | 174 | @property 175 | def placeholders(self): 176 | return [self.x] 177 | 178 | def inputs_to_feed_dict(self, batch): 179 | return {self.x: np.array([[batch.x]])} 180 | 181 | @property 182 | def output_tensors(self): 183 | return [self.z] 184 | 185 | 186 | class TestKerasLayersModel(KerasModelTester): 187 | @pytest.fixture 188 | def model(self): 189 | return KerasLayersModelExample() 190 | 191 | @pytest.fixture 192 | def inputs(self): 193 | return self.as_args_kwargs(Bunch(x=5.0)) 194 | 195 | @pytest.fixture 196 | def feed_dict(self, model): 197 | return {model.x: 5.0} 198 | 199 | @pytest.fixture 200 | def outputs(self): 201 | return [10.0] -------------------------------------------------------------------------------- /third-party/gtd/gtd/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | import numpy as np 4 | from gtd.utils import memoize 5 | 6 | 7 | @memoize 8 | def get_spacy(): 9 | """ 10 | Loads the spaCy english processor. 11 | 12 | Tokenizing, Parsing, and NER are enabled. All other features are disabled. 
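The FeedableTester template above turns each abstract method into a pytest fixture, so testing a new model only requires subclassing and overriding fixtures, as TestFeedableExample does. The same pattern, reduced to a TensorFlow-free sketch:

    import pytest

    class AdderTester(object):
        # template: subclasses override the fixtures, the tests come for free
        @pytest.fixture
        def inputs(self):
            raise NotImplementedError

        def test_sum(self, inputs):
            assert sum(inputs) == 6

    class TestMyAdder(AdderTester):
        @pytest.fixture
        def inputs(self):
            return [1, 2, 3]
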
13 | 14 | Returns: 15 | A spaCy Language object for English 16 | """ 17 | logging.info('Loading spaCy...') 18 | import spacy.en 19 | nlp = spacy.en.English(tagger=False, parser=True, matcher=False) 20 | return nlp 21 | 22 | 23 | class NER(object): 24 | def __init__(self): 25 | self.processor = get_spacy() 26 | 27 | def __call__(self, text): 28 | """Given a unicode string, return a tuple of the named entities found inside.""" 29 | if not isinstance(text, unicode): 30 | text = unicode(text) 31 | doc = self.processor(text) 32 | return doc.ents 33 | 34 | 35 | class Trie(object): 36 | 37 | def __init__(self, token, parent, sink=False): 38 | self.token = token 39 | self.parent = parent 40 | self.sink = sink 41 | self.children = {} 42 | 43 | def __contains__(self, phrase): 44 | if phrase[0] == self.token: 45 | if len(phrase) == 1: 46 | # On our last word. Must be a sink to match. 47 | return self.sink 48 | else: 49 | # doesn't match 50 | return False 51 | 52 | suffix = phrase[1:] 53 | for child in self.children.values(): 54 | if suffix in child: 55 | return True 56 | 57 | def ancestors(self): 58 | if self.parent is None: 59 | return [] 60 | anc = self.parent.ancestors() 61 | anc.append(self.token) 62 | return anc 63 | 64 | 65 | class PhraseMatcher(object): 66 | def __init__(self, phrases): 67 | """Construct a phrase matcher. 68 | 69 | Args: 70 | phrases (List[Tuple[str]]): a list of phrases to match, where each phrase is a tuple of strings 71 | """ 72 | # construct Trie 73 | root = Trie('ROOT', None) 74 | for phrase in phrases: 75 | current = root 76 | for token in phrase: 77 | if token not in current.children: 78 | current.children[token] = Trie(token, current) 79 | current = current.children[token] 80 | current.sink = True # mark last node as a sink 81 | 82 | self.root = root 83 | self.phrases = phrases 84 | 85 | def has_phrase(self, phrase): 86 | """Check if a particular phrase is matched by the matcher. 87 | 88 | Args: 89 | phrase (tuple[str]) 90 | """ 91 | return ['ROOT'] + phrase in self.root 92 | 93 | def match(self, tokens): 94 | """A list of matches. 95 | 96 | Args: 97 | tokens (list[str]): a list of tokens 98 | 99 | Returns: 100 | list[tuple[str, int, int]]: A list of (match, start, end) triples. Each `match` is a tuple of tokens. 101 | `start` and `end` are word offsets. 
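A small usage sketch of the trie-backed matcher above (phrases chosen for illustration); matches come back as (phrase, start, end) triples with word offsets:

    matcher = PhraseMatcher([('new', 'york'), ('york',)])
    assert matcher.has_phrase(['new', 'york'])
    matches = matcher.match(['i', 'love', 'new', 'york'])
    # -> [(('new', 'york'), 2, 4), (('york',), 3, 4)]
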
102 | """ 103 | root = self.root 104 | candidates = [root] 105 | 106 | matches = [] 107 | for i, token in enumerate(tokens): 108 | 109 | # extend candidates or prune failed candidates 110 | new_candidates = [] 111 | for cand in candidates: 112 | if token in cand.children: 113 | new_candidates.append(cand.children[token]) # move to child 114 | candidates = new_candidates 115 | candidates.append(root) # always add root 116 | 117 | for cand in candidates: 118 | if cand.sink: 119 | match = tuple(cand.ancestors()) 120 | end = i + 1 121 | start = end - len(match) 122 | matches.append((match, start, end)) 123 | 124 | return matches 125 | 126 | 127 | # first_cap_re = re.compile('(.)([A-Z][a-z]+)') 128 | first_cap_re = re.compile('([^-_])([A-Z][a-z]+)') 129 | all_cap_re = re.compile('([a-z0-9])([A-Z])') 130 | 131 | 132 | def camel_to_snake_case(name): 133 | """Convert camelCase to snake_case (Python).""" 134 | s1 = first_cap_re.sub(r'\1_\2', name) 135 | return all_cap_re.sub(r'\1_\2', s1).lower() 136 | 137 | 138 | def longest_common_subsequence(X, Y): 139 | # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Computing_the_length_of_the_LCS 140 | 141 | def LCS(X, Y): 142 | m = len(X) 143 | n = len(Y) 144 | # An (m+1) times (n+1) matrix 145 | C = [[0] * (n + 1) for _ in range(m + 1)] 146 | for i in range(1, m + 1): 147 | for j in range(1, n + 1): 148 | if X[i - 1] == Y[j - 1]: 149 | C[i][j] = C[i - 1][j - 1] + 1 150 | else: 151 | C[i][j] = max(C[i][j - 1], C[i - 1][j]) 152 | return C 153 | 154 | def backTrack(C, X, Y, i, j): 155 | if i == 0 or j == 0: 156 | return [] 157 | elif X[i - 1] == Y[j - 1]: 158 | return backTrack(C, X, Y, i - 1, j - 1) + [X[i - 1]] 159 | else: 160 | if C[i][j - 1] > C[i - 1][j]: 161 | return backTrack(C, X, Y, i, j - 1) 162 | else: 163 | return backTrack(C, X, Y, i - 1, j) 164 | 165 | m = len(X) 166 | n = len(Y) 167 | C = LCS(X, Y) 168 | return backTrack(C, X, Y, m, n) 169 | 170 | 171 | def get_ngrams(s, n): 172 | """Get n-grams for s. 173 | 174 | >>> s = [1, 2, 3, 4] 175 | >>> get_ngrams(s, 2) 176 | [(1, 2), (2, 3), (3, 4)] 177 | >>> get_ngrams(s, 1) 178 | [(1,), (2,), (3,), (4,)] 179 | >>> get_ngrams(s, 4) 180 | [(1, 2, 3, 4)] 181 | """ 182 | assert n <= len(s) 183 | assert n >= 1 184 | return [tuple(s[k:k + n]) for k in range(len(s) + 1 - n)] 185 | 186 | 187 | def ngram_precision_recall(reference, candidate, n=None): 188 | if n is None: 189 | # Take the average over 1 through 4 grams. 190 | prs = [] 191 | for m in [1, 2, 3, 4]: 192 | prs.append(ngram_precision_recall(reference, candidate, m)) 193 | ps, rs = zip(*prs) 194 | return np.mean(ps), np.mean(rs) 195 | 196 | ref_set = set(get_ngrams(reference, n)) 197 | can_set = set(get_ngrams(candidate, n)) 198 | correct = float(len(ref_set & can_set)) 199 | rec = correct / len(ref_set) 200 | prec = correct / len(can_set) 201 | return prec, rec -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from contextlib import contextmanager 3 | 4 | import numpy as np 5 | from numpy.testing import assert_array_almost_equal 6 | from torch import _TensorBase, torch 7 | from torch.autograd import Variable 8 | 9 | from gtd.utils import chunks 10 | 11 | 12 | def conditional(b, x, y): 13 | """Conditional operator for PyTorch. 
14 | 15 | Args: 16 | b (FloatTensor): with values that are equal to 0 or 1 17 | x (FloatTensor): of same shape as b 18 | y (FloatTensor): of same shape as b 19 | 20 | Returns: 21 | z (FloatTensor): of same shape as b. z[i] = x[i] if b[i] == 1 else y[i] 22 | """ 23 | return b * x + (1 - b) * y 24 | 25 | 26 | def to_numpy(x): 27 | if isinstance(x, Variable): 28 | x = x.data # unwrap Variable 29 | 30 | if isinstance(x, _TensorBase): 31 | x = x.cpu().numpy() 32 | return x 33 | 34 | 35 | def assert_tensor_equal(x, y, decimal=6): 36 | assert_array_almost_equal(to_numpy(x), to_numpy(y), decimal=decimal) 37 | 38 | 39 | def expand_dims_for_broadcast(low_tensor, high_tensor): 40 | """Expand the dimensions of a lower-rank tensor, so that its rank matches that of a higher-rank tensor. 41 | 42 | This makes it possible to perform broadcast operations between low_tensor and high_tensor. 43 | 44 | Args: 45 | low_tensor (Tensor): lower-rank Tensor with shape [s_0, ..., s_p] 46 | high_tensor (Tensor): higher-rank Tensor with shape [s_0, ..., s_p, ..., s_n] 47 | 48 | Note that the shape of low_tensor must be a prefix of the shape of high_tensor. 49 | 50 | Returns: 51 | Tensor: the lower-rank tensor, but with shape expanded to be [s_0, ..., s_p, 1, 1, ..., 1] 52 | """ 53 | low_size, high_size = low_tensor.size(), high_tensor.size() 54 | low_rank, high_rank = len(low_size), len(high_size) 55 | 56 | # verify that low_tensor shape is prefix of high_tensor shape 57 | assert low_size == high_size[:low_rank] 58 | 59 | new_tensor = low_tensor 60 | for _ in range(high_rank - low_rank): 61 | new_tensor = torch.unsqueeze(new_tensor, len(new_tensor.size())) 62 | 63 | return new_tensor 64 | 65 | 66 | def is_binary(t): 67 | """Check if values of t are binary. 68 | 69 | Args: 70 | t (Tensor|Variable) 71 | 72 | Returns: 73 | bool 74 | """ 75 | if isinstance(t, Variable): 76 | t = t.data # convert Variable to Tensor 77 | 78 | binary = (t == 0) | (t == 1) # ByteTensor, should be all 1's 79 | all_binary = torch.prod(binary) # int, should be 1 80 | return all_binary == 1 81 | 82 | 83 | def similar_size_batches(examples, batch_size, size=lambda x: len(x.target_words)): 84 | """Create similar-sized batches of EditExamples. 85 | 86 | By default, elements with similar len('source_words') are batched together. 87 | See editor.py / EditExample. 
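expand_dims_for_broadcast above only appends singleton dimensions to the lower-rank tensor; a shape-level sketch:

    import torch

    low = torch.ones(3, 2)            # shape [3, 2]
    high = torch.ones(3, 2, 5, 7)     # shape [3, 2, 5, 7]
    out = expand_dims_for_broadcast(low, high)
    assert out.size() == (3, 2, 1, 1) # ready to broadcast against `high`
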
88 | 
89 |     Args:
90 |         examples (list[EditExample])
91 |         batch_size (int)
92 |         size (Callable[[EditExample], int])
93 | 
94 |     Returns:
95 |         list[list[EditExample]]
96 |     """
97 |     assert batch_size >= 1
98 |     sorted_examples = sorted(examples, key=size)
99 |     batches = list(chunks(sorted_examples, batch_size))
100 |     random.shuffle(batches)  # in-place
101 | 
102 |     # report savings
103 |     suboptimal_batches = list(chunks(examples, batch_size))
104 | 
105 |     total_cost = lambda batches: batch_size * sum(max(size(b) for b in batch) for batch in batches)
106 |     naive_cost = total_cost(suboptimal_batches)
107 |     improved_cost = total_cost(batches)
108 |     optimal_cost = sum(size(ex) for ex in examples)
109 | 
110 |     print 'Optimized batches: reduced cost from {naive} (naive) to {improved} ({reduction:.1f}% reduction).\n' \
111 |           'Optimal (batch_size=1) would be {optimal}.'.format(naive=naive_cost, improved=improved_cost,
112 |                                                               reduction=100.0 * (naive_cost - improved_cost) / naive_cost,
113 |                                                               optimal=optimal_cost)
114 | 
115 |     return batches
116 | 
117 | 
118 | def print_module_parameters(m, depth=0):
119 |     """Print out all parameters of a module, recursing into its children."""
120 |     tabs = '\t' * depth
121 |     for p_name, p in m._parameters.items():
122 |         print tabs + p_name
123 |     for c_name, c in m.named_children():
124 |         print tabs + c_name
125 |         print_module_parameters(c, depth + 1)
126 | 
127 | 
128 | _GPUS_EXIST = True  # True by default
129 | 
130 | def try_gpu(x):
131 |     """Try to put a Variable/Tensor/Module on GPU."""
132 |     global _GPUS_EXIST
133 | 
134 |     if _GPUS_EXIST:
135 |         try:
136 |             return x.cuda()
137 |         except (AssertionError, RuntimeError):
138 |             # actually, GPUs don't exist
139 |             print 'No GPUs detected. Sticking with CPUs.'
140 |             _GPUS_EXIST = False
141 |             return x
142 |     else:
143 |         return x
144 | 
145 | 
146 | def GPUVariable(data):
147 |     return try_gpu(Variable(data, requires_grad=False))
148 | 
149 | 
150 | class RandomState(object):
151 |     def __init__(self):
152 |         """Take a snapshot of random number generator state at this point in time.
153 | 
154 |         Only covers random, numpy.random and torch (CPU).
155 |         """
156 |         self.py = random.getstate()
157 |         self.np = np.random.get_state()
158 |         self.torch = torch.get_rng_state()
159 | 
160 |     def set_global(self):
161 |         """Set all global random number generators to this state."""
162 |         random.setstate(self.py)
163 |         np.random.set_state(self.np)
164 |         torch.set_rng_state(self.torch)
165 | 
166 | 
167 | @contextmanager
168 | def random_state(state):
169 |     """Execute code inside this with-block by starting with the specified random state.
170 | 
171 |     Does not affect the state of random number generators outside this block.
172 |     Not thread-safe.
173 | 
174 |     Args:
175 |         state (RandomState)
176 |     """
177 |     old_state = RandomState()
178 |     state.set_global()
179 |     yield
180 |     old_state.set_global()
181 | 
182 | 
183 | @contextmanager
184 | def random_seed(seed):
185 |     """Execute code inside this with-block using the specified random seed.
186 | 
187 |     Sets the seed for random, numpy.random and torch (CPU).
188 | 
189 |     WARNING: torch GPU seeds are NOT set!
190 | 
191 |     Does not affect the state of random number generators outside this block.
192 |     Not thread-safe.
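
    Illustrative usage (not from the original docstring):

        with random_seed(0):
            x = np.random.rand()   # deterministic given the seed
        # outside the block, the previous RNG state has been restored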
193 | 194 | Args: 195 | seed (int) 196 | """ 197 | state = RandomState() 198 | random.seed(seed) # alter state 199 | np.random.seed(seed) 200 | torch.manual_seed(seed) 201 | yield 202 | state.set_global() 203 | 204 | 205 | class NamedTupleLike(object): 206 | __slots__ = [] 207 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/profile_imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2006, 2008, 2009, 2010 by Canonical Ltd 2 | # Written by John Arbash Meinel 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program; if not, write to the Free Software 16 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | 18 | """A custom importer and regex compiler which logs time spent.""" 19 | 20 | import sys 21 | import time 22 | 23 | 24 | import re 25 | 26 | 27 | _parent_stack = [] 28 | _total_stack = {} 29 | _info = {} 30 | _cur_id = 0 31 | _timer = time.time 32 | if sys.platform == 'win32': 33 | _timer = time.clock 34 | 35 | 36 | def stack_add(name, frame_name, frame_lineno, scope_name=None): 37 | """Start a new record on the stack""" 38 | global _cur_id 39 | _cur_id += 1 40 | this_stack = (_cur_id, name) 41 | 42 | if _parent_stack: 43 | _total_stack[_parent_stack[-1]].append(this_stack) 44 | _total_stack[this_stack] = [] 45 | _parent_stack.append(this_stack) 46 | _info[this_stack] = [len(_parent_stack)-1, frame_name, frame_lineno, scope_name] 47 | 48 | return this_stack 49 | 50 | 51 | def stack_finish(this, cost): 52 | """Finish a given entry, and record its cost in time""" 53 | global _parent_stack 54 | 55 | assert _parent_stack[-1] == this, \ 56 | 'import stack does not end with this %s: %s' % (this, _parent_stack) 57 | _parent_stack.pop() 58 | _info[this].append(cost) 59 | 60 | 61 | def log_stack_info(out_file, sorted=True, hide_fast=True): 62 | # Find all of the roots with import = 0 63 | out_file.write('%5s %5s %-40s @ %s:%s\n' 64 | % ('cum', 'inline', 'name', 'file', 'line')) 65 | todo = [(value[-1], key) for key,value in _info.iteritems() if value[0] == 0] 66 | 67 | if sorted: 68 | todo.sort() 69 | 70 | while todo: 71 | cum_time, cur = todo.pop() 72 | children = _total_stack[cur] 73 | 74 | c_times = [] 75 | 76 | info = _info[cur] 77 | if hide_fast and info[-1] < 0.0001: 78 | continue 79 | 80 | # Compute the module time by removing the children times 81 | mod_time = info[-1] 82 | for child in children: 83 | c_info = _info[child] 84 | mod_time -= c_info[-1] 85 | c_times.append((c_info[-1], child)) 86 | 87 | # indent, cum_time, mod_time, name, 88 | # scope_name, frame_name, frame_lineno 89 | out_file.write('%5.1f %5.1f %-40s @ %s:%d\n' 90 | % (info[-1]*1000., mod_time*1000., 91 | ('+'*info[0] + cur[1]), 92 | info[1], info[2])) 93 | 94 | if sorted: 95 | c_times.sort() 96 | else: 97 | c_times.reverse() 98 | todo.extend(c_times) 99 | 100 | 101 | _real_import = __import__ 
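

# Illustrative usage sketch (not part of the original module). install() swaps
# timed_import in for the builtin __import__ (and timed_compile in for
# re._compile), so that subsequent imports are timed and recorded:
#
#   import sys
#   import profile_imports
#   profile_imports.install()
#   import json                          # this import is now timed
#   profile_imports.uninstall()
#   profile_imports.log_stack_info(sys.stderr)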
102 | 103 | def timed_import(name, globals=None, locals=None, fromlist=None, level=None): 104 | """Wrap around standard importer to log import time""" 105 | # normally there are 4, but if this is called as __import__ eg by 106 | # /usr/lib/python2.6/email/__init__.py then there may be only one 107 | # parameter 108 | # level is only passed by python2.6 109 | 110 | if globals is None: 111 | # can't determine the scope name afaics; we could peek up the stack to 112 | # see where this is being called from, but it should be a rare case. 113 | scope_name = None 114 | else: 115 | scope_name = globals.get('__name__', None) 116 | if scope_name is None: 117 | scope_name = globals.get('__file__', None) 118 | if scope_name is None: 119 | scope_name = globals.keys() 120 | else: 121 | # Trim out paths before bzrlib 122 | loc = scope_name.find('bzrlib') 123 | if loc != -1: 124 | scope_name = scope_name[loc:] 125 | # For stdlib, trim out early paths 126 | loc = scope_name.find('python2.4') 127 | if loc != -1: 128 | scope_name = scope_name[loc:] 129 | 130 | # Figure out the frame that is doing the importing 131 | frame = sys._getframe(1) 132 | frame_name = frame.f_globals.get('__name__', '') 133 | extra = '' 134 | if frame_name.endswith('demandload'): 135 | # If this was demandloaded, we have 3 frames to ignore 136 | extra = '(demandload) ' 137 | frame = sys._getframe(4) 138 | frame_name = frame.f_globals.get('__name__', '') 139 | elif frame_name.endswith('lazy_import'): 140 | # If this was lazily imported, we have 3 frames to ignore 141 | extra = '[l] ' 142 | frame = sys._getframe(4) 143 | frame_name = frame.f_globals.get('__name__', '') 144 | if fromlist: 145 | extra += ' [%s]' % (', '.join(map(str, fromlist)),) 146 | frame_lineno = frame.f_lineno 147 | 148 | this = stack_add(extra + name, frame_name, frame_lineno, scope_name) 149 | 150 | tstart = _timer() 151 | try: 152 | # Do the import 153 | mod = _real_import(name, globals, locals, fromlist) 154 | finally: 155 | tload = _timer()-tstart 156 | stack_finish(this, tload) 157 | 158 | return mod 159 | 160 | 161 | _real_compile = re._compile 162 | 163 | 164 | def timed_compile(*args, **kwargs): 165 | """Log how long it takes to compile a regex""" 166 | 167 | # And who is requesting this? 
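    # (Illustrative note, not in the original: at this depth, frame 0 is
    # timed_compile itself and frame 1 is re.compile, so _getframe(2) lands on
    # the code that actually requested the regex.)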
168 | frame = sys._getframe(2) 169 | frame_name = frame.f_globals.get('__name__', '') 170 | 171 | extra = '' 172 | if frame_name.endswith('lazy_regex'): 173 | # If this was lazily compiled, we have 3 more frames to ignore 174 | extra = '[l] ' 175 | frame = sys._getframe(5) 176 | frame_name = frame.f_globals.get('__name__', '') 177 | frame_lineno = frame.f_lineno 178 | this = stack_add(extra+repr(args[0]), frame_name, frame_lineno) 179 | 180 | tstart = _timer() 181 | try: 182 | # Measure the compile time 183 | comp = _real_compile(*args, **kwargs) 184 | finally: 185 | tcompile = _timer() - tstart 186 | stack_finish(this, tcompile) 187 | 188 | return comp 189 | 190 | 191 | def install(): 192 | """Install the hooks for measuring import and regex compile time.""" 193 | __builtins__['__import__'] = timed_import 194 | re._compile = timed_compile 195 | 196 | 197 | def uninstall(): 198 | """Remove the import and regex compile timing hooks.""" 199 | __builtins__['__import__'] = _real_import 200 | re._compile = _real_compile 201 | 202 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from os.path import join 3 | 4 | import pytest 5 | from gtd.text import PhraseMatcher 6 | from gtd.utils import FileMemoized, SimpleExecutor, as_batches, Failure, NestedDict, EqualityMixinSlots, \ 7 | memoize_with_key_fxn, DictMemoized, ranks, truncated, ClassCounter 8 | 9 | 10 | def test_as_batches(): 11 | items = [0, 1, 2, 3, 4, 5, 6] 12 | assert list(as_batches(items, 2)) == [[0, 1], [2, 3], [4, 5], [6]] 13 | 14 | 15 | def test_file_memoized_represent_args(tmpdir): 16 | path = str(tmpdir.join('fxn')) 17 | 18 | fm = FileMemoized(None, path, None, None) 19 | key = fm._cache_key(['a', 'b'], {'c': 2, 'd': 'e'}) 20 | assert key == join(path, 'a_b_c=2_d=e.txt') 21 | key = fm._cache_key([], {'c': 2, 'd': 'e'}) 22 | assert key == join(path, 'c=2_d=e.txt') 23 | key = fm._cache_key([], dict()) 24 | assert key == join(path, 'NO_KEY.txt') 25 | 26 | 27 | def test_ranks(): 28 | scores = [10, -1, 0.3, 24, 11] 29 | assert ranks(scores, ascending=True) == [3, 1, 2, 5, 4] 30 | assert ranks(scores, ascending=False) == [3, 5, 4, 1, 2] 31 | 32 | 33 | class TestUtils(TestCase): 34 | 35 | def test_phrase_matcher(self): 36 | phrases = [[1, 2, 3], [1, ], [2, ], [2, 4]] 37 | not_phrases = [[1, 2], [4, ]] 38 | 39 | pm = PhraseMatcher(phrases) 40 | 41 | for phrase in phrases: 42 | self.assertTrue(pm.has_phrase(phrase)) 43 | 44 | for phrase in not_phrases: 45 | self.assertFalse(pm.has_phrase(phrase)) 46 | 47 | tokens = [1, 2, 1, 2, 3, 2, 3, 2, 4] 48 | 49 | matches = pm.match(tokens) 50 | 51 | correct = [((1,), 0, 1), 52 | ((2,), 1, 2), 53 | ((1,), 2, 3), 54 | ((2,), 3, 4), 55 | ((1, 2, 3), 2, 5), 56 | ((2,), 5, 6), 57 | ((2,), 7, 8), 58 | ((2, 4), 7, 9)] 59 | 60 | self.assertEqual(matches, correct) 61 | 62 | 63 | class TestSimpleExecutor(object): 64 | 65 | def test_context_manager(self): 66 | fxn = lambda x: 2 * x 67 | with SimpleExecutor(fxn, max_workers=2) as ex: 68 | for i, x in enumerate(range(10)): 69 | ex.submit(i, x) 70 | results = {k: v for k, v in ex.results()} 71 | 72 | correct = {k: 2 * k for k in range(10)} 73 | assert results == correct 74 | 75 | 76 | class TestFailure(object): 77 | def test_eq(self): 78 | f0 = Failure() 79 | f1 = Failure() 80 | f2 = Failure(uid=1) 81 | f3 = Failure(uid=1, message='different message') 82 | assert f0 != f1 # 
different id 83 | assert f1 != f2 # different id 84 | assert f2 == f3 # same id 85 | 86 | 87 | class TestNestedDict(object): 88 | @pytest.fixture 89 | def normal_dict(self): 90 | return { 91 | 'a': 1, 92 | 'b': { 93 | 'c': 2, 94 | 'd': 3, 95 | }, 96 | } 97 | 98 | @pytest.fixture 99 | def nested_dict(self, normal_dict): 100 | return NestedDict(normal_dict) 101 | 102 | def test_as_dict(self, nested_dict, normal_dict): 103 | assert nested_dict.as_dict() == normal_dict 104 | 105 | def test_iter(self, nested_dict): 106 | assert set(nested_dict) == {'a', 'b'} 107 | 108 | def test_len(self, nested_dict): 109 | assert len(nested_dict) == 3 110 | 111 | def test_nested(self): 112 | d = NestedDict() 113 | d.set_nested(('a', 'b', 'c'), 1) 114 | d.set_nested(('a', 'd'), 2) 115 | 116 | assert d.as_dict() == { 117 | 'a': { 118 | 'b': { 119 | 'c': 1 120 | }, 121 | 'd': 2, 122 | } 123 | } 124 | assert d.get_nested(('a', 'd')) == 2 125 | 126 | with pytest.raises(KeyError): 127 | d.get_nested(('a', 'd', 'e')) 128 | 129 | def test_leaves(self, nested_dict): 130 | assert set(nested_dict.leaves()) == {1, 2, 3} 131 | 132 | 133 | class DummySlotsObject(EqualityMixinSlots): 134 | __slots__ = ['a', 'b', 'c'] 135 | 136 | def __init__(self, a, b, c=None): 137 | self.a = a 138 | self.b = b 139 | 140 | if c: 141 | self.c = c 142 | 143 | 144 | class TestEqualityMixinSlot(object): 145 | def test_equality(self): 146 | d1 = DummySlotsObject(5, 10) 147 | d2 = DummySlotsObject(5, 10) 148 | assert d1 == d2 149 | 150 | d3 = DummySlotsObject(5, 10, 20) 151 | d4 = DummySlotsObject(5, 11) 152 | assert d1 != d3 153 | assert d1 != d4 154 | 155 | 156 | class MemoizedClass(object): 157 | def __init__(self): 158 | self.calls = 0 159 | 160 | @memoize_with_key_fxn(lambda self, a, b: b) # key fxn only uses b 161 | def fxn_to_memoize(self, a, b): 162 | self.calls += 1 163 | return a + b 164 | 165 | 166 | class MemoizedClass2(object): 167 | def __init__(self): 168 | self.calls = 0 169 | 170 | def fxn(self, a, b): 171 | self.calls += 1 172 | return a + b 173 | 174 | fxn_memoized = DictMemoized(fxn) 175 | 176 | 177 | class TestDictMemoized(object): 178 | def test(self): 179 | mc = MemoizedClass2() 180 | result = mc.fxn_memoized('a', 'b') 181 | assert result == 'ab' 182 | assert mc.calls == 1 183 | 184 | result2 = mc.fxn_memoized('a', 'b') 185 | assert result2 == 'ab' 186 | assert mc.calls == 1 187 | 188 | result2 = mc.fxn_memoized('b', 'b') 189 | assert result2 == 'bb' 190 | assert mc.calls == 2 191 | 192 | 193 | class TestMemoizeWithKey(object): 194 | def test_caching(self): 195 | mc = MemoizedClass() 196 | result = mc.fxn_to_memoize('hey', 'there') 197 | assert mc.calls == 1 198 | assert result == 'heythere' 199 | 200 | # returns cached result 201 | result2 = mc.fxn_to_memoize('hey', 'there') 202 | assert result2 == 'heythere' 203 | assert mc.calls == 1 204 | 205 | # computes new result 206 | result3 = mc.fxn_to_memoize('hey', 'what') 207 | assert mc.calls == 2 208 | 209 | # only caches on 2nd arg, 'there', not 'you' 210 | result4 = mc.fxn_to_memoize('you', 'there') 211 | assert result4 == 'heythere' 212 | assert mc.calls == 2 213 | 214 | 215 | def test_truncated(): 216 | items = [1, 3, 1, 2, 2, 4] 217 | assert list(truncated(items, 3)) == [1, 3, 1] 218 | assert list(truncated(items, 0)) == [] 219 | 220 | 221 | class Dummy(object): 222 | def __init__(self, x): 223 | self.x = x 224 | 225 | 226 | class TestClassCounter(object): 227 | def test_count(self): 228 | counter = ClassCounter(Dummy) 229 | assert counter.count() == 0 230 | 231 | b1 = 
Dummy('1') 232 | b2 = Dummy('2') 233 | assert counter.count() == 2 234 | 235 | b3 = Dummy('1') 236 | assert counter.count() == 3 237 | 238 | b1 = Dummy('3') 239 | assert counter.count() == 3 # lost a reference to 1, due to reassignment 240 | 241 | del b3 242 | assert counter.count() == 2 # lost one due to deletion 243 | 244 | # this line is here to prevent b1 and b2 from being garbage collected 245 | x = [b1, b2] 246 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import numbers 5 | import os 6 | import platform 7 | import resource 8 | import sys 9 | from collections import MutableMapping 10 | from contextlib import contextmanager 11 | from os.path import join 12 | 13 | from IPython.core.display import display, HTML 14 | from pyhocon import ConfigFactory 15 | from pyhocon import ConfigMissingException 16 | from pyhocon import ConfigTree 17 | from pyhocon import HOCONConverter 18 | 19 | from gtd.utils import NestedDict, Config 20 | 21 | 22 | def in_ipython(): 23 | try: 24 | __IPYTHON__ 25 | return True 26 | except NameError: 27 | return False 28 | 29 | 30 | def print_with_fonts(tokens, sizes, colors, background=None): 31 | 32 | def style(text, size=12, color='black'): 33 | return u'{}'.format(size, color, text) 34 | 35 | styled = [style(token, size, color) for token, size, color in zip(tokens, sizes, colors)] 36 | text = u' '.join(styled) 37 | 38 | if background: 39 | text = u'{}'.format(background, text) 40 | 41 | display(HTML(text)) 42 | 43 | 44 | def gb_used(): 45 | used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 46 | if platform.system() != 'Darwin': 47 | # on Linux, used is in terms of kilobytes 48 | power = 2 49 | else: 50 | # on Mac, used is in terms of bytes 51 | power = 3 52 | return float(used) / math.pow(1024, power) 53 | 54 | 55 | class Metadata(MutableMapping): 56 | """A wrapper around ConfigTree. 57 | 58 | Supports a name_scope contextmanager. 
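
    Illustrative usage (not from the original docstring):

        m = Metadata()
        with m.name_scope('optim'):
            m['lr'] = 0.001       # stored under the key 'optim.lr'
        lr = m['optim.lr']        # -> 0.001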
59 | """ 60 | def __init__(self, config_tree=None): 61 | if config_tree is None: 62 | config_tree = ConfigTree() 63 | 64 | self._config_tree = config_tree 65 | self._namestack = [] 66 | 67 | @contextmanager 68 | def name_scope(self, name): 69 | self._namestack.append(name) 70 | yield 71 | self._namestack.pop() 72 | 73 | def _full_key(self, key): 74 | return '.'.join(self._namestack + [key]) 75 | 76 | def __getitem__(self, key): 77 | try: 78 | val = self._config_tree.get(self._full_key(key)) 79 | except ConfigMissingException: 80 | raise KeyError(key) 81 | 82 | if isinstance(val, ConfigTree): 83 | return Metadata(val) 84 | return val 85 | 86 | def __setitem__(self, key, value): 87 | """Put a value (key is a dot-separated name).""" 88 | self._config_tree.put(self._full_key(key), value) 89 | 90 | def __delitem__(self, key): 91 | raise NotImplementedError() 92 | 93 | def __iter__(self): 94 | return iter(self._config_tree) 95 | 96 | def __len__(self): 97 | return len(self._config_tree) 98 | 99 | def __repr__(self): 100 | return self.to_str() 101 | 102 | def to_str(self, fmt='hocon'): 103 | return HOCONConverter.convert(self._config_tree, fmt) 104 | 105 | def to_file(self, path, fmt='hocon'): 106 | with open(path, 'w') as f: 107 | f.write(self.to_str(fmt)) 108 | 109 | @classmethod 110 | def from_file(cls, path, fmt='hocon'): 111 | if fmt == 'hocon': 112 | config_tree = ConfigFactory.parse_file(path) 113 | elif fmt == 'json': 114 | with open(path, 'r') as f: 115 | d = json.load(f) 116 | config_tree = ConfigFactory.from_dict(d) 117 | else: 118 | raise ValueError('Invalid format: {}'.format(fmt)) 119 | 120 | return cls(config_tree) 121 | 122 | 123 | class SyncedMetadata(Metadata): 124 | """A Metadata object which writes to file after every change.""" 125 | def __init__(self, path, fmt='hocon'): 126 | if os.path.exists(path): 127 | m = Metadata.from_file(path, fmt) 128 | else: 129 | m = Metadata() 130 | 131 | super(SyncedMetadata, self).__init__(m._config_tree) 132 | self._path = path 133 | self._fmt = fmt 134 | 135 | def __setitem__(self, key, value): 136 | super(SyncedMetadata, self).__setitem__(key, value) 137 | self.to_file(self._path, fmt=self._fmt) 138 | 139 | 140 | def print_list(l): 141 | for item in l: 142 | print item 143 | 144 | 145 | def print_no_newline(s): 146 | sys.stdout.write(s) 147 | sys.stdout.flush() 148 | 149 | 150 | def set_log_level(level): 151 | """Set the log-level of the root logger of the logging module. 
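
    For example, set_log_level('WARN') (equivalently, set_log_level(30)) silences
    INFO and DEBUG messages from every logger that defers to the root logger.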
152 | 153 | Args: 154 | level: can be an integer such as 30 (logging.WARN), or a string such as 'WARN' 155 | """ 156 | if isinstance(level, str): 157 | level = logging._levelNames[level] 158 | 159 | logger = logging.getLogger() # gets root logger 160 | logger.setLevel(level) 161 | 162 | 163 | def jupyter_no_margins(): 164 | """Cause Jupyter notebook to take up 100% of window width.""" 165 | display(HTML("")) 166 | 167 | 168 | class TraceSession(object): 169 | def __init__(self, tracer): 170 | self.tracer = tracer 171 | self._values = {} 172 | 173 | @property 174 | def values(self): 175 | return self._values 176 | 177 | def save(self, save_path): 178 | with open(save_path, 'w') as f: 179 | json.dump(self.values, f, indent=4, sort_keys=True) 180 | 181 | def __enter__(self): 182 | if self.tracer._current_session: 183 | raise RuntimeError('Already in the middle of a TraceSession') 184 | 185 | # register as the current session 186 | self.tracer._current_session = self 187 | return self 188 | 189 | def __exit__(self, exc_type, exc_val, exc_tb): 190 | # un-register 191 | self.tracer._current_session = None 192 | 193 | 194 | class Tracer(object): 195 | """Log values computed during program execution. 196 | 197 | Values are logged to the currently active TraceSession object. 198 | """ 199 | def __init__(self): 200 | self._current_session = None 201 | 202 | def session(self): 203 | return TraceSession(self) 204 | 205 | def log(self, logging_callback): 206 | """If we are in a TraceSession, execute the logging_callback. 207 | 208 | The logging_callback should take a `values` dict as its only argument, and modify `values` in some way. 209 | 210 | Args: 211 | logging_callback (Callable[dict]): a function which takes a `values` dict as its only argument. 212 | """ 213 | if self._current_session is None: 214 | return 215 | logging_callback(self._current_session.values) 216 | 217 | def log_put(self, name, value): 218 | """Log a value. 219 | 220 | Args: 221 | name (str): name of the variable 222 | value (object) 223 | """ 224 | def callback(values): 225 | if name in values: 226 | raise RuntimeError('{} already logged'.format(name)) 227 | values[name] = value 228 | 229 | return self.log(callback) 230 | 231 | def log_append(self, name, value): 232 | """Append a value. 
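
    Unlike log_put, repeated calls with the same name accumulate into a list:
    log_append('loss', 0.5) followed by log_append('loss', 0.4) leaves
    values['loss'] == [0.5, 0.4].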
233 | 234 | Args: 235 | name (str): name of the variable 236 | value (object): value to append 237 | """ 238 | def callback(values): 239 | if name not in values: 240 | values[name] = [] 241 | values[name].append(value) 242 | 243 | return self.log(callback) 244 | 245 | 246 | def indent(s, spaces=4): 247 | whitespace = u' ' * spaces 248 | return u'\n'.join(whitespace + line for line in s.split(u'\n')) 249 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/codalab.py: -------------------------------------------------------------------------------- 1 | """Tools for working with CodaLab.""" 2 | import cPickle as pickle 3 | import json 4 | import os 5 | import platform 6 | import shutil 7 | import sys 8 | import tempfile 9 | from contextlib import contextmanager 10 | 11 | import matplotlib.image as mpimg 12 | from gtd.io import shell 13 | 14 | __author__ = 'kelvinguu' 15 | 16 | 17 | # need to be specified by user 18 | worksheet = None 19 | site = None 20 | 21 | 22 | def get_uuids(): 23 | """List all bundle UUIDs in the worksheet.""" 24 | result = shell('cl ls -w {} -u'.format(worksheet)) 25 | uuids = result.split('\n') 26 | uuids = uuids[1:-1] # trim non uuids 27 | return uuids 28 | 29 | 30 | @contextmanager 31 | def open_file(uuid, path): 32 | """Get the raw file content within a particular bundle at a particular path. 33 | 34 | Path have no leading slash. 35 | """ 36 | # create temporary file just so we can get an unused file path 37 | f = tempfile.NamedTemporaryFile() 38 | f.close() # close and delete right away 39 | fname = f.name 40 | 41 | # download file to temporary path 42 | cmd ='cl down -o {} -w {} {}/{}'.format(fname, worksheet, uuid, path) 43 | try: 44 | shell(cmd) 45 | except RuntimeError: 46 | try: 47 | os.remove(fname) # if file exists, remove it 48 | except OSError: 49 | pass 50 | raise IOError('Failed to open file {}/{}'.format(uuid, path)) 51 | 52 | f = open(fname) 53 | yield f 54 | f.close() 55 | os.remove(fname) # delete temp file 56 | 57 | 58 | class Bundle(object): 59 | def __init__(self, uuid): 60 | self.uuid = uuid 61 | 62 | def __getattr__(self, item): 63 | """ 64 | Load attributes: history, meta on demand 65 | """ 66 | if item == 'history': 67 | try: 68 | with open_file(self.uuid, 'history.cpkl') as f: 69 | value = pickle.load(f) 70 | except IOError: 71 | value = {} 72 | 73 | elif item == 'meta': 74 | try: 75 | with open_file(self.uuid, 'meta.json') as f: 76 | value = json.load(f) 77 | except IOError: 78 | value = {} 79 | 80 | # load codalab info 81 | fields = ('uuid', 'name', 'bundle_type', 'state', 'time', 'remote') 82 | cmd = 'cl info -w {} -f {} {}'.format(worksheet, ','.join(fields), self.uuid) 83 | result = shell(cmd) 84 | info = dict(zip(fields, result.split())) 85 | value.update(info) 86 | 87 | elif item in ('stderr', 'stdout'): 88 | with open_file(self.uuid, item) as f: 89 | value = f.read() 90 | 91 | else: 92 | raise AttributeError(item) 93 | 94 | self.__setattr__(item, value) 95 | return value 96 | 97 | def __repr__(self): 98 | return self.uuid 99 | 100 | def load_img(self, img_path): 101 | """ 102 | Return an image object that can be immediately plotted with matplotlib 103 | """ 104 | with open_file(self.uuid, img_path) as f: 105 | return mpimg.imread(f) 106 | 107 | 108 | def download_logs(bundle, log_dir): 109 | if bundle.meta['bundle_type'] != 'run' or bundle.meta['state'] == 'queued': 110 | print 'Skipped {}\n'.format(bundle.uuid) 111 | return 112 | 113 | if isinstance(bundle, str): 114 | bundle = 
Bundle(bundle) 115 | 116 | uuid = bundle.uuid 117 | name = bundle.meta['name'] 118 | log_path = os.path.join(log_dir, '{}_{}'.format(name, uuid)) 119 | 120 | cmd ='cl down -o {} -w {} {}/logs'.format(log_path, worksheet, uuid) 121 | 122 | print uuid 123 | try: 124 | shell(cmd, verbose=True) 125 | except RuntimeError: 126 | print 'Failed to download', bundle.uuid 127 | print 128 | 129 | 130 | def report(render, uuids=None, reverse=True, limit=None): 131 | if uuids is None: 132 | uuids = get_uuids() 133 | 134 | if reverse: 135 | uuids = uuids[::-1] 136 | 137 | if limit is not None: 138 | uuids = uuids[:limit] 139 | 140 | for uuid in uuids: 141 | bundle = Bundle(uuid) 142 | try: 143 | render(bundle) 144 | except Exception: 145 | print 'Failed to render', bundle.uuid 146 | 147 | 148 | def monitor_jobs(logdir, uuids=None, reverse=True, limit=None): 149 | if os.path.exists(logdir): 150 | delete = raw_input('Overwrite existing logdir? ({})'.format(logdir)) 151 | if delete == 'y': 152 | shutil.rmtree(logdir) 153 | os.makedirs(logdir) 154 | else: 155 | os.makedirs(logdir) 156 | print 'Using logdir:', logdir 157 | 158 | report(lambda bd: download_logs(bd, logdir), uuids, reverse, limit) 159 | 160 | 161 | def tensorboard(logdir): 162 | print 'Run this in bash:' 163 | shell('tensorboard --logdir={}'.format(logdir), verbose=True, debug=True) 164 | print '\nGo to TensorBoard: http://localhost:6006/' 165 | 166 | 167 | def add_to_sys_path(path): 168 | """Add a path to the system PATH.""" 169 | sys.path.insert(0, path) 170 | 171 | 172 | def configure_matplotlib(): 173 | """Set Matplotlib backend to 'Agg', which is necessary on CodaLab docker image.""" 174 | import warnings 175 | import matplotlib 176 | with warnings.catch_warnings(): 177 | warnings.simplefilter('ignore') 178 | matplotlib.use('Agg') # needed when running from server 179 | 180 | 181 | def in_codalab(): 182 | """Check if we are running inside CodaLab Docker container or not.""" 183 | # TODO: below is a total hack. If the OS is not a Mac, we assume we're on CodaLab. 184 | return platform.system() != 'Darwin' 185 | 186 | 187 | def upload(full_path, bundle_name=None, excludes='*.ipynb .git .ipynb_checkpoints .ignore'): 188 | """ 189 | Upload a file or directory to the codalab worksheet 190 | Args: 191 | full_path: Path + filename of file to upload 192 | bundle_name: Name to upload file/directory as. I 193 | """ 194 | directory, filename = os.path.split(full_path) 195 | if bundle_name is None: 196 | bundle_name = filename 197 | shell('cl up -n {} -w {} {} -x {}'.format(bundle_name, worksheet, full_path, excludes), verbose=True) 198 | 199 | 200 | def launch_job(job_name, cmd, 201 | dependencies=tuple(), 202 | queue='john', image='kelvinguu/gtd:1.0', 203 | memory=None, cpus='5', 204 | network=False, 205 | debug=False, tail=False): 206 | """Launch a job on CodaLab (optionally upload code that the job depends on). 207 | 208 | Args: 209 | job_name: name of the job 210 | cmd: command to execute 211 | dependencies: list of other bundles that we depend on 212 | debug: if True, prints SSH commands, but does not execute them 213 | tail: show the streaming output returned by CodaLab once it launches the job 214 | """ 215 | print 'Remember to set up SSH tunnel and LOG IN through the command line before calling this.' 
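    # (Illustrative note, not in the original: the command assembled below has
    # the shape  cl run -v -n <job_name> -w <worksheet> ... <dep>:<dep> '<cmd>'
    # and is executed via gtd.io.shell.)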
216 | options = '-v -n {} -w {} --request-queue {} --request-docker-image {} --request-cpus {}'.format( 217 | job_name, worksheet, queue, image, cpus) 218 | 219 | if memory: 220 | options += ' --request-memory {}'.format(memory) 221 | if network: 222 | options += ' --request-network' 223 | 224 | dep_str = ' '.join(['{0}:{0}'.format(dep) for dep in dependencies]) 225 | full_cmd = "cl run {} {} '{}'".format(options, dep_str, cmd) 226 | if tail: 227 | full_cmd += ' -t' 228 | shell(full_cmd, verbose=True, debug=debug) 229 | 230 | 231 | if in_codalab(): 232 | configure_matplotlib() 233 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/source_encoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractproperty 2 | from collections import namedtuple 3 | from itertools import izip 4 | 5 | import torch 6 | from gtd.ml.torch.recurrent import tile_state, gated_update 7 | from torch.nn import Module 8 | from torch.nn import Parameter 9 | 10 | from gtd.ml.torch.seq_batch import SequenceBatchElement 11 | 12 | 13 | class SourceEncoder(Module): 14 | __metaclass__ = ABCMeta 15 | 16 | @abstractproperty 17 | def hidden_dim(self): 18 | raise NotImplementedError 19 | 20 | @abstractmethod 21 | def forward(self, input_embeds_list): 22 | """Embed a source sequence. 23 | 24 | Args: 25 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 26 | 27 | Returns: 28 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 29 | """ 30 | raise NotImplementedError 31 | 32 | 33 | class SimpleSourceEncoder(SourceEncoder): 34 | def __init__(self, rnn_cell): 35 | """ 36 | 37 | Args: 38 | rnn_cell (DecoderCell) 39 | """ 40 | super(SimpleSourceEncoder, self).__init__() 41 | self.rnn_cell = rnn_cell 42 | hidden_dim = self.rnn_cell.hidden_size 43 | self.h0 = Parameter(torch.zeros(hidden_dim)) 44 | self.c0 = Parameter(torch.zeros(hidden_dim)) 45 | self._hidden_dim = hidden_dim 46 | 47 | @property 48 | def hidden_dim(self): 49 | return self._hidden_dim 50 | 51 | def forward(self, input_embeds_list): 52 | """ 53 | 54 | Args: 55 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 56 | 57 | Returns: 58 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 59 | """ 60 | batch_size = input_embeds_list[0].values.size()[0] 61 | 62 | h = tile_state(self.h0, batch_size) # (batch_size, hidden_dim) 63 | c = tile_state(self.c0, batch_size) # (batch_size, hidden_dim) 64 | 65 | hidden_states_list = [] 66 | 67 | for t, x in enumerate(input_embeds_list): 68 | # x.values has shape (batch_size, input_dim) 69 | # x.mask has shape (batch_size, 1) 70 | h_new, c_new = self.rnn_cell(x.values, (h, c)) 71 | h = gated_update(h, h_new, x.mask) 72 | c = gated_update(c, c_new, x.mask) 73 | hidden_states_list.append(SequenceBatchElement(h, x.mask)) 74 | 75 | return hidden_states_list 76 | 77 | 78 | class BidirectionalSourceEncoder(SourceEncoder): 79 | def __init__(self, input_dim, hidden_dim, rnn_cell_factory): 80 | super(BidirectionalSourceEncoder, self).__init__() 81 | 82 | if hidden_dim % 2 != 0: 83 | raise ValueError('hidden_dim must be even for BidirectionalSourceEncoder.') 84 | self._hidden_dim = hidden_dim 85 | 86 | build_encoder = lambda: SimpleSourceEncoder(rnn_cell_factory(input_dim, hidden_dim / 2)) 87 | 
self.forward_encoder = build_encoder() 88 | self.backward_encoder = build_encoder() 89 | 90 | @property 91 | def hidden_dim(self): 92 | return self._hidden_dim 93 | 94 | def forward(self, input_embeds_list): 95 | """Compute bidirectional RNN embeddings. 96 | 97 | Args: 98 | input_embeds_list (list[SequenceBatchElement]) 99 | 100 | Returns: 101 | forward_states (list[SequenceBatchElement]): ordered left to right 102 | backward_states (list[SequenceBatchElement]): ordered left to right 103 | """ 104 | reverse = lambda seq: list(reversed(seq)) 105 | forward_states = self.forward_encoder(input_embeds_list) 106 | backward_states = reverse(self.backward_encoder(reverse(input_embeds_list))) 107 | return BidirectionalEncoderOutput(forward_states, backward_states) 108 | 109 | 110 | class BidirectionalEncoderOutput(namedtuple('BidirectionalEncoderOutput', ['forward_states', 'backward_states'])): 111 | """ 112 | Attributes: 113 | forward_states (list[SequenceBatchElement]): ordered left to right 114 | backward_states (list[SequenceBatchElement]): ordered left to right 115 | """ 116 | @property 117 | def combined_states(self): 118 | """Concatenates forward and backward hidden states: [forward; backward]. 119 | 120 | Returns: 121 | combined_states (list[SequenceBatchElement]): ordered left to right 122 | """ 123 | combined_states = [SequenceBatchElement(torch.cat([f.values, b.values], 1), f.mask) 124 | for f, b in izip(self.forward_states, self.backward_states)] 125 | return combined_states 126 | 127 | @property 128 | def final_states(self): 129 | """Return the final forward and backward states. 130 | 131 | Returns: 132 | forward_state (Variable): right-most forward state, of shape (batch_size, hidden_dim) 133 | backward_state (Variable): left-most backward state, of shape (batch_size, hidden_dim) 134 | """ 135 | return self.forward_states[-1].values, self.backward_states[0].values 136 | 137 | 138 | # TODO(kelvin): test this 139 | class MultiLayerSourceEncoder(SourceEncoder): 140 | def __init__(self, input_dim, hidden_dim, num_layers, rnn_cell_factory): 141 | """ 142 | 143 | Args: 144 | input_dim (int) 145 | hidden_dim (int) 146 | num_layers (int) 147 | rnn_cell_factory (Callable[[int, int], RNNCell): takes input_dim and output_dim as arguments. 
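
        Illustrative example (not from the original docstring; hidden_dim must be
        even, since each layer is a BidirectionalSourceEncoder):

            from torch.nn import LSTMCell
            encoder = MultiLayerSourceEncoder(input_dim=100, hidden_dim=200,
                                              num_layers=2, rnn_cell_factory=LSTMCell)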
148 | """ 149 | super(MultiLayerSourceEncoder, self).__init__() 150 | self.layers = [] 151 | for layer in range(num_layers): 152 | in_dim = input_dim if layer == 0 else hidden_dim 153 | out_dim = hidden_dim 154 | encoder = BidirectionalSourceEncoder(in_dim, out_dim, rnn_cell_factory) 155 | self.add_module('encoder_layer_{}'.format(layer), encoder) 156 | self.layers.append(encoder) 157 | 158 | @property 159 | def hidden_dim(self): 160 | return self.layers[-1].hidden_dim 161 | 162 | def forward(self, input_embeds_list): 163 | """ 164 | 165 | Args: 166 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 167 | 168 | Returns: 169 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 170 | """ 171 | for i, layer in enumerate(self.layers): 172 | if i == 0: 173 | prev_hidden_states = input_embeds_list 174 | else: 175 | prev_hidden_states = [SequenceBatchElement(torch.cat([f.values, b.values], 1), f.mask) 176 | for f, b in izip(forward_states, backward_states)] 177 | 178 | new_forward_states, new_backward_states = layer(prev_hidden_states) 179 | 180 | if i == 0: 181 | # no skip connections here, because dimensions don't match 182 | forward_states, backward_states = new_forward_states, new_backward_states 183 | else: 184 | # add residuals to previous hidden states 185 | add_residuals = lambda a_list, b_list: [SequenceBatchElement(a.values + b.values, a.mask) 186 | for a, b in izip(a_list, b_list)] 187 | 188 | forward_states = add_residuals(forward_states, new_forward_states) 189 | backward_states = add_residuals(backward_states, new_backward_states) 190 | 191 | return BidirectionalEncoderOutput(forward_states, backward_states) -------------------------------------------------------------------------------- /textmorph/edit_model/edit_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear, Parameter, Hardtanh 3 | from gtd.ml.torch.utils import GPUVariable 4 | from gtd.ml.torch.seq_batch import SequenceBatch 5 | import numpy as np 6 | 7 | class EditEncoder(Module): 8 | """ 9 | EditEncoder maps insert / delete embeddings into a single edit vector of dimensionality edit_dim 10 | """ 11 | def __init__(self, word_dim, edit_dim, kappa_init, norm_eps, norm_max): 12 | super(EditEncoder, self).__init__() 13 | self.linear = Linear(edit_dim, edit_dim) 14 | self.linear_prenoise = Linear(word_dim, edit_dim/2, bias=False) 15 | self.noise_scaler = kappa_init 16 | self.norm_eps = norm_eps 17 | self.norm_max = norm_max 18 | self.normclip = Hardtanh(0, self.norm_max - norm_eps) 19 | 20 | def forward(self, insert_embeds, insert_embeds_exact, delete_embeds, delete_embeds_exact, draw_samples = False, draw_p = False): 21 | """Create agenda vector. 22 | 23 | Args: 24 | insert_embeds (SequenceBatch): of shape (batch_size, max_edits, word_dim) 25 | insert_embeds_exact (SequenceBatch): of shape (batch_size, max_edits, word_dim) 26 | delete_embeds (SequenceBatch): of shape (batch_size, max_edits, word_dim) 27 | delete_embeds_exact (SequenceBatch): of shape (batch_size, max_edits, word_dim) 28 | draw_samples (bool) : flag for whether to add noise for variational approx. disable at test time. 
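            draw_p (bool): (not documented in the original) when True together
                with draw_samples, the computed edit vector is discarded and
                replaced by a random draw from the noise prior (see draw_p_noise);
                otherwise vMF noise is applied around the computed vector.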
29 | 
30 |         Returns:
31 |             edit_embed (Variable): of shape (batch_size, edit_vec_dim)
32 |         """
33 |         insert_embed = SequenceBatch.reduce_sum(insert_embeds)  # (batch_size, word_dim)
34 |         insert_embed += SequenceBatch.reduce_sum(insert_embeds_exact)  # (batch_size, word_dim)
35 |         delete_embed = SequenceBatch.reduce_sum(delete_embeds)  # (batch_size, word_dim)
36 |         delete_embed += SequenceBatch.reduce_sum(delete_embeds_exact)  # (batch_size, word_dim)
37 |         insert_set = self.linear_prenoise(insert_embed)
38 |         delete_set = self.linear_prenoise(delete_embed)
39 |         combined_map = torch.cat([insert_set, delete_set], 1)
40 |         if draw_samples:
41 |             if draw_p:
42 |                 batch_size, edit_dim = combined_map.size()
43 |                 combined_map = self.draw_p_noise(batch_size, edit_dim)
44 |             else:
45 |                 combined_map = self.sample_vMF(combined_map, self.noise_scaler)
46 |         edit_embed = combined_map
47 |         return edit_embed
48 | 
49 |     def seq_batch_noise(self, seq_batch, draw_noise):
50 |         """
51 |         Returns a noisy version of seq_batch, in which every vector is noisy and unit norm.
52 |         :param seq_batch (SequenceBatch): a sequence batch of elements
53 |         :return: noisy version of seq_batch
54 |         """
55 |         values = seq_batch.values
56 |         mask = seq_batch.mask
57 |         batch_size, max_edits, w_embed_size = values.size()
58 |         new_values = GPUVariable(torch.from_numpy(np.zeros((batch_size, max_edits, w_embed_size), dtype=np.float32)))
59 |         phint = self.sample_vMF(values[:, 0, :], self.noise_scaler)
60 |         prand = self.draw_p_noise(batch_size, w_embed_size)
61 |         m_expand = mask.expand(batch_size, w_embed_size)
62 |         new_values[:, 0, :] = phint * m_expand + prand * (1 - m_expand)
63 |         return SequenceBatch(values=new_values * draw_noise, mask=mask)
64 | 
65 |     def draw_p_noise(self, batch_size, edit_dim):
66 |         rand_draw = GPUVariable(torch.randn(batch_size, edit_dim))
67 |         rand_draw = rand_draw / torch.norm(rand_draw, p=2, dim=1).expand(batch_size, edit_dim)
68 |         rand_norms = (torch.rand(batch_size, 1) * self.norm_max).expand(batch_size, edit_dim)
69 |         return rand_draw * GPUVariable(rand_norms)
70 | 
71 | 
72 |     def add_norm_noise(self, munorm, eps):
73 |         """
74 |         KL loss is - log(maxvalue/eps)
75 |         cut at maxvalue-eps, and add [0, eps] noise.
76 |         """
77 |         trand = torch.rand(1).expand(munorm.size()) * eps
78 |         return (self.normclip(munorm) + GPUVariable(trand))
79 | 
80 |     def sample_vMF(self, mu, kappa):
81 |         """vMF sampler in pytorch.
82 | 
83 |         http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python
84 | 
85 |         Args:
86 |             mu (Tensor): of shape (batch_size, 2*word_dim)
87 |             kappa (Float): controls dispersion. kappa of zero is no dispersion.
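
        Returns:
            sampled_vec (Tensor): of shape (batch_size, 2*word_dim); each row is
                one vMF sample centered on the corresponding (norm-clipped,
                norm-noised) row of mu.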
88 | """ 89 | batch_size, id_dim = mu.size() 90 | result_list = [] 91 | for i in range(batch_size): 92 | munorm = mu[i].norm().expand(id_dim) 93 | munoise = self.add_norm_noise(munorm, self.norm_eps) 94 | if float(mu[i].norm().data.cpu().numpy()) > 1e-10: 95 | # sample offset from center (on sphere) with spread kappa 96 | w = self._sample_weight(kappa, id_dim) 97 | wtorch = GPUVariable(w*torch.ones(id_dim)) 98 | 99 | # sample a point v on the unit sphere that's orthogonal to mu 100 | v = self._sample_orthonormal_to(mu[i]/munorm, id_dim) 101 | 102 | # compute new point 103 | scale_factr = torch.sqrt(GPUVariable(torch.ones(id_dim)) - torch.pow(wtorch,2)) 104 | orth_term = v * scale_factr 105 | muscale = mu[i] * wtorch / munorm 106 | sampled_vec = (orth_term + muscale)*munoise 107 | else: 108 | rand_draw = GPUVariable(torch.randn(id_dim)) 109 | rand_draw = rand_draw / torch.norm(rand_draw, p=2).expand(id_dim) 110 | rand_norms = (torch.rand(1) * self.norm_eps).expand(id_dim) 111 | sampled_vec = rand_draw*GPUVariable(rand_norms)#mu[i] 112 | result_list.append(sampled_vec) 113 | 114 | return torch.stack(result_list,0) 115 | 116 | def _sample_weight(self, kappa, dim): 117 | """Rejection sampling scheme for sampling distance from center on 118 | surface of the sphere. 119 | """ 120 | dim = dim - 1 # since S^{n-1} 121 | b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) # b= 1/(sqrt(4.* kdiv**2 + 1) + 2 * kdiv) 122 | x = (1. - b) / (1. + b) 123 | c = kappa * x + dim * np.log(1 - x ** 2) # dim * (kdiv *x + np.log(1-x**2)) 124 | 125 | while True: 126 | z = np.random.beta(dim / 2., dim / 2.) #concentrates towards 0.5 as d-> inf 127 | w = (1. - (1. + b) * z) / (1. - (1. - b) * z) 128 | u = np.random.uniform(low=0, high=1) 129 | if kappa * w + dim * np.log(1. - x * w) - c >= np.log(u): #thresh is dim *(kdiv * (w-x) + log(1-x*w) -log(1-x**2)) 130 | return w 131 | 132 | def _sample_orthonormal_to(self, mu, dim): 133 | """Sample point on sphere orthogonal to mu. 134 | """ 135 | v = GPUVariable(torch.randn(dim)) 136 | rescale_value = mu.dot(v) / mu.norm() 137 | proj_mu_v = mu * rescale_value.expand(dim) 138 | ortho = v - proj_mu_v 139 | ortho_norm = torch.norm(ortho) 140 | return ortho / ortho_norm.expand_as(ortho) 141 | 142 | def test_sample_weight(kappa, dim): 143 | """Rejection sampling scheme for sampling distance from center on 144 | surface of the sphere. 145 | """ 146 | dim = dim - 1 # since S^{n-1} 147 | b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) 148 | x = (1. - b) / (1. + b) 149 | c = kappa * x + dim * np.log(1 - x ** 2) 150 | 151 | while True: 152 | z = np.random.beta(dim / 2., dim / 2.) 153 | w = (1. - (1. + b) * z) / (1. - (1. - b) * z) 154 | u = np.random.uniform(low=0, high=1) 155 | if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): 156 | return w 157 | 158 | def get_ev(kappa,dim,nsamp): 159 | samp_in = np.array([test_sample_weight(kappa,dim) for i in xrange(nsamp)]) 160 | return np.mean(samp_in), np.std(samp_in), np.percentile(samp_in, np.arange(0,100,10)) 161 | 162 | def get_mode(kappa,dim): 163 | return np.sqrt(4*(kappa**2.0)+dim**2.0+6*dim+9)/(2*kappa) - (dim+3.0)/(2*kappa) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/vocab.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from abc import ABCMeta, abstractmethod 3 | from collections import Mapping 4 | 5 | import numpy as np 6 | 7 | from gtd.chrono import verboserate 8 | from gtd.io import num_lines 9 | from gtd.utils import EqualityMixin, random_seed 10 | 11 | 12 | class Vocab(object): 13 | __metaclass__ = ABCMeta 14 | 15 | @abstractmethod 16 | def word2index(self, w): 17 | pass 18 | 19 | @abstractmethod 20 | def index2word(self, i): 21 | pass 22 | 23 | 24 | class SimpleVocab(Vocab, EqualityMixin): 25 | """A simple vocabulary object.""" 26 | 27 | def __init__(self, tokens): 28 | """Create a vocab. 29 | 30 | Args: 31 | tokens (list[unicode]): a unique list of unicode tokens 32 | 33 | If t = tokens[i], this vocab will map token t to the integer i. 34 | """ 35 | if not isinstance(tokens, list): 36 | raise ValueError('tokens must be a list') 37 | 38 | # build mapping 39 | word2index = {} 40 | for i, tok in enumerate(tokens): 41 | word2index[tok] = i 42 | 43 | if len(tokens) != len(word2index): 44 | raise ValueError('tokens must be unique') 45 | 46 | self._index2word = list(tokens) # make a copy 47 | self._word2index = word2index 48 | 49 | @property 50 | def tokens(self): 51 | """Return the full list of tokens sorted by their index.""" 52 | return self._index2word 53 | 54 | def __iter__(self): 55 | """Iterate through the full list of tokens.""" 56 | return iter(self._index2word) 57 | 58 | def __len__(self): 59 | """Total number of tokens indexed.""" 60 | return len(self._index2word) 61 | 62 | def __contains__(self, w): 63 | """Check if a token has been indexed by this vocab.""" 64 | return w in self._word2index 65 | 66 | def word2index(self, w): 67 | return self._word2index[w] 68 | 69 | def index2word(self, i): 70 | return self._index2word[i] 71 | 72 | def words2indices(self, words): 73 | return map(self.word2index, words) 74 | 75 | def indices2words(self, indices): 76 | return [self.index2word(i) for i in indices] 77 | 78 | def save(self, path): 79 | """Save SimpleVocab to file path. 80 | 81 | Args: 82 | path (str) 83 | """ 84 | with open(path, 'w') as f: 85 | for word in self._index2word: 86 | f.write(word) 87 | f.write('\n') 88 | 89 | @classmethod 90 | def load(cls, path): 91 | """Load SimpleVocab from file path. 92 | 93 | Args: 94 | path (str) 95 | 96 | Returns: 97 | SimpleVocab 98 | """ 99 | strip_newline = lambda s: s[:-1] 100 | with open(path, 'r') as f: 101 | tokens = [strip_newline(line) for line in f] 102 | return cls(tokens) 103 | 104 | 105 | class WordVocab(SimpleVocab): 106 | """WordVocab. 107 | 108 | IMPORTANT NOTE: WordVocab is blind to casing! All words are converted to lower-case. 109 | 110 | A WordVocab is required to have the following special tokens: UNK, START, STOP. 
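
    Illustrative example (not from the original docstring):

        vocab = WordVocab([WordVocab.UNK, WordVocab.START, WordVocab.STOP, u'the', u'cat'])
        vocab.word2index(u'The')   # 3  (case-insensitive lookup)
        vocab.word2index(u'dog')   # 0  (unknown words map to UNK)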
111 |     """
112 |     UNK = u'<unk>'
113 |     START = u'<start>'
114 |     STOP = u'<stop>'
115 |     SPECIAL_TOKENS = (UNK, START, STOP)
116 | 
117 |     def __init__(self, tokens):
118 |         super(WordVocab, self).__init__([t.lower() for t in tokens])
119 | 
120 |         # make sure all special tokens are present
121 |         for special in self.SPECIAL_TOKENS:
122 |             if special not in self:
123 |                 raise ValueError('All special tokens must be present in tokens. Missing {}'.format(special))
124 | 
125 |     def word2index(self, w):
126 |         """Map a word to an integer.
127 | 
128 |         Automatically lower-cases the word before mapping it.
129 | 
130 |         If the word is not known to the vocab, return the index for UNK.
131 |         """
132 |         sup = super(WordVocab, self)
133 |         try:
134 |             return sup.word2index(w.lower())
135 |         except KeyError:
136 |             return sup.word2index(self.UNK)
137 | 
138 | 
139 | class SimpleEmbeddings(Mapping):
140 |     def __init__(self, array, vocab):
141 |         """Create embeddings object.
142 | 
143 |         Args:
144 |             array (np.array): has shape (vocab_size, embed_dim)
145 |             vocab (SimpleVocab): a Vocab object
146 |         """
147 |         assert len(array.shape) == 2
148 |         assert array.shape[0] == len(vocab)  # entries line up
149 | 
150 |         self.array = array
151 |         self.vocab = vocab
152 | 
153 |     def __contains__(self, w):
154 |         return w in self.vocab
155 | 
156 |     def __getitem__(self, w):
157 |         idx = self.vocab.word2index(w)
158 |         return np.copy(self.array[idx])
159 | 
160 |     def __iter__(self):
161 |         return iter(self.vocab)
162 | 
163 |     def __len__(self):
164 |         return len(self.vocab)
165 | 
166 |     @property
167 |     def embed_dim(self):
168 |         return self.array.shape[1]
169 | 
170 |     @classmethod
171 |     def from_file(cls, file_path, embed_dim, vocab_size=None):
172 |         """Load word embeddings.
173 | 
174 |         Args:
175 |             file_path (str)
176 |             embed_dim (int): expected embed_dim
177 |             vocab_size (int): max # of words in the vocab. If not specified, uses all available vectors in file.
178 |         """
179 |         if vocab_size is None:
180 |             vocab_size = num_lines(file_path)
181 | 
182 |         words = []
183 |         embeds = []
184 |         with codecs.open(file_path, 'r', encoding='utf-8') as f:
185 |             lines = verboserate(f, desc='Loading embeddings from {}'.format(file_path), total=vocab_size)
186 |             for i, line in enumerate(lines):
187 |                 if i == vocab_size: break
188 |                 tokens = line.split()
189 |                 word, embed = tokens[0], np.array([float(tok) for tok in tokens[1:]], dtype=np.float32)
190 |                 if len(embed) != embed_dim:
191 |                     raise ValueError('expected {} dims, got {} dims'.format(embed_dim, len(embed)))
192 |                 words.append(word)
193 |                 embeds.append(embed)
194 | 
195 |         vocab = SimpleVocab(words)
196 |         embed_matrix = np.stack(embeds)
197 |         embed_matrix = embed_matrix.astype(np.float32)
198 |         assert embed_matrix.shape == (vocab_size, embed_dim)
199 |         return cls(embed_matrix, vocab)
200 | 
201 |     def to_file(self, file_path):
202 |         array = self.array
203 |         with codecs.open(file_path, 'w', encoding='utf-8') as f:
204 |             for i, word in enumerate(self.vocab):
205 |                 vec_str = u' '.join(str(x) for x in array[i])
206 |                 f.write(u'{} {}'.format(word, vec_str))
207 |                 f.write('\n')
208 | 
209 |     def with_special_tokens(self, random_seed=0):
210 |         """Return a new SimpleEmbeddings object with special tokens inserted at the front of the vocab.
211 | 
212 |         In the new vocab, special tokens will occupy indices 0, 1, ..., len(special_tokens) - 1.
213 |         The special tokens will have randomly generated embeddings.
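        The random embeddings are drawn (via emulate_distribution below) to match
        the mean and standard deviation of the existing embedding matrix.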
214 | 215 | Args: 216 | random_seed (int) 217 | 218 | Returns: 219 | SimpleEmbeddings 220 | """ 221 | special_tokens = list(WordVocab.SPECIAL_TOKENS) 222 | _, embed_dim = self.array.shape 223 | special_tokens_array_shape = (len(special_tokens), embed_dim) 224 | special_tokens_array = emulate_distribution(special_tokens_array_shape, self.array, seed=random_seed) 225 | special_tokens_array = special_tokens_array.astype(np.float32) 226 | 227 | new_array = np.concatenate((special_tokens_array, self.array), axis=0) 228 | new_vocab = WordVocab(special_tokens + self.vocab.tokens) 229 | 230 | return SimpleEmbeddings(new_array, new_vocab) 231 | 232 | 233 | def emulate_distribution(shape, target_samples, seed=None): 234 | m = np.mean(target_samples) 235 | s = np.std(target_samples) 236 | 237 | with random_seed(seed): 238 | samples = np.random.normal(m, s, size=shape) 239 | 240 | return samples 241 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/graph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter, deque 2 | import numpy as np 3 | import random 4 | 5 | from gtd import utils 6 | 7 | 8 | # defines whether an edge is inverted or not 9 | inverted = lambda r: r[:2] == '**' 10 | invert = lambda r: r[2:] if inverted(r) else '**' + r 11 | 12 | 13 | class Graph(object): 14 | def __init__(self, triples): 15 | self.triples = triples 16 | neighbors = defaultdict(lambda: defaultdict(set)) 17 | relation_args = defaultdict(lambda: defaultdict(set)) 18 | 19 | for s, r, t in triples: 20 | relation_args[r]['s'].add(s) 21 | relation_args[r]['t'].add(t) 22 | neighbors[s][r].add(t) 23 | neighbors[t][invert(r)].add(s) 24 | 25 | def freeze(d): 26 | frozen = {} 27 | for key, subdict in d.iteritems(): 28 | frozen[key] = {} 29 | for subkey, set_val in subdict.iteritems(): 30 | frozen[key][subkey] = tuple(set_val) 31 | return frozen 32 | 33 | # WARNING: both neighbors and relation_args must not have default initialization. 34 | # Default init is dangerous, because we sometimes perform uniform sampling over 35 | # all keys in the dictionary. This distribution will get altered if a user asks about 36 | # entities or relations that weren't present. 37 | 38 | # self.neighbors[start][relation] = (end1, end2, ...) 39 | # self.relation_args[relation][position] = (ent1, ent2, ...) 
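        # (Illustrative, not in the original comments: for triples = [('a', 'r', 'b')],
        #  neighbors['a'] == {'r': ('b',)} and neighbors['b'] == {'**r': ('a',)},
        #  since invert('r') == '**r'.)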
40 | # position is either 's' (domain) or 't' (range) 41 | self.neighbors = freeze(neighbors) 42 | self.relation_args = freeze(relation_args) 43 | self.random_entities = [] 44 | 45 | # cpp_graph = graph_traversal.Graph() 46 | # for s, r, t in triples: 47 | # cpp_graph.add_edge(s, r, t) 48 | # cpp_graph.add_edge(t, invert(r), s) 49 | # self.cpp_graph = cpp_graph 50 | cpp_graph = None 51 | 52 | def shortest_path(self, source, target): 53 | # use breadth-first search 54 | 55 | queue = deque() 56 | explored = {} # stores backpointers 57 | 58 | def enqueue(node, backpointer): 59 | queue.appendleft(node) 60 | explored[node] = backpointer 61 | 62 | def path(node): 63 | current = node 64 | path = [current] 65 | while True: 66 | backpointer = explored[current] 67 | if backpointer: 68 | rel, current = backpointer 69 | path.extend((rel, current)) 70 | else: 71 | break # we've hit the source 72 | return path[::-1] # reverse 73 | 74 | enqueue(source, None) 75 | 76 | while len(queue) != 0: 77 | current = queue.pop() 78 | for rel, nbrs in self.neighbors[current].iteritems(): 79 | for nbr in nbrs: 80 | if nbr not in explored: 81 | enqueue(nbr, (rel, current)) 82 | if nbr == target: 83 | return path(nbr) 84 | 85 | 86 | def random_walk_probs(self, start, path): 87 | return self.cpp_graph.exact_random_walk_probs(start, list(path)) 88 | 89 | def walk_all(self, start, path, positive_branch_factor=float('inf')): 90 | if positive_branch_factor == 0: 91 | return set() 92 | 93 | approx = positive_branch_factor != float('inf') 94 | 95 | if approx: 96 | return set(self.cpp_graph.approx_path_traversal(start, list(path), positive_branch_factor)) 97 | else: 98 | return set(self.cpp_graph.path_traversal(start, list(path))) 99 | 100 | def is_trivial_query(self, start, path): 101 | return self.cpp_graph.is_trivial_query(start, list(path)) 102 | 103 | def type_matching_entities(self, path, position): 104 | if position == 's': 105 | r = path[0] 106 | elif position == 't': 107 | r = path[-1] 108 | else: 109 | raise ValueError(position) 110 | 111 | try: 112 | if not inverted(r): 113 | return self.relation_args[r][position] 114 | else: 115 | inv_pos = 's' if position == 't' else 't' 116 | return self.relation_args[invert(r)][inv_pos] 117 | except KeyError: 118 | # nothing type-matches 119 | return tuple() 120 | 121 | # TODO: test this 122 | def random_walk(self, start, length, no_return=False): 123 | """ 124 | If no_return, the random walk never revisits the same node. Can sometimes return None, None. 125 | """ 126 | max_attempts = 1000 127 | for i in range(max_attempts): 128 | 129 | sampled_path = [] 130 | visited = set() 131 | current = start 132 | for k in range(length): 133 | visited.add(current) 134 | 135 | r = random.choice(self.neighbors[current].keys()) 136 | sampled_path.append(r) 137 | 138 | candidates = self.neighbors[current][r] 139 | 140 | if no_return: 141 | current = utils.sample_excluding(candidates, visited) 142 | else: 143 | current = random.choice(candidates) 144 | 145 | # no viable next step 146 | if current is None: 147 | break 148 | 149 | # failed to find a viable walk. Try again. 150 | if current is None: 151 | continue 152 | 153 | return tuple(sampled_path), current 154 | 155 | return None, None 156 | 157 | def random_walk_constrained(self, start, path): 158 | """ 159 | Warning! Can sometimes return None. 
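        Follows the relations in `path` from `start`, choosing uniformly at random
        among matching neighbors at each step; returns the entity reached, or None
        if `start` is unknown or some relation cannot be followed.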
160 | """ 161 | 162 | # if start node isn't present we can't take this walk 163 | if start not in self.neighbors: 164 | return None 165 | 166 | current = start 167 | for r in path: 168 | rels = self.neighbors[current] 169 | if r not in rels: 170 | # no viable next steps 171 | return None 172 | current = random.choice(rels[r]) 173 | return current 174 | 175 | def random_entity(self): 176 | if len(self.random_entities) == 0: 177 | self.random_entities = list(np.random.choice(self.neighbors.keys(), size=20000, replace=True)) 178 | return self.random_entities.pop() 179 | 180 | def relation_stats(self): 181 | stats = defaultdict(dict) 182 | rel_counts = Counter(r for s, r, t in self.triples) 183 | 184 | for r, args in self.relation_args.iteritems(): 185 | out_degrees, in_degrees = [], [] 186 | for s in args['s']: 187 | out_degrees.append(len(self.neighbors[s][r])) 188 | for t in args['t']: 189 | in_degrees.append(len(self.neighbors[t][invert(r)])) 190 | 191 | domain = float(len(args['s'])) 192 | range = float(len(args['t'])) 193 | out_degree = np.mean(out_degrees) 194 | in_degree = np.mean(in_degrees) 195 | stat = {'avg_out_degree': out_degree, 196 | 'avg_in_degree': in_degree, 197 | 'min_degree': min(in_degree, out_degree), 198 | 'in/out': in_degree / out_degree, 199 | 'domain': domain, 200 | 'range': range, 201 | 'r/d': range / domain, 202 | 'total': rel_counts[r], 203 | 'log(total)': np.log(rel_counts[r]) 204 | } 205 | 206 | # include inverted relation 207 | inv_stat = {'avg_out_degree': in_degree, 208 | 'avg_in_degree': out_degree, 209 | 'min_degree': stat['min_degree'], 210 | 'in/out': out_degree / in_degree, 211 | 'domain': range, 212 | 'range': domain, 213 | 'r/d': domain / range, 214 | 'total': stat['total'], 215 | 'log(total)': stat['log(total)'] 216 | } 217 | 218 | stats[r] = stat 219 | stats[invert(r)] = inv_stat 220 | 221 | return stats -------------------------------------------------------------------------------- /textmorph/turk/turk.py: -------------------------------------------------------------------------------- 1 | import json 2 | from string import Template 3 | 4 | import boto 5 | from boto.mturk.question import Overview, QuestionContent, SelectionAnswer, Question, AnswerSpecification, QuestionForm 6 | 7 | from gtd.turk import Task, get_mturk_connection, standard_quals 8 | from gtd.utils import Config 9 | from textmorph import data 10 | 11 | 12 | """ 13 | To review completed HITs: 14 | - Go to: https://requester.mturk.com/mturk/manageHITs 15 | 16 | To do a HIT: 17 | - Go to: https://worker.mturk.com/ 18 | - Search for "percy liang" 19 | - Click "Accept & Work" 20 | - For some reason, I had trouble viewing these HITs on Google Chrome (invalid URL parameter error). 21 | - On Firefox, things are fine. 
22 | """ 23 | 24 | config = Config.from_file(data.workspace.config) 25 | mtc = get_mturk_connection(config.aws_access_key_id, 26 | config.aws_secret_access_key, sandbox=False) 27 | 28 | 29 | class SimilarityTask(Task): 30 | 31 | def __init__(self, debug): 32 | # load from configuration 33 | conf = Config.from_file(data.workspace.turk.similarity.config.txt) 34 | self.title = conf.title 35 | self.description = conf.description 36 | self.keywords = conf.keywords 37 | self.price = conf.price 38 | self.duration = eval(conf.duration) 39 | self.approval_delay = eval(conf.approval_delay) 40 | 41 | # store form specification as JSON, to be built automatically on launch 42 | with open(data.workspace.turk.similarity.form.json) as form_json: 43 | self.form_json = form_json.read() 44 | 45 | price_per_hit = 0.0 if debug else self.price 46 | 47 | quals = standard_quals(debug) 48 | 49 | hit_type_ids = mtc.register_hit_type(title=self.title, description=self.description, reward=price_per_hit, 50 | duration=self.duration, 51 | keywords=self.keywords, approval_delay=self.approval_delay, qual_req=quals) 52 | hit_type_id = hit_type_ids[0].HITTypeId 53 | 54 | super(SimilarityTask, self).__init__(hit_type_id, mtc) 55 | 56 | def launch(self, data={}): 57 | qf = QuestionForm() 58 | form_json = BotoFormGenerator.inject_data(self.form_json, data) 59 | BotoFormGenerator.from_json(qf, form_json) 60 | return self.create_hit(qf) 61 | 62 | 63 | class CoherenceTask(Task): 64 | 65 | def __init__(self, debug): 66 | # load from configuration 67 | conf = Config.from_file(data.workspace.turk.coherence.config.txt) 68 | self.title = conf.title 69 | self.description = conf.description 70 | self.keywords = conf.keywords 71 | self.price = conf.price 72 | self.duration = eval(conf.duration) 73 | self.approval_delay = eval(conf.approval_delay) 74 | 75 | # store form specification as JSON, to be built automatically on launch 76 | with open(data.workspace.turk.coherence.form.json) as form_json: 77 | self.form_json = form_json.read() 78 | 79 | price_per_hit = 0.0 if debug else self.price 80 | 81 | quals = standard_quals(debug) 82 | 83 | hit_type_ids = mtc.register_hit_type(title=self.title, description=self.description, reward=price_per_hit, 84 | duration=self.duration, 85 | keywords=self.keywords, approval_delay=self.approval_delay, qual_req=quals) 86 | hit_type_id = hit_type_ids[0].HITTypeId 87 | 88 | super(CoherenceTask, self).__init__(hit_type_id, mtc) 89 | 90 | def launch(self, data={}): 91 | qf = QuestionForm() 92 | form_json = BotoFormGenerator.inject_data(self.form_json, data) 93 | BotoFormGenerator.from_json(qf, form_json) 94 | return self.create_hit(qf) 95 | 96 | 97 | class BotoFormGenerator(object): 98 | 99 | form_types = {'Overview', 'QuestionContent', 'SelectionAnswer', 'Question', 'AnswerSpecification', 'QuestionForm', 'FormattedContent'} 100 | 101 | @staticmethod 102 | def from_json(question_form, json_data): 103 | """ 104 | Construct a QuestionForm from a JSON specification 105 | """ 106 | 107 | form_data = json.loads(json_data, strict=False) 108 | 109 | # construct objects and build QuestionForm 110 | for obj_data in form_data['form']: 111 | obj = BotoFormGenerator._from_data(obj_data) 112 | question_form.append(obj) 113 | 114 | @staticmethod 115 | def _from_data(form_data): 116 | """ 117 | Generates and populates boto.mturk.question objects from a specification. 118 | """ 119 | 120 | if type(form_data) is not dict: 121 | return form_data 122 | 123 | """ 124 | Functions for creating form objects. 
125 |         args_dict is a dictionary containing a mapping from names to arguments.
126 |         Positional and keyword arguments pertaining to the particular object
127 |         are extracted from args_dict and passed appropriately to the object
128 |         constructor.
129 | 
130 |         To extend this scheme, add the class name to `form_types` and define a
131 |         matching make_{} function whose `args_` parameter lists the required
132 |         positional arguments; the new type can then be used in the JSON spec.
133 |         """
134 |         def make_args(args_dict, args_):
135 |             # positional arguments
136 |             args = [args_dict[k] for k in args_]
137 |             # keyword arguments
138 |             kwargs = {k: v for k, v in args_dict.iteritems() if k not in args_}
139 |             return args, kwargs
140 | 
141 |         def add_field(obj, field):
142 |             (fl_name, fl_value) = next(field.iteritems())
143 |             obj.append_field(fl_name, fl_value)
144 | 
145 |         def add_append(obj, append):
146 |             obj.append(append)
147 | 
148 |         def make_Overview(args_dict, args_=[]):
149 |             args, kwargs = make_args(args_dict, args_)
150 |             return boto.mturk.question.Overview(*args, **kwargs)
151 | 
152 |         def make_Question(args_dict, args_=['identifier', 'content', 'answer_spec']):
153 |             args, kwargs = make_args(args_dict, args_)
154 |             return boto.mturk.question.Question(*args, **kwargs)
155 | 
156 |         def make_QuestionContent(args_dict, args_=[]):
157 |             args, kwargs = make_args(args_dict, args_)
158 |             return boto.mturk.question.QuestionContent(*args, **kwargs)
159 | 
160 |         def make_SelectionAnswer(args_dict, args_=[]):
161 |             args, kwargs = make_args(args_dict, args_)
162 |             return boto.mturk.question.SelectionAnswer(*args, **kwargs)
163 | 
164 |         def make_AnswerSpecification(args_dict, args_=['spec']):
165 |             args, kwargs = make_args(args_dict, args_)
166 |             return boto.mturk.question.AnswerSpecification(*args, **kwargs)
167 | 
168 |         def make_FormattedContent(args_dict, args_=['content']):
169 |             args, kwargs = make_args(args_dict, args_)
170 |             return boto.mturk.question.FormattedContent(*args, **kwargs)
171 | 
172 |         k, v = next(form_data.iteritems())
173 |         if k in BotoFormGenerator.form_types:
174 |             make_fn = eval("make_{}".format(k))  # constrained: k was just validated against form_types
175 |             args = {}  # constructor arguments; values may themselves be form objects
176 |             # form objects to be appended (Field-type or otherwise)
177 |             fields = []
178 | 
179 |             for arg_k, arg_v in v.iteritems():  # iterate over arguments to the form object
180 |                 # Fields _or_ form objects to be appended (e.g.
181 |                 # FormattedContent)
182 |                 if arg_k == "fields":
183 |                     fields = arg_v
184 |                 else:  # recurse and build form object argument
185 |                     args[arg_k] = BotoFormGenerator._from_data(arg_v)
186 | 
187 |             obj = make_fn(args)
188 |             for fl in fields:
189 |                 fl_k, fl_v = next(fl.iteritems())
190 |                 if fl_k == "field":
191 |                     add_field(obj, fl_v)
192 |                 elif fl_k == "append":
193 |                     ap = BotoFormGenerator._from_data(fl_v)
194 |                     add_append(obj, ap)
195 |             return obj
196 | 
197 |         return None
198 | 
199 |     @staticmethod
200 |     def inject_data(json_data, data):
201 |         """
202 |         Insert data into the JSON format specification.
203 |         This is used to dynamically create forms with different questions using
204 |         the same specification.
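        Placeholders use string.Template syntax: e.g. a "${sentence}" placeholder
        (a made-up field name) would be replaced by data['sentence'].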
205 | """ 206 | return Template(json_data).substitute(**data) 207 | -------------------------------------------------------------------------------- /textmorph/edit_model/attention_decoder.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import numpy as np 4 | import torch 5 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 6 | 7 | from collections import namedtuple 8 | from gtd.ml.torch.attention import Attention, AttentionOutput, DummyAttention 9 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 10 | from gtd.ml.torch.recurrent import gated_update, tile_state 11 | from gtd.ml.torch.utils import GPUVariable 12 | from gtd.utils import UnicodeMixin 13 | from gtd.ml.torch.decoder import RNNContextCombiner 14 | 15 | 16 | class AttentionContextCombiner(RNNContextCombiner): 17 | def __call__(self, encoder_output, x): 18 | return AttentionRNNInput(x=x, agenda=encoder_output.agenda, source_embeds=encoder_output.source_embeds, insert_embeds=encoder_output.insert_embeds, delete_embeds=encoder_output.delete_embeds) 19 | 20 | class AttentionDecoderCell(DecoderCell): 21 | def __init__(self, token_embedder, agenda_dim, decoder_dim, encoder_dim, attn_dim, no_insert_delete_attn, num_layers): 22 | super(AttentionDecoderCell, self).__init__() 23 | 24 | input_dim = token_embedder.embed_dim 25 | self.num_layers = num_layers 26 | 27 | # see definition of `x_augment` in `forward` method 28 | # we augment the input to each RNN layer with 3 attention contexts + the agenda 29 | augment_dim = encoder_dim + input_dim + input_dim + agenda_dim 30 | 31 | self.rnn_cells = [] 32 | for layer in range(num_layers): 33 | in_dim = input_dim if layer == 0 else decoder_dim # first layer takes word vectors 34 | out_dim = decoder_dim 35 | rnn_cell = LSTMCell(in_dim + augment_dim, out_dim) 36 | self.add_module('decoder_layer_{}'.format(layer), rnn_cell) 37 | self.rnn_cells.append(rnn_cell) 38 | 39 | # see definition of `z` in `forward` method 40 | # to predict words, we condition on the hidden state h + 3 attention contexts 41 | z_dim = decoder_dim + encoder_dim + 2 * input_dim 42 | if no_insert_delete_attn: 43 | z_dim = decoder_dim + encoder_dim 44 | 45 | self.vocab_projection_pos = Linear(z_dim, input_dim) # TODO(kelvin): these big params may need regularization 46 | self.vocab_projection_neg = Linear(z_dim, input_dim) 47 | self.relu = torch.nn.ReLU() 48 | 49 | self.h0 = Parameter(torch.zeros(decoder_dim)) 50 | self.c0 = Parameter(torch.zeros(decoder_dim)) 51 | self.vocab_softmax = Softmax() 52 | 53 | self.source_attention = Attention(encoder_dim, decoder_dim, attn_dim) 54 | if not no_insert_delete_attn: 55 | self.insert_attention = Attention(input_dim, decoder_dim, attn_dim) 56 | self.delete_attention = Attention(input_dim, decoder_dim, attn_dim) 57 | else: 58 | self.insert_attention = DummyAttention(input_dim, decoder_dim, attn_dim) 59 | self.delete_attention = DummyAttention(input_dim, decoder_dim, attn_dim) 60 | 61 | self.token_embedder = token_embedder 62 | self.no_insert_delete_attn = no_insert_delete_attn 63 | 64 | def initialize(self, batch_size): 65 | h = tile_state(self.h0, batch_size) 66 | c = tile_state(self.c0, batch_size) 67 | 68 | # no initial weights, context is just zero vector 69 | init_attn = lambda attention: AttentionOutput(None, GPUVariable(torch.zeros(batch_size, attention.memory_dim))) 70 | 71 | return AttentionRNNState([h] * self.num_layers, [c] * self.num_layers, 
init_attn(self.source_attention), 72 | init_attn(self.insert_attention), init_attn(self.delete_attention)) 73 | 74 | def forward(self, rnn_state, decoder_cell_input, advance): 75 | dci = decoder_cell_input 76 | mask = advance 77 | 78 | # this will be concatenated to x at every layer 79 | # we are conditioning on the attention from the previous time step and the agenda from the encoder 80 | x_augment = torch.cat([rnn_state.source_attn.context, 81 | rnn_state.insert_attn.context, 82 | rnn_state.delete_attn.context, 83 | dci.agenda], 1) 84 | 85 | hs, cs = [], [] 86 | x = dci.x # input word vector 87 | for layer in range(self.num_layers): 88 | rnn_cell = self.rnn_cells[layer] 89 | old_h, old_c = rnn_state.hs[layer], rnn_state.cs[layer] 90 | rnn_input = torch.cat([x, x_augment], 1) 91 | h, c = rnn_cell(rnn_input, (old_h, old_c)) 92 | h = gated_update(old_h, h, mask) 93 | c = gated_update(old_c, c, mask) 94 | hs.append(h) 95 | cs.append(c) 96 | 97 | if layer == 0: 98 | x = h # no skip connection on the first layer 99 | else: 100 | x = x + h 101 | 102 | # compute attention using bottom layer 103 | source_attn = self.source_attention(dci.source_embeds, hs[0]) 104 | insert_attn = self.insert_attention(dci.insert_embeds, hs[0]) 105 | delete_attn = self.delete_attention(dci.delete_embeds, hs[0]) 106 | if not self.no_insert_delete_attn: 107 | z = torch.cat([x, source_attn.context, insert_attn.context, delete_attn.context], 1) 108 | else: 109 | z = torch.cat([x, source_attn.context], 1) 110 | 111 | # has shape (batch_size, decoder_dim + encoder_dim + input_dim + input_dim) 112 | 113 | vocab_query_pos = self.vocab_projection_pos(z) 114 | vocab_query_neg = self.vocab_projection_neg(z) 115 | word_vocab = self.token_embedder.vocab 116 | word_embeds = self.token_embedder.embeds 117 | vocab_logit_pos = self.relu(torch.mm(vocab_query_pos, word_embeds.t())) # (batch_size, vocab_size) 118 | vocab_logit_neg = self.relu(torch.mm(vocab_query_neg, word_embeds.t())) # (batch_size, vocab_size) 119 | vocab_probs = self.vocab_softmax(vocab_logit_pos - vocab_logit_neg) 120 | # TODO(kelvin): prevent model from putting probability on UNK 121 | 122 | rnn_state = AttentionRNNState(hs, cs, source_attn, insert_attn, delete_attn) 123 | 124 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) 125 | 126 | def rnn_state_type(self): 127 | return AttentionRNNState 128 | 129 | def rnn_input_type(self): 130 | return AttentionRNNInput 131 | 132 | class AttentionRNNState(namedtuple('AttentionRNNState', ['hs','cs','source_attn','insert_attn','delete_attn']), RNNState): 133 | """ 134 | Attributes: 135 | hs (list[Variable]): a list of the hidden states for each layer of a multi-layer RNN. 136 | Each Variable has shape (batch_size, hidden_dim). 137 | cs (list[Variable]): a list of the cell states for each layer of a multi-layer RNN 138 | Each Variable has shape (batch_size, hidden_dim). 
139 | source_attn (AttentionOutput) 140 | insert_attn (AttentionOutput) 141 | delete_attn (AttentionOutput) 142 | """ 143 | pass 144 | 145 | class AttentionRNNInput(namedtuple('AttentionRNNInput', ['x','agenda','source_embeds','insert_embeds','delete_embeds']), RNNInput): 146 | """ 147 | Attributes: 148 | x (Variable): of shape (batch_size, word_dim), embedding of word generated at previous time step 149 | agenda (Variable): of shape (batch_size, agenda_dim) 150 | source_embeds (SequenceBatch): of shape (batch_size, source_seq_length, hidden_size) 151 | insert_embeds (SequenceBatch): of shape (batch_size, max_edits, embed_dim) 152 | delete_embeds (SequenceBatch): of shape (batch_size, max_edits, embed_dim) 153 | """ 154 | pass 155 | 156 | 157 | 158 | class AttentionTrace(UnicodeMixin): 159 | __slots__ = ['name', 'tokens', 'attention_weights'] 160 | 161 | def __init__(self, name, tokens, attention_weights): 162 | """Construct AttentionTrace. 163 | 164 | Args: 165 | name (unicode): name of attention mechanism 166 | tokens (list[unicode]) 167 | attention_weights (np.ndarray): a 1D array. May be longer than len(tokens) due to batching. 168 | """ 169 | assert len(attention_weights.shape) == 1 170 | 171 | # any attention weights exceeding length of tokens should be zero 172 | for i in range(len(tokens), len(attention_weights)): 173 | assert attention_weights[i] == 0 174 | 175 | self.name = name 176 | self.tokens = tokens 177 | self.attention_weights = attention_weights 178 | 179 | def __unicode__(self): 180 | total_mass = np.sum(self.attention_weights) 181 | s = u' '.join(u'{}[{:.2f}]'.format(t, w) for t, w in izip(self.tokens, self.attention_weights)) 182 | return u'{:10}[{:.2f}]: {}'.format(self.name, total_mass, s) 183 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | 4 | import numpy as np 5 | import pytest 6 | import tensorflow as tf 7 | from numpy.testing import assert_array_equal, assert_array_almost_equal 8 | from tensorflow.python.framework.errors import InvalidArgumentError 9 | 10 | from gtd.ml.tf.utils import TensorDebugger, clean_session, expand_dims_for_broadcast, broadcast, Saver, \ 11 | guarantee_initialized_variables, gather_2d 12 | from gtd.ml.tf.tests.test_framework import clean_test_session 13 | 14 | 15 | class TestTensorDebugger(TestCase): 16 | def test_tensor_debugger_deps(self): 17 | tdb = TensorDebugger() 18 | 19 | x = tf.constant(3, name='x') 20 | z = tf.mul(x, 3, name='z') 21 | with tf.control_dependencies([x]): 22 | y = tf.constant(8, name='y') 23 | 24 | deps = tdb.dependency_graph 25 | 26 | # control dependencies depend on x's output 27 | self.assertEqual(deps['y'], {'x:0'}) 28 | 29 | # each output depends on its op 30 | self.assertEqual(deps['y:0'], {'y'}) 31 | 32 | # downstream ops depend on the output of earlier ops 33 | self.assertTrue('x:0' in deps['z']) 34 | 35 | def test_tensor_debugger_multiple(self): 36 | tdb = TensorDebugger() 37 | 38 | x = tf.constant([1, 2]) 39 | tdb.register('x', x) 40 | zs = [] 41 | for k in range(3): 42 | y = tf.constant(k) 43 | z = tf.reduce_sum(x * y) 44 | # register multiple nodes under the same name 45 | tdb.register('y', y) 46 | zs.append(z) 47 | 48 | # 0, (1 + 2), (2 + 4) 49 | final = tf.pack(zs) 50 | 51 | with tf.Session() as sess: 52 | results, bp_results = tdb.debug(sess, final, {}) 53 | 54 | def test(a, b): 55 | 
self.assertTrue(np.array_equal(a, b)) 56 | 57 | # result correctly passed back 58 | test(results, [0, 3, 6]) 59 | # values in for loop accumulated as list 60 | test(bp_results['y'], [0, 1, 2]) 61 | 62 | def test_tensor_debugger_exec_path(self): 63 | tdb = TensorDebugger() 64 | 65 | x = tf.constant(5, name='x') 66 | y = tf.placeholder(tf.int32, name='y') 67 | 68 | z = tf.mul(x, y, 'z') 69 | w = tf.constant(4, name='w') 70 | 71 | f = tf.mul(z, w, 'f') 72 | g = tf.constant(3, name='g') 73 | 74 | with tf.control_dependencies([f]): 75 | h = tf.constant(11, name='h') 76 | 77 | # don't register x 78 | tdb.register('y', y) 79 | tdb.register('z', z) 80 | tdb.register('w', w) 81 | tdb.register('f', f) 82 | tdb.register('g', g, force_run=True) 83 | tdb.register('h', h) 84 | 85 | with tf.Session() as sess: 86 | result, bp_results = tdb.debug(sess, f, {y: 2}) 87 | # result is a single value, not a list 88 | self.assertEqual(result, 40) 89 | # excludes x, because not registered. excludes h, because not on execution path. 90 | # includes g, because of force_run 91 | self.assertEqual(bp_results, {'y': 2, 'z': 10, 'w': 4, 'g': 3}) 92 | 93 | results, bp_results = tdb.debug(sess, [h, g], {y: 2}) 94 | # returns a list 95 | self.assertEqual(results, [11, 3]) 96 | # includes y, z, w and f because h depends on them through control_dependencies 97 | # includes g because of force_run 98 | self.assertEqual(bp_results, {'y': 2, 'z': 10, 'f': 40, 'w': 4, 'g': 3}) 99 | 100 | 101 | def test_expand_dims_for_broadcast(): 102 | with clean_session(): 103 | arr = tf.constant([ 104 | [ 105 | [1, 2, 3], 106 | [4, 5, 6], 107 | [4, 5, 6], 108 | ], 109 | [ 110 | [1, 2, 3], 111 | [4, 5, 6], 112 | [4, 5, 6], 113 | ], 114 | ], dtype=tf.float32) 115 | weights = tf.constant([1, 2], dtype=tf.float32) 116 | 117 | assert arr.get_shape().as_list() == [2, 3, 3] 118 | assert weights.get_shape().as_list() == [2] 119 | 120 | new_weights = expand_dims_for_broadcast(weights, arr) 121 | assert new_weights.eval().shape == (2, 1, 1) 122 | 123 | bad_weights = tf.constant([1, 2, 3], dtype=tf.float32) 124 | bad_new_weights = expand_dims_for_broadcast(bad_weights, arr) 125 | 126 | with pytest.raises(InvalidArgumentError): 127 | bad_new_weights.eval() 128 | 129 | 130 | class TestGather2D(object): 131 | @pytest.fixture 132 | def x(self): 133 | x = tf.constant([ 134 | [[1, 2], [2, 2], [3, 3]], 135 | [[4, 5], [5, 4], [6, 6]], 136 | [[7, 7], [8, 7], [9, 9]], 137 | [[0, 8], [1, 1], [2, 2]] 138 | ], dtype=tf.int32) 139 | return x 140 | 141 | @pytest.mark.usefixtures('clean_test_session') 142 | def test(self, x): 143 | i = tf.constant([[0, 2], 144 | [3, 0]], 145 | dtype=tf.int32) 146 | j = tf.constant([[1, 1], 147 | [0, 2]], 148 | dtype=tf.int32) 149 | vals = gather_2d(x, i, j) 150 | 151 | correct = np.array([ 152 | [[2, 2], [8, 7]], 153 | [[0, 8], [3, 3]], 154 | ], dtype=np.int32) 155 | 156 | assert_array_almost_equal(correct, vals.eval()) 157 | 158 | assert vals.get_shape().as_list() == [2, 2, 2] 159 | 160 | @pytest.mark.usefixtures('clean_test_session') 161 | def test_broadcast(self, x): 162 | i = tf.constant([[0, 2], 163 | [3, 0]], 164 | dtype=tf.int32) 165 | j = tf.constant([[1, 2]], dtype=tf.int32) # needs to be broadcast up 166 | vals = gather_2d(x, i, j) 167 | 168 | correct = np.array([ 169 | [[2, 2], [9, 9]], 170 | [[1, 1], [3, 3]], 171 | ], dtype=np.int32) 172 | 173 | assert_array_almost_equal(correct, vals.eval()) 174 | 175 | 176 | def test_broadcast(): 177 | with clean_session(): 178 | values = tf.constant([ 179 | [ 180 | [1, 2], 181 | [1, 2], 
182 |             ],
183 |             [
184 |                 [1, 2],
185 |                 [3, 4],
186 |             ],
187 |             [
188 |                 [5, 6],
189 |                 [7, 8],
190 |             ]
191 |         ], dtype=tf.float32)
192 | 
193 |         mask = tf.constant([
194 |             [1, 0],
195 |             [1, 1],
196 |             [0, 1],
197 |         ], dtype=tf.float32)
198 | 
199 |         correct = np.array([
200 |             [
201 |                 [1, 1],
202 |                 [0, 0],
203 |             ],
204 |             [
205 |                 [1, 1],
206 |                 [1, 1],
207 |             ],
208 |             [
209 |                 [0, 0],
210 |                 [1, 1],
211 |             ]
212 |         ], dtype=np.float32)
213 | 
214 |         assert values.get_shape().as_list() == [3, 2, 2]
215 |         assert mask.get_shape().as_list() == [3, 2]
216 | 
217 |         mask = expand_dims_for_broadcast(mask, values)
218 |         assert mask.get_shape().as_list() == [3, 2, 1]
219 | 
220 |         mask = broadcast(mask, values)
221 |         assert mask.get_shape().as_list() == [3, 2, 2]
222 | 
223 |         mask_val = mask.eval()
224 | 
225 |         assert_array_equal(mask_val, correct)
226 | 
227 | 
228 | class TestSaver(object):
229 |     @pytest.fixture
230 |     def v(self):
231 |         return tf.get_variable('v', shape=[], initializer=tf.constant_initializer(5))
232 | 
233 |     @pytest.mark.usefixtures('clean_test_session')
234 |     def test_restore(self, tmpdir, v):
235 |         save_100_path = str(tmpdir.join('weights-100'))
236 |         save_10_path = str(tmpdir.join('weights-10'))
237 | 
238 |         saver = Saver(str(tmpdir))
239 |         assign_op = tf.assign(v, 12)
240 | 
241 |         sess = tf.get_default_session()
242 |         guarantee_initialized_variables(sess)
243 | 
244 |         assert v.eval() == 5
245 |         saver.save(100)  # save as step 100
246 | 
247 |         sess.run(assign_op)
248 |         assert v.eval() == 12
249 |         saver.save(10)  # save as step 10
250 | 
251 |         saver.restore()  # restores from the largest step number by default (100)
252 |         assert v.eval() == 5  # restored
253 | 
254 |         saver.restore(10)  # force restore of step 10
255 |         assert v.eval() == 12
256 | 
257 |         saver.restore(save_100_path)  # can also restore directly from a path
258 |         assert v.eval() == 5
259 | 
260 |         # latest should be the largest step number, not necessarily last saved
261 |         assert saver.latest_checkpoint == save_100_path
262 |         assert os.path.exists(save_100_path)
263 | 
264 |         assert saver.checkpoint_paths == {
265 |             10: save_10_path,
266 |             100: save_100_path,
267 |         }
--------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_seq_batch.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | import torch
4 | from gtd.ml.torch.utils import GPUVariable
5 | from gtd.ml.torch.utils import assert_tensor_equal
6 | 
7 | from gtd.ml.torch.seq_batch import SequenceBatch, SequenceBatchElement
8 | from gtd.ml.vocab import SimpleVocab
9 | 
10 | 
11 | class TestSequenceBatch(object):
12 |     @pytest.fixture
13 |     def sequences(self):
14 |         return [
15 |             ['a', 'b', 'b', 'c'],
16 |             ['c'],
17 |             [],
18 |         ]
19 | 
20 |     @pytest.fixture
21 |     def vocab(self):
22 |         return SimpleVocab(['<pad>', 'a', 'b', 'c', '<start>', '<stop>'])  # index 0 is the padding token
23 | 
24 |     def test_from_sequences(self, sequences, vocab):
25 |         seq_batch = SequenceBatch.from_sequences(sequences, vocab)
26 | 
27 |         assert_tensor_equal(seq_batch.values,
28 |                             np.array([
29 |                                 [1, 2, 2, 3],
30 |                                 [3, 0, 0, 0],
31 |                                 [0, 0, 0, 0],
32 |                             ], dtype=np.int32))
33 | 
34 |         assert_tensor_equal(seq_batch.mask,
35 |                             np.array([
36 |                                 [1, 1, 1, 1],
37 |                                 [1, 0, 0, 0],
38 |                                 [0, 0, 0, 0],
39 |                             ], dtype=np.float32))
40 | 
41 |     def test_min_seq_length(self, vocab):
42 |         seq_batch = SequenceBatch.from_sequences([[], [], []], vocab, min_seq_length=2)
43 |         assert_tensor_equal(seq_batch.values, np.zeros((3, 2)))
44 |         assert_tensor_equal(seq_batch.mask, np.zeros((3, 2)))
45 | 
46 |     def
test_mask_validation(self): 47 | mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 0], 48 | [1, 1, 0, 0], 49 | [1, 1, 1, 0]])) 50 | 51 | values = mask # just set values = mask, since it doesn't matter 52 | 53 | # should not raise any errors 54 | SequenceBatch(values, mask) 55 | 56 | non_binary_mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 0], 57 | [1, 1.2, 0, 0], 58 | [1, 1, 1, 0]])) 59 | 60 | with pytest.raises(ValueError): 61 | SequenceBatch(mask, non_binary_mask) 62 | 63 | non_left_justified_mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 1], 64 | [1, 1, 0, 0], 65 | [1, 1, 1, 0]])) 66 | 67 | with pytest.raises(ValueError): 68 | SequenceBatch(mask, non_left_justified_mask) 69 | 70 | def test_split(self): 71 | input_embeds = GPUVariable(torch.LongTensor([ 72 | # batch item 1 73 | [ 74 | [1, 2], [2, 3], [5, 6] 75 | ], 76 | # batch item 2 77 | [ 78 | [4, 8], [3, 5], [0, 0] 79 | ], 80 | ])) 81 | 82 | input_mask = GPUVariable(torch.FloatTensor([ 83 | [1, 1, 1], 84 | [1, 1, 0], 85 | ])) 86 | 87 | sb = SequenceBatch(input_embeds, input_mask) 88 | 89 | elements = sb.split() 90 | input_list = [e.values for e in elements] 91 | mask_list = [e.mask for e in elements] 92 | 93 | assert len(input_list) == 3 94 | assert_tensor_equal(input_list[0], [[1, 2], [4, 8]]) 95 | assert_tensor_equal(input_list[1], [[2, 3], [3, 5]]) 96 | assert_tensor_equal(input_list[2], [[5, 6], [0, 0]]) 97 | 98 | assert len(mask_list) == 3 99 | assert_tensor_equal(mask_list[0], [[1], [1]]) 100 | assert_tensor_equal(mask_list[1], [[1], [1]]) 101 | assert_tensor_equal(mask_list[2], [[1], [0]]) 102 | 103 | def test_cat(self): 104 | x1 = SequenceBatchElement( 105 | GPUVariable(torch.FloatTensor([ 106 | [[1, 2], [3, 4]], 107 | [[8, 2], [9, 0]]])), 108 | GPUVariable(torch.FloatTensor([ 109 | [1], 110 | [1] 111 | ]))) 112 | x2 = SequenceBatchElement( 113 | GPUVariable(torch.FloatTensor([ 114 | [[-1, 20], [3, 40]], 115 | [[-8, 2], [9, 10]]])), 116 | GPUVariable(torch.FloatTensor([ 117 | [1], 118 | [0] 119 | ]))) 120 | x3 = SequenceBatchElement( 121 | GPUVariable(torch.FloatTensor([ 122 | [[-1, 20], [3, 40]], 123 | [[-8, 2], [9, 10]]])), 124 | GPUVariable(torch.FloatTensor([ 125 | [0], 126 | [0] 127 | ]))) 128 | 129 | result = SequenceBatch.cat([x1, x2, x3]) 130 | 131 | assert_tensor_equal(result.values, 132 | [ 133 | [[[1, 2], [3, 4]], [[-1, 20], [3, 40]], [[-1, 20], [3, 40]]], 134 | [[[8, 2], [9, 0]], [[-8, 2], [9, 10]], [[-8, 2], [9, 10]]], 135 | ]) 136 | 137 | assert_tensor_equal(result.mask, 138 | [ 139 | [1, 1, 0], 140 | [1, 0, 0] 141 | ]) 142 | 143 | @pytest.fixture 144 | def some_seq_batch(self): 145 | values = GPUVariable(torch.FloatTensor([ 146 | [[1, 2], [4, 5], [4, 4]], 147 | [[0, 4], [43, 5], [-1, 20]], 148 | [[-1, 20], [43, 5], [0, 0]], 149 | ])) 150 | mask = GPUVariable(torch.FloatTensor([ 151 | [1, 1, 0], 152 | [1, 0, 0], 153 | [0, 0, 0], 154 | ])) 155 | return SequenceBatch(values, mask) 156 | 157 | def test_weighted_sum(self, some_seq_batch): 158 | weights = GPUVariable(torch.FloatTensor([ 159 | [0.5, 0.3, 0], 160 | [0.8, 0.2, 0], 161 | [0, 0, 0], 162 | ])) 163 | result = SequenceBatch.weighted_sum(some_seq_batch, weights) 164 | 165 | # [1, 2] * 0.5 + [4, 5] * 0.3 = [0.5 + 1.2, 1 + 1.5] = [1.7, 2.5] 166 | # [0, 4] * 0.8 = [0, 3.2] 167 | # 0 168 | 169 | # Weights on entries where mask[i, j] = 0 get ignored, as desired. 
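        # (The third row's mask is all zeros, so its weighted sum is the zero vector.)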
170 | assert_tensor_equal(result, [ 171 | [1.7, 2.5], 172 | [0, 3.2], 173 | [0, 0], 174 | ]) 175 | 176 | def test_reduce_sum(self, some_seq_batch): 177 | result = SequenceBatch.reduce_sum(some_seq_batch) 178 | 179 | assert_tensor_equal(result, [ 180 | [5, 7], 181 | [0, 4], 182 | [0, 0], 183 | ]) 184 | 185 | def test_reduce_mean(self, some_seq_batch): 186 | result = SequenceBatch.reduce_mean(some_seq_batch, allow_empty=True) 187 | 188 | assert_tensor_equal(result, [ 189 | [2.5, 3.5], 190 | [0, 4], 191 | [0, 0] 192 | ]) 193 | 194 | with pytest.raises(ValueError): 195 | SequenceBatch.reduce_mean(some_seq_batch, allow_empty=False) 196 | 197 | def test_reduce_prod(self, some_seq_batch): 198 | result = SequenceBatch.reduce_prod(some_seq_batch) 199 | assert_tensor_equal(result, [ 200 | [4, 10], 201 | [0, 4], 202 | [1, 1] 203 | ]) 204 | 205 | def test_reduce_max(self, some_seq_batch): 206 | 207 | with pytest.raises(ValueError): 208 | # should complain about empty sequence 209 | SequenceBatch.reduce_max(some_seq_batch) 210 | 211 | values = GPUVariable(torch.FloatTensor([ 212 | [[1, 2], [4, 5], [4, 4]], # actual max is in later elements, but shd be suppressed by mask 213 | [[0, -4], [43, -5], [-1, -20]], # note that all elements in 2nd dim are negative 214 | ])) 215 | mask = GPUVariable(torch.FloatTensor([ 216 | [1, 0, 0], 217 | [1, 1, 0], 218 | ])) 219 | seq_batch = SequenceBatch(values, mask) 220 | result = SequenceBatch.reduce_max(seq_batch) 221 | 222 | assert_tensor_equal(result, [ 223 | [1, 2], 224 | [43, -4], 225 | ]) 226 | 227 | def test_embed(self): 228 | sequences = [ 229 | [], 230 | [1, 2, 3], 231 | [3, 3], 232 | [2] 233 | ] 234 | 235 | vocab = SimpleVocab([0, 1, 2, 3, 4]) 236 | indices = SequenceBatch.from_sequences(sequences, vocab) 237 | 238 | embeds = GPUVariable(torch.FloatTensor([ 239 | [0, 0], 240 | [2, 2], # 1 241 | [3, 4], # 2 242 | [-10, 1], # 3 243 | [11, -1] # 4 244 | ])) 245 | 246 | embedded = SequenceBatch.embed(indices, embeds) 247 | 248 | correct = np.array([ 249 | [[0, 0], [0, 0], [0, 0]], 250 | [[2, 2], [3, 4], [-10, 1]], 251 | [[-10, 1], [-10, 1], [0, 0]], 252 | [[3, 4], [0, 0], [0, 0]] 253 | ], dtype=np.float32) 254 | assert_tensor_equal(embedded.values, correct) --------------------------------------------------------------------------------