├── gtd
├── textmorph
│   ├── __init__.py
│   ├── edit_model
│   │   ├── __init__.py
│   │   ├── agenda.py
│   │   ├── main.py
│   │   ├── edit_noiser.py
│   │   ├── edit_encoder.py
│   │   └── attention_decoder.py
│   ├── language_model
│   │   ├── __init__.py
│   │   └── main.py
│   ├── turk
│   │   ├── similarity
│   │   │   └── config.txt
│   │   ├── coherence
│   │   │   └── config.txt
│   │   └── turk.py
│   └── data.py
├── .gitignore
├── third-party
│   └── gtd
│       ├── gtd
│       │   ├── __init__.py
│       │   ├── ml
│       │   │   ├── __init__.py
│       │   │   ├── tf
│       │   │   │   ├── __init__.py
│       │   │   │   ├── tests
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── test_framework.py
│       │   │   │   │   └── test_utils.py
│       │   │   │   ├── training_run.py
│       │   │   │   └── profile.py
│       │   │   ├── tests
│       │   │   │   ├── __init__.py
│       │   │   │   ├── test_vocab.py
│       │   │   │   └── test_utils.py
│       │   │   ├── torch
│       │   │   │   ├── __init__.py
│       │   │   │   ├── tests
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── test_recurrent.py
│       │   │   │   │   ├── test_utils.py
│       │   │   │   │   ├── test_token_embedder.py
│       │   │   │   │   ├── test_alignments.py
│       │   │   │   │   ├── test_source_encoder.py
│       │   │   │   │   ├── test_attention.py
│       │   │   │   │   └── test_seq_batch.py
│       │   │   │   ├── recurrent.py
│       │   │   │   ├── feed_forward.py
│       │   │   │   ├── simple_decoder_cell.py
│       │   │   │   ├── training_run.py
│       │   │   │   ├── decoder_cell.py
│       │   │   │   ├── multilayered_decoder_cell.py
│       │   │   │   ├── alignments.py
│       │   │   │   ├── token_embedder.py
│       │   │   │   ├── checkpoints.py
│       │   │   │   ├── attention.py
│       │   │   │   ├── utils.py
│       │   │   │   └── source_encoder.py
│       │   │   ├── utils.py
│       │   │   ├── training_run.py
│       │   │   ├── training_run_viewer.py
│       │   │   └── vocab.py
│       │   ├── tests
│       │   │   ├── __init__.py
│       │   │   ├── test_graph.py
│       │   │   ├── test_io.py
│       │   │   ├── test_log.py
│       │   │   ├── test_lm.py
│       │   │   └── test_utils.py
│       │   ├── git_utils.py
│       │   ├── plot.py
│       │   ├── text.py
│       │   ├── profile_imports.py
│       │   ├── log.py
│       │   ├── codalab.py
│       │   └── graph.py
│       ├── .gitignore
│       ├── requirements.txt
│       ├── setup.py
│       └── scripts
│           ├── git_logs.py
│           ├── run_nlpsub.py
│           └── run_docker.py
├── config.json
├── configs
│   ├── optim
│   │   ├── debug.txt
│   │   └── default.txt
│   ├── eval
│   │   ├── debug.txt
│   │   ├── short.txt
│   │   └── default.txt
│   ├── language_model
│   │   ├── default.txt
│   │   └── onebil.txt
│   └── edit_model
│       ├── autogen.sh
│       ├── edit_test.txt
│       ├── edit_onebil.txt
│       ├── edit_baseline.txt
│       └── edit_logp.txt
├── README.md
├── Dockerfile
└── run_docker.py
/gtd: -------------------------------------------------------------------------------- 1 | third-party/gtd/gtd --------------------------------------------------------------------------------
/textmorph/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .idea --------------------------------------------------------------------------------
/textmorph/edit_model/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/textmorph/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/__init__.py: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/tf/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-party/gtd/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.ipynb 3 | *.pyc 4 | .cache 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | {"docker_image": "kelvinguu/textmorph:1.2", "data_env_var": "TEXTMORPH_DATA"} 2 | -------------------------------------------------------------------------------- /configs/optim/debug.txt: -------------------------------------------------------------------------------- 1 | optim { 2 | seed = 0 # random seed 3 | learning_rate = 0.001 4 | batch_size = 32 # examples per batch 5 | max_iters = 1500 # max number of mini-batch steps to take 6 | } -------------------------------------------------------------------------------- /configs/optim/default.txt: -------------------------------------------------------------------------------- 1 | optim { 2 | seed = 0 # random seed 3 | learning_rate = 0.001 4 | batch_size = 128 # examples per batch 5 | max_iters = 400000 # max number of mini-batch steps to take 6 | } -------------------------------------------------------------------------------- /configs/eval/debug.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 4 # number of examples to periodically evaluate on 3 | big_num_examples = 8 4 | eval_steps = 10 5 | big_eval_steps = 20 6 | save_steps = 50 7 | alive_steps = 5 8 | } -------------------------------------------------------------------------------- /configs/eval/short.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 32 # number of examples to periodically evaluate on 3 | big_num_examples = 512 4 | eval_steps = 500 5 | big_eval_steps = 5000 6 | save_steps = 5000 7 | alive_steps = 30 8 | } 9 | -------------------------------------------------------------------------------- /configs/eval/default.txt: -------------------------------------------------------------------------------- 1 | eval { 2 | num_examples = 32 # number of examples to periodically evaluate on 3 | big_num_examples = 128 4 | eval_steps = 500 5 | big_eval_steps = 5000 6 | save_steps = 5000 7 | alive_steps = 30 8 | } 9 
| -------------------------------------------------------------------------------- /textmorph/turk/similarity/config.txt: -------------------------------------------------------------------------------- 1 | title = "Similarity Task" 2 | description = "Determine similarity of sentences" 3 | keywords = "sentence, similarity" 4 | price = 0.20 # default 5 | duration = 60 * 60 # 60 minutes per HIT 6 | approval_delay = 3600 * 24 * 7 # 7 days for auto-approval 7 | form_json = "form.json" 8 | -------------------------------------------------------------------------------- /third-party/gtd/requirements.txt: -------------------------------------------------------------------------------- 1 | line_profiler==1.0 2 | matplotlib==1.4.3 3 | numpy==1.11.0 4 | psycopg2==2.6.1 5 | pytest==2.9.2 6 | spacy==0.99 7 | SQLAlchemy==1.1.0b3 8 | tensorflow==0.8.0 9 | ipython==5.1.0 10 | scipy==0.18.0 11 | faulthandler==2.4 12 | futures==3.0.5 13 | jsonpickle==0.9.2 14 | fabric==1.12.0 15 | -------------------------------------------------------------------------------- /textmorph/turk/coherence/config.txt: -------------------------------------------------------------------------------- 1 | title = "Coherence Task" 2 | description = "Determine the coherence and grammaticality of sentences" 3 | keywords = "sentence, coherence, grammar" 4 | price = 0.40 # default 5 | duration = 60 * 60 # 60 minutes per HIT 6 | approval_delay = 3600 * 24 * 7 # 7 days for auto-approval 7 | form_json = "form.json" 8 | -------------------------------------------------------------------------------- /textmorph/language_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from gtd.utils import Config 4 | from textmorph.language_model.training_run import LMTrainingRuns 5 | 6 | 7 | arg_parser = argparse.ArgumentParser() 8 | arg_parser.add_argument('config_path') 9 | args = arg_parser.parse_args() 10 | 11 | runs = LMTrainingRuns() 12 | config = Config.from_file(args.config_path) 13 | run = runs.new(config) 14 | 15 | run.train() -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/training_run.py: -------------------------------------------------------------------------------- 1 | from gtd.ml.training_run import TrainingRun 2 | from gtd.utils import cached_property 3 | 4 | 5 | class TFTrainingRun(TrainingRun): 6 | def __init__(self, config, save_dir): 7 | super(TFTrainingRun, self).__init__(config, save_dir) 8 | 9 | @cached_property 10 | def saver(self): 11 | from gtd.ml.tf.utils import Saver 12 | return Saver(self.workspace.checkpoints, keep_checkpoint_every_n_hours=5) -------------------------------------------------------------------------------- /configs/language_model/default.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | model { 5 | vocab_size = 10000 6 | word_dim = 300 7 | agenda_dim = 100 8 | hidden_dim = 100 9 | num_layers = 3 10 | kl_weight_steps = 50000 11 | kl_weight_rate = 8 12 | kl_weight_cap = 1.0 13 | dci_keep_rate = 0.8 14 | wvec_path = glove.6B.300d_yelp.txt 15 | type = 0 # 0 = language model, 1 = SVAE 16 | } 17 | 18 | dataset { 19 | path = yelp_dataset_static 20 | } 21 | -------------------------------------------------------------------------------- /configs/language_model/onebil.txt: -------------------------------------------------------------------------------- 1 | include 
"../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | model { 5 | vocab_size = 10000 6 | word_dim = 300 7 | agenda_dim = 100 8 | hidden_dim = 100 9 | num_layers = 3 10 | kl_weight_steps = 50000 11 | kl_weight_rate = 8 12 | kl_weight_cap = 1.0 13 | dci_keep_rate = 0.8 14 | wvec_path = glove.6B.300d_onebil.txt 15 | type = 0 # 0 = language model, 1 = SVAE 16 | } 17 | 18 | dataset { 19 | path = onebil_dataset_static 20 | } 21 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/git_utils.py: -------------------------------------------------------------------------------- 1 | import git 2 | 3 | def commit_diff(c): 4 | """Return the set of changed files. 5 | 6 | Args: 7 | c (git.Commit) 8 | 9 | Returns: 10 | set[str]: a set of file paths (relative to the git repo's root directory). 11 | """ 12 | changed = set() 13 | 14 | def add_path(blob): 15 | if blob is not None: 16 | changed.add(blob.path) 17 | 18 | prev_c = c.parents[0] 19 | for x in c.diff(prev_c): 20 | add_path(x.a_blob) 21 | add_path(x.b_blob) 22 | return changed -------------------------------------------------------------------------------- /configs/edit_model/autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | options=("attend_pr = 1.0" "lamb_reg = 10.0" "lamb_reg = 25.0" "lamb_reg = 100.0" "edit_dim = 128" "edit_dim = 512" "norm_eps = 0.01" "norm_eps = 0.5" "norm_eps = 1.0" "kill_edit = True") 4 | 5 | 6 | arraylen=${#options[@]} 7 | #for opt in "${options[@]}" 8 | for (( i=0; i<${arraylen}; i++ )); 9 | do 10 | echo $i 11 | echo "include \"edit_baseline.txt\" 12 | editor{ 13 | "${options[$i]}" 14 | }" > configs/edit_model/tmp$i 15 | ./nlpsub.py -g 1 -n testruns 'python textmorph/edit_model/main.py configs/edit_model/tmp'$i 16 | done 17 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_graph.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from gtd.graph import Graph 4 | 5 | 6 | class TestGraph(TestCase): 7 | 8 | def test_shortest_path(self): 9 | 10 | triples = [ 11 | ('1', '2', '3'), 12 | ('3', '4', '5'), 13 | ('1', '0', '5'), 14 | ] 15 | self.assertEqual( 16 | Graph(triples).shortest_path('1', '5'), 17 | ['1', '0', '5'] 18 | ) 19 | self.assertEqual( 20 | Graph(triples[:2]).shortest_path('1', '5'), 21 | ['1', '2', '3', '4', '5'] 22 | ) 23 | -------------------------------------------------------------------------------- /third-party/gtd/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from distutils.core import setup, Command 3 | 4 | 5 | class Test(Command): 6 | user_options = [] 7 | 8 | def initialize_options(self): 9 | pass 10 | 11 | def finalize_options(self): 12 | pass 13 | 14 | def run(self): 15 | import subprocess 16 | errno = subprocess.call(['py.test', '-v', '--doctest-modules', 'gtd']) 17 | raise SystemExit(errno) 18 | 19 | 20 | setup(name='gtd', 21 | version='1.0', 22 | packages=['gtd'], 23 | description='Get things done.', 24 | cmdclass={'test': Test}, 25 | ) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/test_vocab.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from gtd.ml.vocab import SimpleVocab, SimpleEmbeddings 5 | 6 | 7 | 
@pytest.fixture 8 | def vocab(): 9 | return SimpleVocab(['a', 'b', 'c']) 10 | 11 | 12 | @pytest.fixture 13 | def embeds(vocab): 14 | array = np.eye(len(vocab)) 15 | return SimpleEmbeddings(array, vocab) 16 | 17 | 18 | class TestSimpleVocab(object): 19 | def test_save_load(self, vocab, tmpdir): 20 | path = str(tmpdir.join('vocab.txt')) 21 | vocab.save(path) 22 | new_vocab = SimpleVocab.load(path) 23 | assert vocab == new_vocab --------------------------------------------------------------------------------
/textmorph/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gtd.io import Workspace 3 | 4 | 5 | # Set location of local data directory from environment variable 6 | env_var = 'TEXTMORPH_DATA' 7 | if env_var not in os.environ: 8 | assert False, env_var + ' environment variable must be set.' 9 | root = os.environ[env_var] 10 | 11 | # define workspace 12 | workspace = Workspace(root) 13 | 14 | # config 15 | workspace.add_file('config', 'config.txt') 16 | 17 | # Training runs 18 | workspace.add_dir('edit_runs', 'edit_runs') 19 | workspace.add_dir('lm_runs', 'lm_runs') 20 | workspace.add_dir('retriever_runs', 'retriever_runs') 21 | 22 | # user IDs 23 | workspace.add_file('user_ids', 'user_ids.json') 24 | 25 | # word vectors 26 | workspace.add_dir('word_vectors', 'word_vectors') 27 | 28 | # nearest neighbors 29 | workspace.add_dir('nearest_sentences', 'nearest_sentences') --------------------------------------------------------------------------------
/textmorph/edit_model/agenda.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear 3 | 4 | 5 | class AgendaMaker(Module): 6 | def __init__(self, source_dim, edit_dim, agenda_dim): 7 | super(AgendaMaker, self).__init__() 8 | self.linear = Linear(source_dim + edit_dim, agenda_dim) 9 | 10 | def forward(self, source_embed, edit_embed): 11 | """Create agenda vector from source text embedding and edit embedding.
12 | 13 | Args: 14 | source_embed (Variable): of shape (batch_size, source_dim) 15 | edit_embed (Variable): of shape (batch_size, edit_dim) 16 | 17 | Returns: 18 | agenda (Variable): of shape (batch_size, agenda_dim) 19 | """ 20 | inp = torch.cat([source_embed, edit_embed], 1) # (batch_size, source_dim + edit_dim) 21 | return self.linear(inp) --------------------------------------------------------------------------------
/third-party/gtd/scripts/git_logs.py: -------------------------------------------------------------------------------- 1 | from git import Repo 2 | from os.path import join 3 | 4 | import sys 5 | print sys.path 6 | 7 | from gtd.git_utils import commit_diff 8 | from gtd.chrono import verboserate 9 | 10 | 11 | repo_path = sys.argv[1] 12 | max_count = int(sys.argv[2]) 13 | files = set(sys.argv[3:]) 14 | 15 | def format_commit(c): 16 | msg = c.message.split('\n')[0] 17 | return '{}\t{}'.format(c.hexsha, msg) 18 | 19 | repo = Repo(repo_path) 20 | commits = list(repo.iter_commits('master', max_count=max_count)) 21 | lines = [] 22 | for c in verboserate(commits, desc='Scanning commits', total=max_count): 23 | if len(files & commit_diff(c)) == 0: 24 | continue 25 | lines.append(format_commit(c)) 26 | 27 | log_path = join(repo_path, 'git-logs.tsv') 28 | with open(log_path, 'w') as f: 29 | for line in lines: 30 | f.write(line) 31 | f.write('\n') --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Neural editor 2 | 3 | Source code accompanying our paper, "Generating Sentences by Editing Prototypes" ([paper](https://arxiv.org/abs/1709.08878), [slides](http://kelvinguu.com/posts/generating-sentences-by-editing-prototypes/)). 4 | 5 | **Authors:** Kelvin Guu\*, Tatsunori B. Hashimoto\*, Yonatan Oren, Percy Liang (\* equal contribution) 6 | 7 | 8 | - A detailed description of the training algorithm is now [available here](http://kelvinguu.com/public/projects/neural_editor_training.pdf). 9 | - We are drafting a more detailed README in the 10 | [README](https://github.com/kelvinguu/neural-editor/tree/readme) branch (see that branch for dataset links). 11 | - This is research code meant to serve as a reference implementation. We do not 12 | recommend heavily extending or modifying this codebase for other purposes. 13 | 14 | If you have questions, please email Kelvin at `guu.kelvin` at `gmail.com`. 
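As a rough usage sketch (not an official recipe): set the `TEXTMORPH_DATA` environment variable to your data directory (see `textmorph/data.py`), then pass one or more config files to an entry point, e.g. `python textmorph/edit_model/main.py configs/edit_model/edit_baseline.txt`.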
15 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_recurrent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | 4 | from gtd.ml.torch.recurrent import tile_state, gated_update 5 | from gtd.ml.torch.utils import assert_tensor_equal 6 | 7 | 8 | def test_tile_state(): 9 | h = GPUVariable(torch.FloatTensor([1, 2, 3])) 10 | h_tiled = tile_state(h, 3) 11 | assert_tensor_equal(h_tiled, [[1, 2, 3], [1, 2, 3], [1, 2, 3]]) 12 | 13 | 14 | def test_gated_update(): 15 | h = GPUVariable(torch.FloatTensor([ 16 | [1, 2, 3], 17 | [4, 5, 6], 18 | ])) 19 | h_new = GPUVariable(torch.FloatTensor([ 20 | [-1, 2, 3], 21 | [4, 8, 0], 22 | ])) 23 | update = GPUVariable(torch.FloatTensor([[0], [1]])) # only update the second row 24 | 25 | out = gated_update(h, h_new, update) 26 | 27 | assert_tensor_equal(out, [ 28 | [1, 2, 3], 29 | [4, 8, 0] 30 | ]) --------------------------------------------------------------------------------
/configs/edit_model/edit_test.txt: -------------------------------------------------------------------------------- 1 | include "../optim/debug.txt" 2 | include "../eval/debug.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 256 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.1 19 | attend_pr = 0.5 20 | enable_vae = True 21 | lamb_reg = 50.0 22 | norm_eps = 0.1 23 | norm_max = 7.5 24 | kill_edit = True 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_yelp.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = yelp_dataset_small 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/configs/edit_model/edit_onebil.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 256 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.01 19 | attend_pr = 0.0 20 | enable_vae = True 21 | lamb_reg = 50.0 22 | norm_eps = 0.1 23 | norm_max = 7.5 24 | kill_edit = True 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_onebil.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = onebillion_split 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/configs/edit_model/edit_baseline.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/short.txt" 3 | 4 | seed = 0 5 | 6 | editor { 7 | decoder_cell = AttentionDecoderCell 8 | vocab_size = 10000 # a proper size would be >20000 9 | 
word_dim = 300 10 | hidden_dim = 256 # hidden state dim of encoder and decoder 11 | agenda_dim = 256 # agenda vector dim 12 | edit_dim = 128 # edit vector dimension 13 | attention_dim = 128 14 | encoder_layers = 3 15 | decoder_layers = 3 16 | no_insert_delete_attn = False 17 | edit_dropout = True 18 | ident_pr = 0.1 19 | attend_pr = 0.0 20 | enable_vae = True 21 | lamb_reg = 100.0 22 | norm_eps = 0.1 23 | norm_max = 14.0 24 | kill_edit = False 25 | embed_sentence = False 26 | wvec_path = glove.6B.300d_yelp.txt 27 | } 28 | 29 | dataset { 30 | # this path should be relative to $TEXTMORPH_DATA 31 | path = yelp_dataset_large_split 32 | use_diff = True 33 | } 34 | 35 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from gtd.ml.torch.utils import expand_dims_for_broadcast, assert_tensor_equal, is_binary 5 | 6 | 7 | def test_expand_dims_for_broadcast(): 8 | low_tensor = torch.FloatTensor([[1, 2, 3], [4, 5, 6]]) # (2, 3) 9 | high_tensor = torch.zeros(2, 3, 8, 1) 10 | 11 | new_tensor = expand_dims_for_broadcast(low_tensor, high_tensor) 12 | 13 | assert new_tensor.size() == (2, 3, 1, 1) 14 | 15 | assert_tensor_equal(new_tensor.squeeze(), low_tensor) 16 | 17 | with pytest.raises(AssertionError): 18 | bad_tensor = torch.zeros(2, 4, 8, 1) # prefix doesn't match 19 | expand_dims_for_broadcast(low_tensor, bad_tensor) 20 | 21 | 22 | def test_is_binary(): 23 | t1 = torch.FloatTensor([0, 1, 0, 0]) 24 | t2 = torch.FloatTensor([0, -1, 0, 0]) 25 | t3 = torch.FloatTensor([0, 0.1, 0.2, 0]) 26 | assert is_binary(t1) 27 | assert not is_binary(t2) 28 | assert not is_binary(t3) --------------------------------------------------------------------------------
/configs/edit_model/edit_logp.txt: -------------------------------------------------------------------------------- 1 | include "../optim/default.txt" 2 | include "../eval/default.txt" 3 | 4 | optim { 5 | learning_rate = 0.001 6 | } 7 | 8 | seed = 0 9 | 10 | editor { 11 | decoder_cell = AttentionDecoderCell 12 | vocab_size = 10000 # a proper size would be >20000 13 | word_dim = 300 14 | hidden_dim = 256 # hidden state dim of encoder and decoder 15 | agenda_dim = 256 # agenda vector dim 16 | edit_dim = 128 # edit vector dimension 17 | attention_dim = 128 18 | encoder_layers = 3 19 | decoder_layers = 3 20 | no_insert_delete_attn = False 21 | edit_dropout = True 22 | ident_pr = 0.1 23 | attend_pr = 0.0 24 | enable_vae = True 25 | lamb_reg = 15.0 26 | norm_eps = 1.0 27 | norm_max = 10.0 28 | kill_edit = False 29 | embed_sentence = False 30 | wvec_path = glove.6B.300d_yelp.txt 31 | } 32 | 33 | dataset { 34 | # this path should be relative to $TEXTMORPH_DATA 35 | path = yelp_dataset_large_split 36 | use_diff = True 37 | } 38 | 39 | --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/tf/profile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tensorflow as tf 3 | from tensorflow.python.client import timeline 4 | 5 | 6 | class ProfiledSession(tf.Session): 7 | def __init__(self, *args, **kwargs): 8 | super(ProfiledSession, self).__init__(*args, **kwargs) 9 | 10 | def run(self, fetches, feed_dict=None): 11 | """like Session.run, but return a Timeline object in Chrome trace format (JSON).
12 | 13 | Save the json to a file, go to chrome://tracing, and open the file. 14 | 15 | Args: 16 | fetches 17 | feed_dict 18 | 19 | Returns: 20 | dict: a JSON dict 21 | """ 22 | options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) 23 | run_metadata = tf.RunMetadata() 24 | super(ProfiledSession, self).run(fetches, feed_dict, options=options, run_metadata=run_metadata) 25 | 26 | # Create the Timeline object, and write it to a json 27 | tl = timeline.Timeline(run_metadata.step_stats) 28 | ctf = tl.generate_chrome_trace_format() 29 | return json.loads(ctf) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def temperature_smooth(sampling_probs, temperature): 5 | """Smooth a discrete distribution by raising/lowering temperature. 6 | 7 | Args: 8 | sampling_probs (np.ndarray): 1D numpy array 9 | temperature (float) 10 | 11 | Returns: 12 | np.ndarray: 1D array of same shape as sampling_probs 13 | """ 14 | if not isinstance(sampling_probs, np.ndarray): 15 | raise TypeError("sampling_probs must be numpy array.") 16 | 17 | if temperature <= 0: 18 | raise ValueError("Temperature must be positive.") 19 | 20 | if not np.isfinite(temperature): 21 | raise ValueError("Temperature must be finite.") 22 | 23 | if abs(np.sum(sampling_probs) - 1.0) > 0.001: 24 | raise ValueError("sampling_probs must sum to 1.") 25 | 26 | if not np.all(sampling_probs >= 0): 27 | raise ValueError("sampling_probs must all be non-negative.") 28 | 29 | logits = np.log(sampling_probs) # should be in range [-inf, 0] 30 | unnormalized = np.exp(logits / temperature) 31 | probs = unnormalized / np.sum(unnormalized) 32 | return probs -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_io.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gtd.io import IntegerDirectories, split_path 4 | 5 | 6 | class TestIntegerDirectories(object): 7 | @pytest.fixture 8 | def int_dirs(self, tmpdir): 9 | tmpdir.mkdir('152_blah') 10 | tmpdir.mkdir('153_woo') 11 | tmpdir.mkdir('1_') # no suffix, should still match 12 | tmpdir.mkdir('-1') # no suffix, should still match 13 | tmpdir.mkdir('_10') # prefix is not integer, ignore 14 | tmpdir.mkdir('.DS_Store') 15 | tmpdir.mkdir('other') 16 | return IntegerDirectories(str(tmpdir)) 17 | 18 | def test_keys(self, int_dirs): 19 | assert int_dirs.keys() == [-1, 1, 152, 153] 20 | assert len(int_dirs) == 4 21 | 22 | def test_largest_int(self, int_dirs): 23 | assert int_dirs.largest_int == 153 24 | 25 | def test_new_dir(self, tmpdir, int_dirs): 26 | correct = str(tmpdir.join('154')) 27 | assert int_dirs.new_dir() == correct 28 | 29 | def test_new_dir_named(self, tmpdir, int_dirs): 30 | correct = str(tmpdir.join('154')) + '_foobar' 31 | assert int_dirs.new_dir('foobar') == correct 32 | 33 | 34 | def test_split_path(): 35 | path = '/Users/Joe/Documents/file.txt' 36 | assert split_path(path) == ['Users', 'Joe', 'Documents', 'file.txt'] -------------------------------------------------------------------------------- /textmorph/edit_model/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from gtd.io import save_stdout 4 | from gtd.log import set_log_level 5 | from gtd.utils import Config 6 | from textmorph.edit_model.training_run import 
EditTrainingRuns 7 | 8 | set_log_level('DEBUG') 9 | 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument('exp_id', nargs='+') 13 | arg_parser.add_argument('-c', '--check_commit', default='strict') 14 | arg_parser.add_argument('-p', '--profile', action='store_true') 15 | args = arg_parser.parse_args() 16 | 17 | # create experiment 18 | experiments = EditTrainingRuns(check_commit=(args.check_commit=='strict')) 19 | 20 | exp_id = args.exp_id 21 | if exp_id == ['default']: 22 | # new default experiment 23 | exp = experiments.new() 24 | elif len(exp_id) == 1 and exp_id[0].isdigit(): 25 | # reload old experiment 26 | exp = experiments[int(exp_id[0])] 27 | else: 28 | # new experiment according to configs 29 | config = Config.from_file(exp_id[0]) 30 | for filename in exp_id[1:]: 31 | config = Config.merge(config, Config.from_file(filename)) 32 | exp = experiments.new(config) # new experiment from config 33 | 34 | # start training 35 | exp.workspace.add_file('stdout', 'stdout.txt') 36 | exp.workspace.add_file('stderr', 'stderr.txt') 37 | 38 | 39 | with save_stdout(exp.workspace.root): 40 | exp.train() 41 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_log.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from gtd.log import Metadata, SyncedMetadata 4 | 5 | 6 | class TestMetadata(object): 7 | @pytest.fixture 8 | def m(self): 9 | m = Metadata() 10 | m['a'] = 10 # this is overwritten 11 | m['b'] = 'test' 12 | 13 | # namescope setitem 14 | with m.name_scope('c'): 15 | m['foo'] = 140 16 | 17 | # nested setitem 18 | m['a.foo'] = 120 19 | m['c.bar'] = 'what' 20 | 21 | return m 22 | 23 | def test_getitem(self, m): 24 | assert m['b'] == 'test' 25 | 26 | def test_nested_getitem(self, m): 27 | assert m['a.foo'] == 120 28 | assert m['c.foo'] == 140 29 | 30 | def test_namescope_getitem(self, m): 31 | with m.name_scope('c'): 32 | assert m['bar'] == 'what' 33 | 34 | def test_nested_metadata(self, m): 35 | m_sub = m['a'] 36 | assert isinstance(m_sub, Metadata) 37 | assert m_sub['foo'] == 120 38 | 39 | def test_contains(self, m): 40 | assert 'b' in m 41 | assert 'bar' not in m 42 | assert 'c.bar' in m 43 | 44 | 45 | class TestSyncedMetadata(TestMetadata): # run all the metadata tests 46 | def test_syncing(self, tmpdir): 47 | meta_path = str(tmpdir.join('meta.txt')) 48 | s = SyncedMetadata(meta_path) 49 | 50 | with s.name_scope('job'): 51 | s['memory'] = 128 52 | 53 | s2 = SyncedMetadata(meta_path) # reload the file 54 | 55 | assert s2['job.memory'] == 128 -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from numpy.testing import assert_almost_equal 4 | 5 | from gtd.ml.utils import temperature_smooth 6 | 7 | 8 | def test_temperature_smooth(): 9 | smooth = lambda probs, temp: temperature_smooth(np.array(probs, dtype=np.float32), temp) 10 | same = lambda x1, x2: assert_almost_equal(x1, x2, decimal=4) 11 | 12 | probs = [0., 0.2, 0.4, 0.4] 13 | third = 1./3 14 | correct = [0., third, third, third] 15 | same(smooth(probs, 100000), correct) 16 | 17 | # doesn't sum to 1 18 | with pytest.raises(ValueError): 19 | smooth([1, 2, 0], 1) 20 | 21 | # contains negative numbers 22 | with pytest.raises(ValueError): 23 | smooth([1, -1, 1], 1) 24 | 25 | # temperature = 0 26 | with 
pytest.raises(ValueError): 27 | probs = [0, 0.25, 0.75, 0] 28 | smooth(probs, 0) 29 | 30 | # temperature = inf 31 | with pytest.raises(ValueError): 32 | probs = [0, 0.25, 0.75, 0] 33 | smooth(probs, float('inf')) 34 | 35 | # temperature = 1 36 | probs = [0, 0.25, 0.75, 0] 37 | same(smooth(probs, 1), probs) # shouldn't alter probs 38 | 39 | # contains 1 40 | probs = [1, 0, 0] 41 | same(smooth(probs, 10), probs) 42 | same(smooth(probs, 0.1), probs) 43 | 44 | a = np.exp(2) 45 | b = np.exp(3) 46 | 47 | probs = [0, a/(a+b), b/(a+b)] 48 | smoothed = smooth(probs, 11) 49 | 50 | a2 = np.exp(2. / 11) 51 | b2 = np.exp(3. / 11) 52 | correct = [0, a2/(a2+b2), b2/(a2+b2)] 53 | same(smoothed, correct) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/recurrent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | from torch.nn import Module 4 | 5 | from gtd.ml.torch.utils import conditional 6 | 7 | 8 | def tile_state(h, batch_size): 9 | """Tile a given hidden state batch_size times. 10 | 11 | Args: 12 | h (Variable): a single hidden state of shape (hidden_dim,) 13 | batch_size (int) 14 | 15 | Returns: 16 | a Variable of shape (batch_size, hidden_dim) 17 | """ 18 | tiler = GPUVariable(torch.ones(batch_size, 1)) 19 | return torch.mm(tiler, h.unsqueeze(0)) # (batch_size, hidden_size) 20 | 21 | 22 | def gated_update(h, h_new, update): 23 | """If update == 1.0, return h_new; if update == 0.0, return h. 24 | 25 | Applies this logic to each element in a batch. 26 | 27 | Args: 28 | h (Variable): of shape (batch_size, hidden_dim) 29 | h_new (Variable): of shape (batch_size, hidden_dim) 30 | update (Variable): of shape (batch_size, 1). 31 | 32 | Returns: 33 | Variable: of shape (batch_size, hidden_dim) 34 | 35 | """ 36 | batch_size, hidden_dim = h.size() 37 | gate = update.expand(batch_size, hidden_dim) 38 | return conditional(gate, h_new, h) 39 | 40 | 41 | class AdditionCell(Module): 42 | """Just add the input vector to the hidden state vector.""" 43 | 44 | def __init__(self, input_dim, hidden_dim): 45 | super(AdditionCell, self).__init__() 46 | self.W = GPUVariable(torch.eye(input_dim, hidden_dim)) 47 | # truncates input if input_dim > hidden_dim 48 | # pads with zeros if input_dim < hidden_dim 49 | self.hidden_size = hidden_dim 50 | 51 | def forward(self, x, hc): 52 | h, c = hc 53 | h = x.mm(self.W) + h 54 | return h, c -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/feed_forward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear 3 | 4 | 5 | class FeedForwardNetwork(Module): 6 | """A standard feedforward network, with residual connections for equal-sized layers.""" 7 | def __init__(self, layer_dims): 8 | """Construct network. 9 | 10 | For len(layer_dims) == 3: 11 | 12 | y = f(x * W1 + b1) * W2 + b2 13 | 14 | x: (batch_size, layer_dims[0]) 15 | W1: (layer_dims[0], layer_dims[1]) 16 | W2: (layer_dims[1], layer_dims[2]) 17 | 18 | Note that there is no nonlinearity after final linear transform. 
19 | 20 | Args: 21 | layer_dims (list[int]): 22 | layer_dims[0] = input dimension 23 | layer_dims[-1] = output dimension 24 | """ 25 | if len(layer_dims) < 3: 26 | raise ValueError("len(layer_dims) == 2 is just linear, and fewer layers does not make sense.") 27 | 28 | super(FeedForwardNetwork, self).__init__() 29 | self.nonlinearity = torch.nn.Tanh() # same for all layers 30 | self.layers = [] 31 | for i in range(len(layer_dims) - 1): 32 | # these layers include a bias term 33 | layer = Linear(layer_dims[i], layer_dims[i + 1]) 34 | # make sure to register sub-module 35 | self.add_module('linear_{}'.format(i), layer) 36 | self.layers.append(layer) 37 | 38 | def forward(self, x): 39 | for i, layer in enumerate(self.layers): 40 | x_prev = x 41 | x = layer(x) 42 | if i != len(self.layers) - 1: 43 | x = self.nonlinearity(x) # apply nonlinearity if it is not the final layer 44 | 45 | if x.size() == x_prev.size(): 46 | x = x + x_prev # residual connection 47 | 48 | return x --------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_token_embedder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from torch.nn import Embedding 5 | 6 | from gtd.ml.torch.token_embedder import TokenEmbedder 7 | from gtd.ml.torch.utils import GPUVariable 8 | from gtd.ml.torch.utils import assert_tensor_equal 9 | from gtd.ml.vocab import SimpleVocab 10 | from gtd.utils import Bunch 11 | 12 | 13 | class TestTokenEmbedder(object): 14 | @pytest.fixture 15 | def embedder(self): 16 | vocab = SimpleVocab(['<unk>', '<start>', '<stop>'] + ['a', 'b', 'c']) 17 | arr = np.eye(len(vocab), dtype=np.float32) 18 | word_embeddings = Bunch(vocab=vocab, array=arr) 19 | return TokenEmbedder(word_embeddings) 20 | 21 | def test_embedding_from_array(self): 22 | emb = TokenEmbedder._embedding_from_array(np.array([[9, 9], [8, 7]], dtype=np.float32)) 23 | assert isinstance(emb, Embedding) 24 | values = emb(GPUVariable(torch.LongTensor([[0, 0], [1, 0]]))) 25 | 26 | assert_tensor_equal(values, 27 | [ 28 | [[9, 9], [9, 9]], 29 | [[8, 7], [9, 9]], 30 | ]) 31 | 32 | def test_embed_indices(self, embedder): 33 | indices = GPUVariable(torch.LongTensor([ 34 | [0, 1], 35 | [2, 2], 36 | [4, 5], 37 | ])) 38 | 39 | embeds = embedder.embed_indices(indices) 40 | 41 | assert_tensor_equal(embeds, [ 42 | [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0]], 43 | [[0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0]], 44 | [[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]], 45 | ]) 46 | 47 | def test_embed_tokens(self, embedder): 48 | tokens = ['b', 'c', 'c'] 49 | embeds = embedder.embed_tokens(tokens) 50 | 51 | assert_tensor_equal(embeds, [ 52 | [0, 0, 0, 0, 1, 0], 53 | [0, 0, 0, 0, 0, 1], 54 | [0, 0, 0, 0, 0, 1], 55 | ]) --------------------------------------------------------------------------------
/textmorph/edit_model/edit_noiser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from textmorph.edit_model.editor import EditExample 4 | 5 | 6 | class EditNoiser(object): 7 | 8 | def __init__(self, ident_pr = 0.1, attend_pr = 0.5): 9 | self.ident_pr = ident_pr 10 | self.attend_pr = attend_pr 11 | 12 | def __call__(self, examples): 13 | """Return a batch of noisy EditExamples. 14 | 15 | Does not modify the original EditExamples.
16 | """ 17 | return [self._noise(ex) for ex in examples] 18 | 19 | def dropout_split(self, word_list): 20 | pr_list = [1.0-self.attend_pr, self.attend_pr] 21 | if len(word_list)>0: 22 | num_sampled = np.random.choice(np.arange(len(pr_list)), 1, p=pr_list) 23 | num_sampled = min(num_sampled, len(word_list)) 24 | choice_index = np.random.choice(np.arange(len(word_list)), num_sampled, replace=False) 25 | mask = np.zeros(len(word_list), dtype=bool) 26 | mask[choice_index] = True 27 | warray = np.array(word_list) 28 | return (warray[mask]).tolist(), (warray[np.invert(mask)]).tolist() 29 | else: 30 | return [], [] 31 | 32 | def _noise(self, ex): 33 | """Return a noisy EditExample. 34 | 35 | Note: this strategy is only appropriate for diff-style EditExamples. 36 | 37 | Args: 38 | ex (EditExample) 39 | 40 | Returns: 41 | EditExample: a new example. Does not modify the original example. 42 | """ 43 | ident_map = np.random.binomial(1,self.ident_pr) 44 | if ident_map: 45 | return EditExample(ex.source_words, [], [], [], [], ex.source_words) 46 | else: 47 | insert_exact, insert_approx= self.dropout_split(ex.insert_exact_words) 48 | delete_exact, delete_approx = self.dropout_split(ex.delete_exact_words) 49 | return EditExample(ex.source_words, insert_approx, insert_exact, delete_approx, delete_exact, ex.target_words) 50 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_lm.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from numpy.testing import assert_approx_equal 3 | from gtd.lm import last_k, CountLM, LMSampler, normalize_counts 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def lm(): 9 | return CountLM(3) 10 | 11 | 12 | @pytest.fixture 13 | def lm_sampler(lm): 14 | return LMSampler(lm) 15 | 16 | 17 | def test_last_k(): 18 | tokens = [1, 2, 3, 4] 19 | assert last_k(tokens, 2) == (3, 4) 20 | assert last_k(tokens, 4) == (1, 2, 3, 4) 21 | assert last_k(tokens, 0) == tuple() 22 | 23 | 24 | def test_get_contexts(lm): 25 | tokens = [1, 2, 3, 4, 5] 26 | assert list(lm._get_contexts(tokens)) == [tuple(), (5,), (4, 5), (3, 4, 5)] 27 | 28 | assert list(lm._get_contexts([1, 2])) == [tuple(), (2,), (1, 2)] 29 | 30 | 31 | def test_largest_known_context(lm): 32 | contexts = {tuple(), (3,), (2, 3), (1, 2)} 33 | assert lm._largest_context([1, 2, 3], contexts) == (2, 3) 34 | assert lm._largest_context([2, 3, 0], contexts) == tuple() 35 | 36 | 37 | def test_normalize_counts(): 38 | c = Counter([1, 1, 2, 2, 3]) 39 | assert normalize_counts(c) == Counter({1: .4, 2: .4, 3: .2}) 40 | 41 | 42 | @pytest.mark.skip 43 | def test_sample_from_distribution(lm_sampler): 44 | distr = {'a': 0.3, 'b': 0.7} 45 | ctr = Counter() 46 | # law of large numbers test 47 | for k in range(100000): 48 | ctr[lm_sampler._sample_from_distribution(distr)] += 1 49 | empirical = normalize_counts(ctr) 50 | for key in distr.keys() + empirical.keys(): 51 | assert_approx_equal(empirical[key], distr[key], significant=2) 52 | 53 | def test_sequence_probability(lm): 54 | lm = CountLM(3) 55 | lines = ['apple pear banana', 'pear banana apple', 'banana pear banana'] 56 | for line in lines: 57 | tokens = line.split() 58 | lm.record_counts(tokens, append_end=True) 59 | 60 | probs = lm.sequence_probability(['pear', 'apple', 'pear']) 61 | assert probs == [('pear', 0.3333333333333333), ('apple', 0.0), ('pear', 0.5)] -------------------------------------------------------------------------------- 
/third-party/gtd/gtd/ml/torch/simple_decoder_cell.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import torch 4 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 5 | 6 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 7 | from gtd.ml.torch.recurrent import tile_state, gated_update 8 | 9 | 10 | class SimpleRNNState(namedtuple('SimpleRNNState', ['h', 'c']), RNNState): 11 | pass 12 | 13 | 14 | class SimpleRNNInput(namedtuple('SimpleRNNInput', ['x', 'agenda']), RNNInput): 15 | pass 16 | 17 | 18 | class SimpleDecoderCell(DecoderCell): 19 | def __init__(self, token_embedder, hidden_dim, input_dim, agenda_dim): 20 | super(SimpleDecoderCell, self).__init__() 21 | self.rnn_cell = LSTMCell(input_dim + agenda_dim, hidden_dim) 22 | self.linear = Linear(hidden_dim, input_dim) 23 | self.h0 = Parameter(torch.zeros(hidden_dim)) 24 | self.c0 = Parameter(torch.zeros(hidden_dim)) 25 | self.softmax = Softmax() 26 | self.token_embedder = token_embedder 27 | 28 | @property 29 | def rnn_state_type(self): 30 | return SimpleRNNState 31 | 32 | @property 33 | def rnn_input_type(self): 34 | return SimpleRNNInput 35 | 36 | def initialize(self, batch_size): 37 | h = tile_state(self.h0, batch_size) 38 | c = tile_state(self.c0, batch_size) 39 | return SimpleRNNState(h, c) 40 | 41 | def forward(self, rnn_state, rnn_input, advance): 42 | rnn_input_embed = torch.cat([rnn_input.x, rnn_input.agenda], 1) 43 | h, c = self.rnn_cell(rnn_input_embed, (rnn_state.h, rnn_state.c)) 44 | 45 | # don't update if sequence has terminated 46 | h = gated_update(rnn_state.h, h, advance) 47 | c = gated_update(rnn_state.c, c, advance) 48 | 49 | query = self.linear(h) 50 | word_vocab = self.token_embedder.vocab 51 | word_embeds = self.token_embedder.embeds 52 | vocab_logits = torch.mm(query, word_embeds.t()) # (batch_size, vocab_size) 53 | vocab_probs = self.softmax(vocab_logits) 54 | 55 | # no attention over source, insert and delete embeds 56 | rnn_state = SimpleRNNState(h, c) 57 | 58 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) -------------------------------------------------------------------------------- /third-party/gtd/gtd/plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for plotting 3 | """ 4 | import os 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from gtd.io import makedirs 9 | from gtd.log import in_ipython 10 | 11 | 12 | def hinton(matrix, max_weight=None, ax=None, xtick=None, ytick=None, inverted_color=False): 13 | """Draw Hinton diagram for visualizing a weight matrix. 
14 | 15 | Copied from: http://matplotlib.org/examples/specialty_plots/hinton_demo.html 16 | """ 17 | ax = ax if ax is not None else plt.gca() 18 | if not max_weight: 19 | max_weight = 2**np.ceil(np.log(np.abs(matrix).max())/np.log(2)) 20 | 21 | ax.patch.set_facecolor('gray') 22 | ax.set_aspect('equal', 'box') 23 | ax.xaxis.set_major_locator(plt.NullLocator()) 24 | ax.yaxis.set_major_locator(plt.NullLocator()) 25 | 26 | for (x, y), w in np.ndenumerate(matrix): 27 | if inverted_color: 28 | color = 'black' if w > 0 else 'white' 29 | else: 30 | color = 'white' if w > 0 else 'black' 31 | size = np.sqrt(np.abs(w)) 32 | rect = plt.Rectangle([x - size / 2, y - size / 2], size, size, 33 | facecolor=color, edgecolor=color) 34 | ax.add_patch(rect) 35 | 36 | ax.autoscale_view() 37 | ax.invert_yaxis() 38 | 39 | if xtick: 40 | ax.set_xticks(np.arange(matrix.shape[0])) 41 | ax.set_xticklabels(xtick) 42 | if ytick: 43 | ax.set_yticks(np.arange(matrix.shape[1])) 44 | ax.set_yticklabels(ytick) 45 | return ax 46 | 47 | 48 | def show(title, directory=''): 49 | """If in IPython, show, otherwise, save to file.""" 50 | import matplotlib.pyplot as plt 51 | if in_ipython(): 52 | plt.show() 53 | else: 54 | # ensure directory exists 55 | makedirs(directory) 56 | 57 | plt.savefig(os.path.join(directory, title) + '.png') 58 | # close all figures to conserve memory 59 | plt.close('all') 60 | 61 | 62 | def plot_pdf(x, cov_factor=None, *args, **kwargs): 63 | import matplotlib.pyplot as plt 64 | from scipy.stats import gaussian_kde 65 | density = gaussian_kde(x) 66 | xgrid = np.linspace(min(x), max(x), 200) 67 | if cov_factor is not None: 68 | density.covariance_factor = lambda: cov_factor 69 | density._compute_covariance() 70 | y = density(xgrid) 71 | plt.plot(xgrid, y, *args, **kwargs) 72 | 73 | 74 | def rgb_to_hex(rgb): 75 | return '#%02x%02x%02x' % rgb -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_alignments.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from gtd.ml.torch.utils import assert_tensor_equal 3 | 4 | from gtd.ml.torch.alignments import Alignments 5 | 6 | 7 | class TestAlignments(object): 8 | @pytest.fixture 9 | def source_words(self): 10 | return [ 11 | ['a', 'c', 'b', 'c'], 12 | ['1', '3', '2', '2', '2'], 13 | [], 14 | ] 15 | 16 | @pytest.fixture 17 | def target_words(self): 18 | return [ 19 | ['c', 'z', 'b', 'c'], 20 | ['1', 'c'], 21 | ['2', '4'], 22 | ] 23 | 24 | @pytest.fixture 25 | def aligns(self, source_words, target_words): 26 | return Alignments(source_words, target_words) 27 | 28 | def test(self, aligns): 29 | assert_tensor_equal(aligns.indices, 30 | [ 31 | [[1, 3], [0, 0], [2, 0], [1, 3]], 32 | [[0, 0], [0, 0], [0, 0], [0, 0]], 33 | [[0, 0], [0, 0], [0, 0], [0, 0]], 34 | ]) 35 | 36 | assert_tensor_equal(aligns.mask, 37 | [ 38 | [[1, 1], [0, 0], [1, 0], [1, 1]], 39 | [[1, 0], [0, 0], [0, 0], [0, 0]], 40 | [[0, 0], [0, 0], [0, 0], [0, 0]], 41 | ]) 42 | 43 | def test_split(self, aligns): 44 | items = aligns.split() 45 | assert len(items) == 4 46 | 47 | assert_tensor_equal(items[0].values, 48 | [ 49 | [1, 3], 50 | [0, 0], 51 | [0, 0] 52 | ]) 53 | 54 | assert_tensor_equal(items[0].mask, 55 | [ 56 | [1, 1], 57 | [1, 0], 58 | [0, 0] 59 | ]) 60 | 61 | assert_tensor_equal(items[2].values, 62 | [ 63 | [2, 0], 64 | [0, 0], 65 | [0, 0] 66 | ]) 67 | 68 | assert_tensor_equal(items[2].mask, 69 | [ 70 | [1, 0], 71 | [0, 0], 72 | [0, 0] 73 | ]) 
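# (Reading the fixtures above: target token 'c' in row 0 occurs at source positions 1 and 3,
# hence indices[0][0] == [1, 3]; a target token with no source occurrence, like 'z', gets
# zero-padded indices with a zero mask.)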
-------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/training_run.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import numpy as np 3 | from torch.nn.utils import clip_grad_norm 4 | 5 | from gtd.ml.training_run import TrainingRun 6 | from gtd.ml.torch.checkpoints import Checkpoints 7 | from gtd.utils import cached_property 8 | 9 | 10 | class TorchTrainingRun(TrainingRun): 11 | def __init__(self, config, save_dir): 12 | super(TorchTrainingRun, self).__init__(config, save_dir) 13 | self.workspace.add_dir('checkpoints', 'checkpoints') 14 | 15 | @cached_property 16 | def checkpoints(self): 17 | return Checkpoints(self.workspace.checkpoints) 18 | 19 | @classmethod 20 | def _finite_grads(cls, parameters): 21 | """Check that all parameter gradients are finite. 22 | 23 | Args: 24 | parameters (List[Parameter]) 25 | 26 | Return: 27 | bool 28 | """ 29 | for param in parameters: 30 | if param.grad is None: continue 31 | if not np.isfinite(param.grad.data.sum()): 32 | return False 33 | return True 34 | 35 | @classmethod 36 | def _take_grad_step(cls, train_state, loss, max_grad_norm=float('inf')): 37 | """Try to take a gradient step w.r.t. loss. 38 | 39 | If the gradient is finite, takes a step. Otherwise, does nothing. 40 | 41 | Args: 42 | train_state (TrainState) 43 | loss (Variable): a differentiable scalar variable 44 | max_grad_norm (float): gradient norm is clipped to this value. 45 | 46 | Returns: 47 | bool: True if the gradient was finite. 48 | """ 49 | model, optimizer = train_state.model, train_state.optimizer 50 | optimizer.zero_grad() 51 | loss.backward() 52 | 53 | # clip according to the max allowed grad norm 54 | grad_norm = clip_grad_norm(model.parameters(), max_grad_norm, norm_type=2) 55 | # (this returns the gradient norm BEFORE clipping) 56 | 57 | # track the gradient norm over time 58 | train_state.track_grad_norms(grad_norm) 59 | 60 | finite_grads = cls._finite_grads(model.parameters()) 61 | 62 | # take a step if the grads are finite 63 | if finite_grads: 64 | optimizer.step() 65 | 66 | # increment step count 67 | train_state.increment_train_steps() 68 | 69 | return finite_grads 70 | 71 | def _update_metadata(self, train_state): 72 | self.metadata['last_seen'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 73 | self.metadata['steps'] = train_state.train_steps 74 | self.metadata['max_grad_norm'] = train_state.max_grad_norm -------------------------------------------------------------------------------- /third-party/gtd/scripts/run_nlpsub.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json and run_docker.py can also be found at the root of your repo. 
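# Example invocation (hypothetical, mirroring configs/edit_model/autogen.sh):
#   ./nlpsub.py -g 1 -n testruns 'python textmorph/edit_model/main.py configs/edit_model/edit_baseline.txt'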
5 | 6 | import argparse 7 | import json 8 | import os 9 | from datetime import datetime 10 | import subprocess 11 | 12 | from os.path import abspath, dirname, join 13 | 14 | arg_parser = argparse.ArgumentParser() 15 | arg_parser.add_argument('-n', '--name', default='unnamed', help='Job name.') 16 | arg_parser.add_argument('-t', '--tail', action='store_true', help='Tail the output.') 17 | arg_parser.add_argument('-x', '--debug', action='store_true', help='Print command instead of running.') 18 | arg_parser.add_argument('-m', '--host', action='append', help='Allowed hosts.') 19 | arg_parser.add_argument('-g', '--gpu', default='0', help='GPU to use.') 20 | arg_parser.add_argument('command', nargs='+', help='Command passed to run_docker.py') 21 | args = arg_parser.parse_args() 22 | 23 | repo_dir = abspath(dirname(__file__)) 24 | with open(join(repo_dir, 'config.json'), 'r') as f: 25 | config = json.load(f) 26 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 'TEXTMORPH_DATA' 27 | data_dir = os.environ[data_env_var] 28 | 29 | time_str = datetime.now().strftime('%Y-%m-%d_%H.%M.%S') 30 | log_dir = join(data_dir, 'nlpsub', '{}_{}'.format(args.name, time_str)) 31 | 32 | nlpsub_options = ['--queue=jag', 33 | '--cores=1', 34 | '--mem=2g', 35 | '--priority=high', 36 | '--name={}'.format(args.name), 37 | '--log-dir={}'.format(log_dir), 38 | '--mail=bea', 39 | '--clobber', 40 | '--verbose'] 41 | if args.tail: 42 | nlpsub_options.append('--tail') 43 | 44 | if args.host is not None: 45 | nlpsub_options.append('--hosts={}'.format(','.join(args.host))) 46 | 47 | def bash_string(s): 48 | s = s.replace('\\', '\\\\') # \ -> \\ 49 | s = s.replace('\"', '\\\"') # " -> \" 50 | return '\"{}\"'.format(s) # s -> "s" 51 | 52 | 53 | cmd = ' '.join(args.command) 54 | 55 | docker_cmd = '/u/nlp/packages/anaconda2/bin/python run_docker.py -r -g {gpu} {command}'.format(gpu=args.gpu, command=bash_string(cmd)) 56 | 57 | nlpsub_cmd = 'nlpsub {options} {command}'.format(options=' '.join(nlpsub_options), command=bash_string(docker_cmd)) 58 | 59 | print 'Logging to: {}'.format(log_dir) 60 | print 'Allowed hosts: {}'.format(args.host) 61 | print 'GPU to use: {}'.format(args.gpu) 62 | print 'Command inside Docker: {}'.format(cmd) 63 | print nlpsub_cmd 64 | print 65 | 66 | if not args.debug: 67 | subprocess.call(nlpsub_cmd, shell=True) 68 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_source_encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from gtd.ml.torch.recurrent import AdditionCell 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | from gtd.ml.torch.source_encoder import BidirectionalSourceEncoder 7 | from gtd.ml.torch.token_embedder import TokenEmbedder 8 | from gtd.ml.torch.utils import assert_tensor_equal 9 | from gtd.ml.vocab import SimpleVocab 10 | from gtd.utils import Bunch 11 | 12 | 13 | class TestBidirectionalSourceEncoder(object): 14 | @pytest.fixture 15 | def encoder(self): 16 | return BidirectionalSourceEncoder(1, 2, AdditionCell) 17 | 18 | @pytest.fixture 19 | def input_embeds_list(self): 20 | sequences = [ 21 | [1, 2, 3], 22 | [8, 4, 2, 1, 1], 23 | [], 24 | ] 25 | 26 | # token 1 maps to embedding [1], 2 maps to [2] and so on... 
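# (so the forward direction cumulatively sums from the left, e.g. [1, 2, 3] reaches 6,
# and the backward direction sums from the right; the final-state assertions below check
# that both directions arrive at 6 and 16)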
27 | vocab = SimpleVocab([1, 2, 3, 4, 5, 6, 7, 8]) 28 | array = np.expand_dims(np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.float32), 1) 29 | token_embedder = TokenEmbedder(Bunch(vocab=vocab, array=array)) 30 | 31 | seq_embeds = token_embedder.embed_seq_batch(SequenceBatch.from_sequences(sequences, vocab)) 32 | return seq_embeds.split() 33 | 34 | def test_combined_states(self, encoder, input_embeds_list): 35 | states = encoder(input_embeds_list).combined_states 36 | 37 | # forward encoder is cumulatively summing from the left 38 | # backward encoder is cumulatively summing from the right 39 | 40 | # both encoders should ignore masked time steps 41 | assert_tensor_equal(states[0].values, [[1, 6], 42 | [8, 16], 43 | [0, 0], 44 | ]) 45 | assert_tensor_equal(states[0].mask, [[1], [1], [0]]) 46 | 47 | assert_tensor_equal(states[2].values, [[6, 3], 48 | [14, 4], 49 | [0, 0], 50 | ]) 51 | assert_tensor_equal(states[2].mask, [[1], [1], [0]]) 52 | 53 | assert_tensor_equal(states[3].values, [[6, 0], 54 | [15, 2], 55 | [0, 0], 56 | ]) 57 | assert_tensor_equal(states[3].mask, [[0], [1], [0]]) 58 | 59 | def test_final_states(self, encoder, input_embeds_list): 60 | forward, backward = encoder(input_embeds_list).final_states 61 | assert_tensor_equal(forward, [[6], [16], [0]]) 62 | assert_tensor_equal(backward, [[6], [16], [0]]) 63 | 64 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM kelvinguu/pytorch:1.2 2 | # FROM tensorflow/tensorflow:0.12.0 3 | # FROM continuumio/anaconda 4 | 5 | # Add the PostgreSQL PGP key to verify their Debian packages. 6 | # It should be the same key as https://www.postgresql.org/media/keys/ACCC4CF8.asc 7 | RUN apt-key adv --keyserver hkp://p80.pool.sks-keyservers.net:80 --recv-keys B97B0AFCAA1A47F044F244A07FCC7D46ACCC4CF8 8 | 9 | # Add PostgreSQL's repository. It contains the most recent stable release of PostgreSQL, ``9.3``. 10 | RUN echo "deb http://apt.postgresql.org/pub/repos/apt/ precise-pgdg main" > /etc/apt/sources.list.d/pgdg.list 11 | 12 | # Install ``python-software-properties``, ``software-properties-common`` and PostgreSQL 9.3 13 | # There are some warnings (in red) that show up during the build. 
You can hide 14 | # them by prefixing each apt-get statement with DEBIAN_FRONTEND=noninteractive 15 | RUN apt-get update && apt-get install -y python-software-properties software-properties-common postgresql-9.3 postgresql-client-9.3 postgresql-contrib-9.3 16 | 17 | RUN apt-get update 18 | RUN apt-get --yes --force-yes install libffi6 libffi-dev libssl-dev libpq-dev git 19 | 20 | RUN pip install --upgrade pip 21 | RUN pip install jupyter 22 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension # add Jupyter notebook extension 23 | 24 | RUN pip install fabric 25 | RUN pip install pyOpenSSL==16.2.0 26 | RUN pip install psycopg2==2.6.1 27 | RUN pip install SQLAlchemy==1.1.0b3 28 | RUN pip install cherrypy==8.1.2 29 | RUN pip install bottle==0.12.10 30 | RUN pip install boto==2.43.0 31 | 32 | RUN pip install requests 33 | RUN pip install nltk==3.2.3 34 | RUN python -m nltk.downloader punkt # download tokenizer data 35 | 36 | RUN pip install keras==1.1.0 37 | RUN pip install pyhocon line_profiler pytest tqdm faulthandler python-Levenshtein gitpython futures jsonpickle prettytable tensorboard_logger click 38 | 39 | RUN apt-get update 40 | RUN apt-get install -y vim less tmux nmap 41 | COPY .tmux.conf /root 42 | 43 | # vim bindings for Jupyter 44 | # https://github.com/lambdalisue/jupyter-vim-binding 45 | RUN mkdir -p $(jupyter --data-dir)/nbextensions 46 | RUN git clone https://github.com/lambdalisue/jupyter-vim-binding $(jupyter --data-dir)/nbextensions/vim_binding 47 | RUN jupyter nbextension enable vim_binding/vim_binding 48 | 49 | # autoreload for Jupyter 50 | RUN ipython profile create 51 | RUN echo 'c.InteractiveShellApp.exec_lines = []' >> ~/.ipython/profile_default/ipython_config.py 52 | RUN echo 'c.InteractiveShellApp.exec_lines.append("%load_ext autoreload")' >> ~/.ipython/profile_default/ipython_config.py 53 | RUN echo 'c.InteractiveShellApp.exec_lines.append("%autoreload 2")' >> ~/.ipython/profile_default/ipython_config.py 54 | 55 | # just installing so we can get tensorboard 56 | RUN pip install tensorflow 57 | 58 | RUN pip install annoy pympler -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/decoder_cell.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractproperty 2 | from collections import namedtuple 3 | 4 | import torch 5 | from torch.nn import Module 6 | 7 | from gtd.ml.torch.utils import NamedTupleLike 8 | 9 | 10 | # marker class 11 | class RNNState(NamedTupleLike): 12 | __slots__ = [] 13 | 14 | 15 | # marker class 16 | class RNNInput(NamedTupleLike): 17 | __slots__ = [] 18 | 19 | 20 | PredictionBatch = namedtuple('PredictionBatch', ['probs', 'vocab']) 21 | """ 22 | Attributes: 23 | probs (np.ndarray): of shape (batch_size, vocab_size) 24 | vocab (Vocab) 25 | """ 26 | 27 | 28 | class DecoderCellOutput(namedtuple('DecoderCellOutput', ['rnn_state', 'vocab', 'vocab_probs'])): 29 | """ 30 | Attributes: 31 | rnn_state (RNNState) 32 | vocab (Vocab) 33 | vocab_probs (Variable): of shape (batch_size, vocab_size) 34 | """ 35 | 36 | def loss(self, target_word): 37 | """Compute loss for this time step. 
38 | 39 | Args: 40 | target_word (Variable): LongTensor of shape (batch_size,) 41 | 42 | Returns: 43 | loss (Variable): of shape (batch_size,) 44 | """ 45 | target_prob = torch.gather(self.vocab_probs, 1, target_word.unsqueeze(1)).squeeze(1) # (batch_size,) 46 | assert len(target_prob.size()) == 1 47 | 48 | loss = -torch.log(target_prob + 1e-45) # negative log-likelihood 49 | # added 1e-45 to prevent loss from being -infinity in the case where probs is close to 0 50 | return loss 51 | 52 | @property 53 | def predictions(self): 54 | """Return a PredictionBatch. 55 | 56 | Returns: 57 | PredictionBatch 58 | """ 59 | return PredictionBatch(self.vocab_probs.data.cpu().numpy(), self.vocab) 60 | 61 | 62 | class DecoderCell(Module): 63 | __metaclass__ = ABCMeta 64 | 65 | @abstractproperty 66 | def rnn_state_type(self): 67 | pass 68 | 69 | @abstractproperty 70 | def rnn_input_type(self): 71 | pass 72 | 73 | @abstractmethod 74 | def initialize(self, batch_size): 75 | """Return initial RNNState. 76 | 77 | Args: 78 | batch_size (int) 79 | 80 | Returns: 81 | RNNState 82 | """ 83 | raise NotImplementedError 84 | 85 | def forward(self, rnn_state, rnn_input, advance): 86 | """Advance the decoder by one step. 87 | 88 | Args: 89 | rnn_state (RNNState): the previous RNN state. 90 | rnn_input (RNNInput): any inputs at this time step. 91 | advance (Variable): of shape (batch_size, 1). The RNN should advance on example i iff mask[i] == 1. 92 | 93 | Returns: 94 | DecoderCellOutput 95 | """ 96 | raise NotImplementedError -------------------------------------------------------------------------------- /run_docker.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json can also be found at the root of your repo. 5 | 6 | import argparse 7 | import json 8 | import os 9 | 10 | from os.path import dirname, abspath, join 11 | import subprocess 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | arg_parser.add_argument('-r', '--root', action='store_true', help='Run as root in Docker.') 15 | arg_parser.add_argument('-g', '--gpu', default='', help='GPU to use.') 16 | arg_parser.add_argument('-d', '--debug', action='store_true', help='Print command instead of running.') 17 | arg_parser.add_argument('command', nargs='?', default=None, 18 | help='Command to execute in Docker. If no command is specified, ' \ 19 | 'you enter interactive mode. ' \ 20 | 'To execute a command with spaces, wrap ' \ 21 | 'the entire command in quotes.') 22 | args = arg_parser.parse_args() 23 | 24 | repo_dir = abspath(dirname(__file__)) 25 | 26 | with open(join(repo_dir, 'config.json'), 'r') as f: 27 | config = json.load(f) 28 | 29 | image = config['docker_image'] # name of the Docker image, e.g. 'kelvinguu/textmorph:1.0' 30 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 
'TEXTMORPH_DATA' 31 | data_dir = os.environ[data_env_var] 32 | 33 | my_uid = subprocess.check_output(['echo', '$UID']).strip() 34 | 35 | docker_args = ["--net host", # access to the Internet 36 | "--publish 8888:8888", # only certain ports are exposed 37 | "--publish 6006:6006", 38 | "--publish 8080:8080", 39 | "--ipc=host", 40 | "--rm", 41 | "--volume {}:/data".format(data_dir), 42 | "--volume {}:/code".format(repo_dir), 43 | "--env {}=/data".format(data_env_var), 44 | "--env PYTHONPATH=/code", 45 | "--env NLTK_DATA=/data/nltk", 46 | "--env CUDA_VISIBLE_DEVICES={}".format(args.gpu), 47 | "--workdir /code"] 48 | 49 | # interactive mode 50 | if args.command is None: 51 | docker_args.append('--interactive') 52 | docker_args.append('--tty') 53 | args.command = '/bin/bash' 54 | 55 | if not args.root: 56 | docker_args.append('--user={}'.format(my_uid)) 57 | 58 | if args.gpu == '': 59 | # run on CPU 60 | docker = 'docker' 61 | else: 62 | # run on GPU 63 | docker = 'nvidia-docker' 64 | 65 | pull_cmd = "docker pull {}".format(image) 66 | 67 | run_cmd = '{docker} run {options} {image} {command}'.format(docker=docker, 68 | options=' '.join(docker_args), 69 | image=image, 70 | command=args.command) 71 | print 'Data directory: {}'.format(data_dir) 72 | print 'Command to run inside Docker: {}'.format(args.command) 73 | 74 | print pull_cmd 75 | print run_cmd 76 | if not args.debug: 77 | subprocess.call(pull_cmd, shell=True) 78 | subprocess.call(run_cmd, shell=True) 79 | -------------------------------------------------------------------------------- /third-party/gtd/scripts/run_docker.py: -------------------------------------------------------------------------------- 1 | #!/u/nlp/packages/anaconda2/bin/python 2 | 3 | # THIS SCRIPT SHOULD BE SYMLINKED INTO THE ROOT OF YOUR GIT REPO 4 | # It assumes that config.json can also be found at the root of your repo. 5 | 6 | import argparse 7 | import json 8 | import os 9 | 10 | from os.path import dirname, abspath, join 11 | import subprocess 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | arg_parser.add_argument('-r', '--root', action='store_true', help='Run as root in Docker.') 15 | arg_parser.add_argument('-g', '--gpu', default='', help='GPU to use.') 16 | arg_parser.add_argument('-d', '--debug', action='store_true', help='Print command instead of running.') 17 | arg_parser.add_argument('command', nargs='?', default=None, 18 | help='Command to execute in Docker. If no command is specified, ' \ 19 | 'you enter interactive mode. ' \ 20 | 'To execute a command with spaces, wrap ' \ 21 | 'the entire command in quotes.') 22 | args = arg_parser.parse_args() 23 | 24 | repo_dir = abspath(dirname(__file__)) 25 | 26 | with open(join(repo_dir, 'config.json'), 'r') as f: 27 | config = json.load(f) 28 | 29 | image = config['docker_image'] # name of the Docker image, e.g. 'kelvinguu/textmorph:1.0' 30 | data_env_var = config['data_env_var'] # environment variable used by code to locate data, e.g. 
'TEXTMORPH_DATA' 31 | data_dir = os.environ[data_env_var] 32 | 33 | my_uid = subprocess.check_output(['echo', '$UID']).strip() 34 | 35 | docker_args = ["--net host", # access to the Internet 36 | "--publish 8888:8888", # only certain ports are exposed 37 | "--publish 6006:6006", 38 | "--publish 8080:8080", 39 | "--ipc=host", 40 | "--rm", 41 | "--volume {}:/data".format(data_dir), 42 | "--volume {}:/code".format(repo_dir), 43 | "--env {}=/data".format(data_env_var), 44 | "--env PYTHONPATH=/code", 45 | "--env NLTK_DATA=/data/nltk", 46 | "--env CUDA_VISIBLE_DEVICES={}".format(args.gpu), 47 | "--workdir /code"] 48 | 49 | # interactive mode 50 | if args.command is None: 51 | docker_args.append('--interactive') 52 | docker_args.append('--tty') 53 | args.command = '/bin/bash' 54 | 55 | if not args.root: 56 | docker_args.append('--user={}'.format(my_uid)) 57 | 58 | if args.gpu == '': 59 | # run on CPU 60 | docker = 'docker' 61 | else: 62 | # run on GPU 63 | docker = 'nvidia-docker' 64 | 65 | pull_cmd = "docker pull {}".format(image) 66 | 67 | run_cmd = '{docker} run {options} {image} {command}'.format(docker=docker, 68 | options=' '.join(docker_args), 69 | image=image, 70 | command=args.command) 71 | print 'Data directory: {}'.format(data_dir) 72 | print 'Command to run inside Docker: {}'.format(args.command) 73 | 74 | print pull_cmd 75 | print run_cmd 76 | if not args.debug: 77 | subprocess.call(pull_cmd, shell=True) 78 | subprocess.call(run_cmd, shell=True) 79 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/multilayered_decoder_cell.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import torch 4 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 5 | 6 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 7 | from gtd.ml.torch.recurrent import tile_state, gated_update 8 | from gtd.ml.torch.utils import GPUVariable, try_gpu 9 | 10 | 11 | class MultilayeredRNNState(namedtuple('MultilayeredRNNState', ['hs', 'cs']), RNNState): 12 | pass 13 | 14 | 15 | class MultilayeredRNNInput(namedtuple('MultilayeredRNNInput', ['x', 'agenda']), RNNInput): 16 | pass 17 | 18 | 19 | class MultilayeredDecoderCell(DecoderCell): 20 | def __init__(self, token_embedder, hidden_dim, input_dim, agenda_dim, num_layers): 21 | super(MultilayeredDecoderCell, self).__init__() 22 | self.linear = Linear(hidden_dim, input_dim) 23 | self.h0 = Parameter(torch.zeros(hidden_dim)) 24 | self.c0 = Parameter(torch.zeros(hidden_dim)) 25 | self.softmax = Softmax() 26 | self.token_embedder = token_embedder 27 | self.num_layers = num_layers 28 | 29 | self.rnn_cells = [] 30 | for layer in range(num_layers): 31 | in_dim = (input_dim + agenda_dim) if layer == 0 else hidden_dim # inputs to first layer are word vectors 32 | out_dim = hidden_dim 33 | rnn_cell = LSTMCell(in_dim, out_dim) 34 | self.add_module('decoder_layer_{}'.format(layer), rnn_cell) 35 | self.rnn_cells.append(rnn_cell) 36 | 37 | @property 38 | def rnn_state_type(self): 39 | return MultilayeredRNNState 40 | 41 | @property 42 | def rnn_input_type(self): 43 | return MultilayeredRNNInput 44 | 45 | def initialize(self, batch_size): 46 | h = tile_state(self.h0, batch_size) 47 | c = tile_state(self.c0, batch_size) 48 | return MultilayeredRNNState([h] * self.num_layers, [c] * self.num_layers) 49 | 50 | def forward(self, rnn_state, rnn_input, advance): 51 | x = torch.cat([rnn_input.x, 
rnn_input.agenda], 1) 52 | hs, cs = [], [] 53 | for layer in range(self.num_layers): 54 | rnn_cell = self.rnn_cells[layer] 55 | 56 | # collect the h, c belonging to the previous time-step at the corresponding depth 57 | h_prev_t, c_prev_t = rnn_state.hs[layer], rnn_state.cs[layer] 58 | 59 | # forward pass and masking 60 | h, c = rnn_cell(x, (h_prev_t, c_prev_t)) 61 | h = gated_update(h_prev_t, h, advance) 62 | c = gated_update(c_prev_t, c, advance) 63 | hs.append(h) 64 | cs.append(c) 65 | 66 | if layer == 0: 67 | x = h # no skip connection on the first layer 68 | else: 69 | x = x + h 70 | 71 | query = self.linear(x) 72 | word_vocab = self.token_embedder.vocab 73 | word_embeds = self.token_embedder.embeds 74 | vocab_logits = torch.mm(query, word_embeds.t()) # (batch_size, vocab_size) 75 | vocab_probs = self.softmax(vocab_logits) 76 | 77 | rnn_state = MultilayeredRNNState(hs, cs) 78 | 79 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/alignments.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from itertools import izip 3 | 4 | import numpy as np 5 | import torch 6 | from gtd.ml.torch.utils import GPUVariable 7 | 8 | from gtd.ml.torch.seq_batch import SequenceBatch 9 | 10 | 11 | class Alignments(object): 12 | """ 13 | Attributes: 14 | indices (Variable): of shape (batch_size, max_target_seq_length, max_alignments) 15 | mask (Variable): of shape (batch_size, max_target_seq_length, max_alignments) 16 | 17 | max_alignments is always at least 1, so that indices and mask do not have a dimension of 0 (which confuses 18 | downstream Torch ops) 19 | """ 20 | def __init__(self, source_words, target_words): 21 | """Represent alignments as a Tensor. 22 | 23 | Args: 24 | source_words (list[list[unicode]]): batch of source sequences 25 | target_words (list[list[unicode]]): batch of target sequences 26 | """ 27 | assert len(source_words) == len(target_words) 28 | # compute alignments 29 | alignments_batch = [self._align(s, t) for s, t in izip(source_words, target_words)] 30 | 31 | # compute dimensions of alignment tensor 32 | batch_size = len(alignments_batch) 33 | max_target_seq_length = 0 34 | max_alignments = 1 # make this dimension at least 1, so that we don't get a tensor with no entries 35 | for alignments in alignments_batch: 36 | max_target_seq_length = max(max_target_seq_length, len(alignments)) 37 | for align in alignments: 38 | max_alignments = max(max_alignments, len(align)) 39 | 40 | indices = -1 * np.ones((batch_size, max_target_seq_length, max_alignments), dtype=np.int64) 41 | # filled with -1's for padding. 42 | # int64 gets converted into torch.LongTensor 43 | 44 | for i, alignments in enumerate(alignments_batch): 45 | for j, align in enumerate(alignments): 46 | for k, idx in enumerate(align): 47 | indices[i, j, k] = idx 48 | 49 | mask = (indices != -1).astype(np.float32) 50 | indices[indices == -1] = 0 51 | self._indices = GPUVariable(torch.from_numpy(indices)) 52 | self._mask = GPUVariable(torch.from_numpy(mask)) 53 | 54 | @property 55 | def indices(self): 56 | return self._indices 57 | 58 | @property 59 | def mask(self): 60 | return self._mask 61 | 62 | @classmethod 63 | def _align(self, source_seq, target_seq): 64 | """For each target word, give its positions in the source sequence. 
65 | 66 | Args: 67 | source_seq (list[unicode]) 68 | target_seq (list[unicode]) 69 | 70 | Returns: 71 | alignments (list[list[int]]): alignments[i] is an ordered list of the indices where target_seq[i] 72 | appears in source_seq. 73 | """ 74 | alignments_dict = defaultdict(list) 75 | for i, word in enumerate(source_seq): 76 | alignments_dict[word].append(i) 77 | 78 | alignments = [] 79 | for word in target_seq: 80 | alignments.append(alignments_dict[word]) 81 | 82 | return alignments 83 | 84 | def split(self): 85 | """Split alignments object into per-time-step alignments. 86 | 87 | Returns: 88 | list[SequenceBatch]: where each element has shape (batch_size, max_alignments) 89 | """ 90 | indices_list = [v.squeeze(1) for v in self.indices.split(1, dim=1)] 91 | mask_list = [v.squeeze(1) for v in self.mask.split(1, dim=1)] 92 | return [SequenceBatch(i, m) for i, m in izip(indices_list, mask_list)] -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from gtd.ml.torch.attention import Attention 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | from gtd.ml.torch.utils import GPUVariable 7 | from gtd.ml.torch.utils import assert_tensor_equal 8 | 9 | 10 | class TestAttention(object): 11 | 12 | def test_forward(self): 13 | float_tensor = lambda arr: torch.FloatTensor(arr) 14 | float_tensor_var = lambda arr: GPUVariable(torch.FloatTensor(arr)) 15 | 16 | batch_size, num_cells = 5, 2 17 | memory_dim, query_dim, attn_dim = 4, 3, 2 18 | 19 | memory_transform = np.array([ # Wh: (memory_dim x attn_dim) 20 | [.1, .5], 21 | [.2, .6], 22 | [.3, .7], 23 | [.4, .8], 24 | ]) 25 | query_transform = np.array([ # Ws: (query_dim x attn_dim) 26 | [.3, .4], 27 | [.2, .5], 28 | [.1, .6], 29 | ]) 30 | v_transform = np.array([ # v: (attn_dim x 1) 31 | [.1], 32 | [.8], 33 | ]) 34 | 35 | mem_values = np.array([ # (batch_size x num_cells x memory_dim) 36 | [ 37 | [.1, .2, .3, .4], 38 | [.4, .5, .6, .7], 39 | ], 40 | [ 41 | [.2, .3, .4, .5], 42 | [.6, .7, .8, .9], 43 | ], 44 | [ 45 | [.3, .4, .5, .6], 46 | [.7, .8, .9, .1], 47 | ], 48 | [ 49 | [-8, -9, -10, -11], 50 | [-12, -13, -14, -15], 51 | ], 52 | [ 53 | [8, 9, 10, 11], 54 | [12, 13, 14, 15], 55 | ] 56 | ]) 57 | mem_values = float_tensor_var(mem_values) 58 | mem_mask = np.array([ 59 | [1, 0], 60 | [1, 1], 61 | [1, 0], 62 | [0, 0], 63 | [0, 1], 64 | ]) 65 | mem_mask = float_tensor_var(mem_mask) 66 | memory_cells = SequenceBatch(values=mem_values, mask=mem_mask) 67 | query = np.array([ # (batch_size x query_dim) 68 | [.1, .2, .3], 69 | [.4, .5, .6], 70 | [.7, .8, .9], 71 | [10, 11, 12], 72 | [13, 14, 15], 73 | ]) 74 | query = float_tensor_var(query) 75 | 76 | # compute manually 77 | # Et = np.array([ [ 0.65388812, 0.81788159], 78 | # [ 0.81039669, 0.87306204], 79 | # [ 0.86236411, 0.86977563]]) 80 | manual_weights = np.array([[ 1., 0.], 81 | [ 0.4843, 0.5156], 82 | [ 1., 0.], 83 | [0, 0], 84 | [0, 1.], 85 | ]) 86 | manual_context = np.array([[ 0.1, 0.2, 0.3, 0.4], 87 | [ 0.4062, 0.5062, 0.6062, 0.7062], 88 | [ 0.3, 0.4, 0.5, 0.6], 89 | [0, 0, 0, 0], 90 | [12, 13, 14, 15], 91 | ]) 92 | 93 | # compute with module 94 | attn = Attention(memory_dim, query_dim, attn_dim) 95 | attn.memory_transform.data.set_(float_tensor(memory_transform)) 96 | attn.query_transform.data.set_(float_tensor(query_transform)) 97 | attn.v_transform.data.set_(float_tensor(v_transform)) 
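The hand-computed Et values in the comment above can be reproduced outside the module. A standalone numpy sketch for the second example in the batch (the only one with both memory cells unmasked):

    import numpy as np

    Wh = np.array([[.1, .5], [.2, .6], [.3, .7], [.4, .8]])
    Ws = np.array([[.3, .4], [.2, .5], [.1, .6]])
    v = np.array([[.1], [.8]])
    H = np.array([[.2, .3, .4, .5], [.6, .7, .8, .9]])  # memory cells of example 2
    s = np.array([.4, .5, .6])                          # query of example 2

    E = np.tanh(H.dot(Wh) + s.dot(Ws)).dot(v).ravel()   # [0.8104, 0.8731], matching Et above
    w = np.exp(E) / np.exp(E).sum()                     # softmax -> approx [0.4843, 0.5156]
    context = w.dot(H)                                  # [0.4062, 0.5062, 0.6062, 0.7062]
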
98 | 99 | attn_out = attn(memory_cells, query) 100 | assert_tensor_equal(attn_out.weights, manual_weights, decimal=4) 101 | assert_tensor_equal(attn_out.context, manual_context, decimal=4) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/token_embedder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from gtd.ml.torch.utils import GPUVariable 3 | from torch.nn import Embedding, Module 4 | 5 | from gtd.ml.torch.seq_batch import SequenceBatch 6 | 7 | 8 | class TokenEmbedder(Module): 9 | """ 10 | Attributes: 11 | vocab (WordVocab) 12 | embeds (Variable): of shape (vocab_size, embed_dim) 13 | embed_dim (int) 14 | """ 15 | 16 | def __init__(self, word_embeddings, trainable=True): 17 | """Create TokenEmbedder. 18 | 19 | Args: 20 | word_embeddings (WordEmbeddings) 21 | trainable (bool): if False, the embedding array will not see 22 | gradient steps 23 | """ 24 | super(TokenEmbedder, self).__init__() 25 | 26 | arr = word_embeddings.array # np.ndarray 27 | vocab_size, embed_dim = arr.shape 28 | 29 | assert vocab_size == len(word_embeddings.vocab) 30 | self.vocab = word_embeddings.vocab 31 | self.embed_dim = embed_dim 32 | 33 | # create Embedding Module 34 | vocab_size, embed_dim = arr.shape 35 | self._embedding = TrainFlagEmbedding( 36 | vocab_size, embed_dim, arr, trainable=trainable) 37 | 38 | @property 39 | def embeds(self): 40 | """Return Variable of shape (vocab_size, embed_dim).""" 41 | return self._embedding.weight 42 | 43 | def embed_indices(self, indices): 44 | """Embed array of indices. 45 | 46 | Args: 47 | indices (Variable[LongTensor]): of shape (X1, X2) or (X1) 48 | 49 | Returns: 50 | embeds (Variable[FloatTensor]): of shape (X1, X2, embed_dim) or (X1, embed_dim) 51 | """ 52 | return self._embedding(indices) 53 | 54 | def embed_seq_batch(self, seq_batch): 55 | """Embed elements of a SequenceBatch. 56 | 57 | Args: 58 | seq_batch (SequenceBatch) 59 | 60 | Returns: 61 | SequenceBatch 62 | """ 63 | return SequenceBatch(self._embedding(seq_batch.values), seq_batch.mask) 64 | 65 | def embed_tokens(self, tokens): 66 | """Embed list of tokens. 67 | 68 | Args: 69 | tokens (list[unicode]) 70 | 71 | Returns: 72 | embeds (Variable[FloatTensor]): of shape (len(tokens), embed_dim) 73 | """ 74 | vocab = self.vocab 75 | indices = GPUVariable(torch.LongTensor([vocab.word2index(t) for t in tokens])) 76 | return self._embedding(indices) 77 | 78 | 79 | class TrainFlagEmbedding(Module): 80 | """Small wrapper around PyTorch Embedding object. Exports a trainable 81 | flag, which allows you to fix the weights matrix. Obeys same interface as 82 | PyTorch Embedding object, except for extra constructor arguments. 83 | """ 84 | 85 | def __init__(self, num_embeddings, embedding_dim, 86 | initial_embeddings, **kwargs): 87 | """Constructs TrainFlagEmbedding with embeddings initialized with 88 | initial_embeddings. 89 | 90 | Args: 91 | num_embeddings (int) 92 | embedding_dim (int) 93 | initial_embeddings (np.array): (num_embeddings, embedding_dim) 94 | trainable (bool): if False, weights matrix will not change. 95 | (default True) 96 | kwargs: all other supported keywords in torch.nn.Embeddings. 
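A short usage sketch of the TokenEmbedder above, following the same Bunch(vocab=..., array=...) construction that the encoder tests use (vocabulary and embedding values here are illustrative):

    import numpy as np
    from gtd.ml.vocab import SimpleVocab
    from gtd.utils import Bunch

    vocab = SimpleVocab(['a', 'b', 'c'])
    array = np.eye(3, dtype=np.float32)              # one embedding row per vocab word
    embedder = TokenEmbedder(Bunch(vocab=vocab, array=array))
    embeds = embedder.embed_tokens(['c', 'a'])       # Variable of shape (2, 3)
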
97 | """ 98 | super(TrainFlagEmbedding, self).__init__() 99 | trainable = kwargs.pop("trainable", True) 100 | self._trainable = trainable 101 | if trainable: 102 | embedding = Embedding( 103 | num_embeddings, embedding_dim, **kwargs) 104 | embedding.weight.data.set_( 105 | torch.from_numpy(initial_embeddings)) 106 | self._embedding = embedding 107 | self._weight = embedding.weight 108 | else: 109 | self._weight = GPUVariable( 110 | torch.from_numpy(initial_embeddings)) 111 | 112 | @property 113 | def weight(self): 114 | return self._weight 115 | 116 | def forward(self, index): 117 | """Looks up a batch of indices. 118 | 119 | Args: 120 | index (Variable[LongTensor]): (batch, indices per batch) 121 | 122 | Returns: 123 | Tensor: (batch, indices per batch, embedding_dim) 124 | """ 125 | if self._trainable: 126 | return self._embedding(index) 127 | else: 128 | batch, num_indices = index.size() 129 | flattened_index = index.view(batch * num_indices) 130 | embeddings = torch.index_select( 131 | self._weight, 0, flattened_index) 132 | return embeddings.view(batch, num_indices, -1) 133 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/checkpoints.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | from os import listdir 3 | import os 4 | from os.path import join 5 | 6 | import torch 7 | 8 | import gtd 9 | from gtd.ml.torch.utils import RandomState 10 | 11 | 12 | class TrainState(object): 13 | def __init__(self, model, optimizer, train_steps, random_state, max_grad_norm): 14 | """Construct a snapshot of training state. 15 | 16 | Args: 17 | model (Module) 18 | optimizer (Optimizer) 19 | train_steps (int) 20 | random_state (RandomState) 21 | max_grad_norm (float): used for gradient clipping 22 | """ 23 | self.model = model 24 | self.optimizer = optimizer 25 | self.train_steps = train_steps 26 | self.random_state = random_state 27 | self.max_grad_norm = max_grad_norm 28 | 29 | def increment_train_steps(self): 30 | self.train_steps += 1 31 | 32 | def track_grad_norms(self, grad_norm): 33 | # we will clip grad norm to be at most 2x the norm of anything we've tracked so far 34 | self.max_grad_norm = max(self.max_grad_norm, 2 * grad_norm) 35 | 36 | def save(self, path): 37 | gtd.io.makedirs(path) 38 | 39 | # Store the latest random state 40 | self.random_state = RandomState() 41 | 42 | # save model 43 | torch.save(self.model.state_dict(), join(path, 'model')) 44 | torch.save(self.optimizer.state_dict(), join(path, 'optimizer')) 45 | 46 | # pickle remaining attributes 47 | d = {attr: getattr(self, attr) for attr in ['train_steps', 'random_state', 'max_grad_norm']} 48 | with open(join(path, 'metadata.p'), 'w') as f: 49 | pickle.dump(d, f) 50 | 51 | @classmethod 52 | def load(cls, path, model, optimizer): 53 | with open(join(path, 'metadata.p'), 'r') as f: 54 | d = pickle.load(f) 55 | 56 | # load model 57 | optimizer.load_state_dict(torch.load(join(path, 'optimizer'))) 58 | model.load_state_dict(torch.load(join(path, 'model'))) 59 | train_state = TrainState(model=model, optimizer=optimizer, **d) 60 | return train_state 61 | 62 | @classmethod 63 | def initialize(cls, model, optimizer): 64 | train_steps = 0 65 | max_grad_norm = 0 66 | random_state = RandomState() 67 | return TrainState(model=model, optimizer=optimizer, train_steps=train_steps, 68 | random_state=random_state, max_grad_norm=max_grad_norm) 69 | 70 | 71 | class Checkpoints(object): 72 | def __init__(self, 
checkpoints_dir): 73 | self._path = checkpoints_dir 74 | 75 | @property 76 | def checkpoint_numbers(self): 77 | """Return the train steps at which checkpoints were saved (sorted ascending).""" 78 | dirs = [d for d in listdir(self._path) if d.endswith('.checkpoint')] 79 | return sorted([int(d[:-11]) for d in dirs]) # '.checkpoint' is 11 characters 80 | 81 | @property 82 | def latest_checkpoint_number(self): 83 | """Return the train_steps of the latest saved checkpoint. 84 | 85 | If no checkpoints, return None. 86 | """ 87 | nums = self.checkpoint_numbers 88 | if len(nums) == 0: 89 | return None 90 | else: 91 | return max(nums) 92 | 93 | def load(self, train_steps, model, optimizer): 94 | """Load the checkpoint for a particular training step. 95 | 96 | Args: 97 | model (Module) 98 | optimizer (Optimizer) 99 | 100 | Returns: 101 | TrainState 102 | """ 103 | ckpt_path = join(self._path, '{}.checkpoint'.format(train_steps)) 104 | if not os.path.exists(ckpt_path): 105 | raise ValueError('Checkpoint #{} does not exist.'.format(train_steps)) 106 | return TrainState.load(ckpt_path, model, optimizer) 107 | 108 | def save(self, train_state): 109 | """Save TrainState.""" 110 | ckpt_path = join(self._path, '{}.checkpoint'.format(train_state.train_steps)) 111 | train_state.save(ckpt_path) 112 | 113 | def load_latest(self, model, optimizer): 114 | """Load the latest checkpoint. 115 | 116 | If there are no checkpoints, return a freshly initialized Checkpoint. 117 | 118 | Args: 119 | model (Module) 120 | optimizer (Optimizer) 121 | 122 | Returns: 123 | TrainState 124 | """ 125 | ckpt_num = self.latest_checkpoint_number 126 | if ckpt_num is None: 127 | print 'No checkpoint to reload. Initializing fresh.' 128 | return TrainState.initialize(model, optimizer) 129 | else: 130 | train_state = self.load(self.latest_checkpoint_number, model, optimizer) 131 | print 'Reloaded checkpoint #{}'.format(ckpt_num) 132 | return train_state -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/attention.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import math 4 | import torch 5 | from gtd.ml.torch.utils import GPUVariable 6 | from torch.nn import Parameter 7 | from torch.nn import Softmax, Tanh, Module 8 | 9 | from gtd.ml.torch.utils import conditional, NamedTupleLike 10 | 11 | 12 | class AttentionOutput(namedtuple('AttentionOutput', ['weights', 'context']), NamedTupleLike): 13 | pass 14 | """ 15 | Attributes: 16 | weights (Variable): of shape (batch_size, num_cells) 17 | context (Variable): of shape (batch_size, memory_dim) 18 | """ 19 | 20 | class DummyAttention(Module): 21 | def __init__(self, memory_dim, query_dim, attn_dim): 22 | super(DummyAttention, self).__init__() 23 | self.memory_dim = memory_dim 24 | self.query_dim = query_dim 25 | self.attn_dim = attn_dim 26 | 27 | def forward(self, memory_cells, query): 28 | batch_size, num_cells = memory_cells.mask.size() 29 | weights = GPUVariable(torch.zeros(batch_size, num_cells)) 30 | context = GPUVariable(torch.zeros(batch_size, self.memory_dim)) 31 | return AttentionOutput(weights=weights, context=context) 32 | 33 | 34 | 35 | class Attention(Module): 36 | def __init__(self, memory_dim, query_dim, attn_dim): 37 | super(Attention, self).__init__() 38 | self.tanh = Tanh() 39 | self.softmax = Softmax() 40 | 41 | self.memory_dim = memory_dim 42 | self.query_dim = query_dim 43 | self.attn_dim = attn_dim 44 | 45 | self.memory_transform 
= Parameter(self._initialize_weight_matrix(memory_dim, attn_dim)) # Wh 46 | self.query_transform = Parameter(self._initialize_weight_matrix(query_dim, attn_dim)) # Ws 47 | self.v_transform = Parameter(self._initialize_weight_matrix(attn_dim, 1)) # v 48 | 49 | @classmethod 50 | def _initialize_weight_matrix(cls, in_dim, out_dim): 51 | stdv = 1. / math.sqrt(in_dim) 52 | m = torch.ones(in_dim, out_dim) 53 | m.uniform_(-stdv, stdv) 54 | return m 55 | 56 | def forward(self, memory_cells, query): 57 | """Generates a density over a set of elements w.r.t. the query vector. 58 | 59 | Et(i) = tanh(Hi * Wh + St * Ws) * v 60 | At = softmax(Et) 61 | 62 | Dimensions: 63 | Hi: (batch_size x memory_dim) 64 | St: (batch_size x query_dim) 65 | Wh: (memory_dim x attn_dim) 66 | Ws: (query_dim x attn_dim) 67 | v: (attn_dim x 1) 68 | -- 69 | tanh( Hi * Wh + St * Ws ): (batch_size x attn_dim) 70 | tanh( Hi * Wh + St * Ws ) * v: (batch_size x 1) 71 | At = softmax(Et): (batch_size x num_cells) 72 | 73 | Args: 74 | memory_cells (SequenceBatch): (batch_size x num_cells x memory_dim) 75 | query (torch.Variable): St (batch_size x query_dim) 76 | 77 | Returns: 78 | Variable: (batch_size x num_cells) array 79 | """ 80 | transformed_query = torch.mm(query, self.query_transform) # (batch_size, attn_dim) 81 | 82 | batch_size, num_cells = memory_cells.mask.size() 83 | memory_cells_ = torch.transpose(memory_cells.values, 0, 1) # (num_cells, batch_size, memory_dim) 84 | expanded_transformed_query = transformed_query.expand(num_cells, batch_size, self.attn_dim) 85 | expanded_memory_transform = self.memory_transform.expand(num_cells, self.memory_dim, self.attn_dim) 86 | expanded_v_transform = self.v_transform.expand(num_cells, self.attn_dim, 1) 87 | 88 | # (num_cells, batch_size, attn_dim) 89 | attn_embeds = torch.bmm(memory_cells_, expanded_memory_transform) + expanded_transformed_query 90 | attn_embeds = self.tanh(attn_embeds) 91 | attn_embeds = torch.bmm(attn_embeds, expanded_v_transform) # (num_cells, batch_size, 1) 92 | logits = torch.transpose(attn_embeds.squeeze(2), 0, 1) 93 | 94 | mask = memory_cells.mask 95 | 96 | # no_cells is a FloatTensor with shape (batch_size, num_cells) 97 | # no_cells[i, j] = 1 if example i has NO memory cells, 0 otherwise 98 | no_cells = (1 - mask).prod(1).expand_as(mask) 99 | # TODO(kelvin): check for numerical stability. 
Product of 1's does not necessarily equal 1 exactly, which we need 100 | 101 | suppress = GPUVariable(torch.zeros(*mask.size())) 102 | suppress[mask == 0] = float('-inf') # send the logit of non-cells to -infinity 103 | suppress[no_cells == 1] = 0.0 # but if an entire row has no cells, just leave the cells alone 104 | 105 | logits = logits + suppress 106 | # -inf + anything = -inf 107 | 108 | # compute normalized weights 109 | weights = self.softmax(logits) # (batch_size, num_cells) 110 | 111 | # if a given row has no memory cells, weights should be all zeros 112 | all_zeros = GPUVariable(torch.zeros(*mask.size())) 113 | weights = conditional(no_cells, all_zeros, weights) 114 | 115 | context = torch.bmm(weights.unsqueeze(1), memory_cells.values) # (batch_size, 1, memory_dim) 116 | context = context.squeeze(1) # (batch_size, memory_dim) 117 | return AttentionOutput(weights=weights, context=context) 118 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/training_run.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import socket 4 | from abc import ABCMeta, abstractmethod 5 | from collections import Mapping 6 | from os.path import join 7 | 8 | from git import Repo 9 | from tensorboard_logger import tensorboard_logger 10 | 11 | from gtd.io import IntegerDirectories, Workspace 12 | from gtd.log import SyncedMetadata 13 | from gtd.utils import Config, cached_property 14 | 15 | 16 | class TrainingRunWorkspace(Workspace): 17 | def __init__(self, root): 18 | super(TrainingRunWorkspace, self).__init__(root) 19 | for attr in ['config', 'metadata']: 20 | self.add_file(attr, '{}.txt'.format(attr)) 21 | for attr in ['git_patches', 'tensorboard']: 22 | self.add_dir(attr, attr) 23 | 24 | 25 | class TrainingRun(object): 26 | __metaclass__ = ABCMeta 27 | 28 | def __init__(self, config, save_dir): 29 | """Create TrainingRun. 
30 | 31 | Args: 32 | config (Config) 33 | save_dir (str) 34 | """ 35 | self._config = config 36 | self._workspace = TrainingRunWorkspace(save_dir) 37 | self.metadata['host'] = socket.gethostname() 38 | 39 | @abstractmethod 40 | def train(self): 41 | raise NotImplementedError 42 | 43 | @property 44 | def config(self): 45 | return self._config 46 | 47 | @property 48 | def workspace(self): 49 | return self._workspace 50 | 51 | @cached_property 52 | def metadata(self): 53 | return SyncedMetadata(self.workspace.metadata, fmt='json') 54 | 55 | @cached_property 56 | def tb_logger(self): 57 | return tensorboard_logger.Logger(self.workspace.tensorboard) 58 | 59 | def record_commit(self, src_dir): 60 | repo = Repo(src_dir) 61 | 62 | if 'dirty_repo' in self.metadata or 'commit' in self.metadata: 63 | raise RuntimeError('A commit has already been recorded.') 64 | 65 | self.metadata['dirty_repo'] = repo.is_dirty() 66 | self.metadata['commit'] = repo.head.object.hexsha.encode('utf-8') 67 | 68 | def dump_diff(self, src_dir): 69 | repo = Repo(src_dir) 70 | diffindex = repo.head.commit.diff(None, create_patch=True) 71 | if len(diffindex) > 0: 72 | print 'uncomitted changes being stored as patches' 73 | patch_strings = [unicode(diff) for diff in diffindex] 74 | patch_filenames = [unicode(diff.a_rawpath).replace(u'/', u'-').replace(u'.', u'-') + u'.patch' for diff in 75 | diffindex] 76 | for strin, filename in zip(patch_strings, patch_filenames): 77 | file_out = join(self.workspace.git_patches, filename) 78 | with io.open(file_out, 'w', encoding='utf-8') as fout: 79 | fout.writelines(strin) 80 | else: 81 | print 'no changes to diff. ignoring git diff.' 82 | 83 | def match_commit(self, src_dir): 84 | """Check that the current commit matches the recorded commit for this run. 85 | 86 | Raises an error if commits don't match, or if there is dirty state. 87 | 88 | Args: 89 | src_dir (str): path to the Git repository 90 | """ 91 | if self.metadata['dirty_repo']: 92 | raise EnvironmentError('Working directory was dirty when commit was recorded.') 93 | 94 | repo = Repo(src_dir) 95 | if repo.is_dirty(): 96 | raise EnvironmentError('Current working directory is dirty.') 97 | 98 | current_commit = repo.head.object.hexsha.encode('utf-8') 99 | run_commit = self.metadata['commit'] 100 | if current_commit != run_commit: 101 | raise EnvironmentError("Commits don't match.\nCurrent: {}\nRecorded: {}".format(current_commit, run_commit)) 102 | 103 | 104 | class TrainingRuns(Mapping): 105 | """A map from integers to TrainingRuns.""" 106 | 107 | def __init__(self, root_dir, src_dir, run_factory, check_commit=True): 108 | """Create TrainingRuns object. 109 | 110 | Args: 111 | root_dir (str): directory where all training run data will be stored 112 | src_dir (str): a Git repository path (used to check commits) 113 | run_factory (Callable[[Config, str], TrainingRun]): a Callable, which takes a Config and a save_dir 114 | as arguments, and creates a new TrainingRun. 115 | check_commit (bool): if True, checks that current working directory is on same commit as when the run 116 | was originally created. 
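record_commit, dump_diff and match_commit above implement a small reproducibility protocol on top of GitPython. A sketch using only the GitPython calls already appearing in this file (the repository path is hypothetical):

    from git import Repo

    repo = Repo('/path/to/my/repo')        # hypothetical checkout
    recorded = repo.head.object.hexsha     # record_commit() stores this hash...
    was_dirty = repo.is_dirty()            # ...and this flag
    # match_commit() later raises unless was_dirty is False, the current
    # checkout is clean, and the current hexsha equals `recorded`.
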
117 | """ 118 | self._int_dirs = IntegerDirectories(root_dir) 119 | self._src_dir = src_dir 120 | self._run_factory = run_factory 121 | self._check_commit = check_commit 122 | 123 | def _config_path(self, save_dir): 124 | return join(save_dir, 'config.txt') 125 | 126 | def __getitem__(self, i): 127 | """Reload an existing TrainingRun.""" 128 | save_dir = self._int_dirs[i] 129 | config = Config.from_file(self._config_path(save_dir)) 130 | run = self._run_factory(config, save_dir) 131 | if self._check_commit: 132 | run.match_commit(self._src_dir) 133 | 134 | logging.info('Reloaded TrainingRun #{}'.format(i)) 135 | return run 136 | 137 | def new(self, config, name=None): 138 | """Create a new TrainingRun.""" 139 | print 'TrainingRun configuration:\n{}'.format(config) 140 | 141 | save_dir = self._int_dirs.new_dir(name=name) 142 | cfg_path = self._config_path(save_dir) 143 | config.to_file(cfg_path) # save the config 144 | run = self._run_factory(config, save_dir) 145 | run.record_commit(self._src_dir) 146 | run.dump_diff(self._src_dir) 147 | run.metadata['config'] = config._config_tree # save config in metadata, for programmatic access 148 | 149 | print 'New TrainingRun created at: {}'.format(run.workspace.root) 150 | return run 151 | 152 | def __iter__(self): 153 | return iter(self._int_dirs) 154 | 155 | def __len__(self): 156 | return len(self._int_dirs) 157 | 158 | def paths(self): 159 | return self._int_dirs.values() 160 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/training_run_viewer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import ABCMeta, abstractmethod 3 | from collections import OrderedDict 4 | 5 | import pwd 6 | from IPython.core.display import display, HTML 7 | from os.path import join, basename 8 | 9 | import os 10 | from os import listdir 11 | from prettytable import PrettyTable 12 | 13 | from gtd.chrono import verboserate 14 | from gtd.log import in_ipython, jupyter_no_margins, Metadata 15 | 16 | 17 | class TrainingRunViewer(object): 18 | def __init__(self, runs): 19 | """Construct TrainingRunViewer. 20 | 21 | Args: 22 | runs (gtd.ml.TrainingRuns) 23 | """ 24 | self._runs = runs 25 | self._renderers = OrderedDict() 26 | 27 | def add(self, name, renderer, post_processor=None): 28 | """Add a renderer. 29 | 30 | Args: 31 | name (unicode): name for the attribute 32 | renderer (Callable[str, object]): takes a run dir (absolute path) and returns something to print. 33 | post_processor (Callable[object, unicode]): takes the output of the renderer and returns a modified output. 34 | 35 | Returns: 36 | 37 | """ 38 | if post_processor: 39 | r = lambda path: post_processor(renderer(path)) 40 | else: 41 | r = renderer 42 | self._renderers[name] = r 43 | 44 | def view(self, select=lambda path: True): 45 | """View runs. 46 | 47 | Args: 48 | select (Callable[str, bool]): given a path to a run, returns True if we want to display the 49 | run, False otherwise. 
50 | """ 51 | field_names = self._renderers.keys() 52 | table = PrettyTable(field_names=field_names) 53 | types = OrderedDict((n, set()) for n in field_names) 54 | 55 | for i, path in verboserate(self._runs._int_dirs.items(), desc='Scanning runs.'): 56 | if not select(path): 57 | continue 58 | 59 | row = [] 60 | for render in self._renderers.values(): 61 | try: 62 | s = render(path) 63 | except: 64 | s = u'' 65 | row.append(s) 66 | 67 | # record types 68 | for name, elem in zip(field_names, row): 69 | types[name].add(type(elem)) 70 | 71 | table.add_row(row) 72 | 73 | self._print_table(table) 74 | 75 | # display types for each attribute 76 | type_table = PrettyTable(['attribute', 'types']) 77 | for name, type_set in types.iteritems(): 78 | type_table.add_row([name, ', '.join(t.__name__ for t in type_set)]) 79 | self._print_table(type_table) 80 | 81 | @classmethod 82 | def _print_table(cls, table): 83 | if in_ipython(): 84 | jupyter_no_margins() 85 | display(HTML(table.get_html_string())) 86 | else: 87 | print table 88 | 89 | 90 | class Renderer(object): 91 | __metaclass__ = ABCMeta 92 | 93 | @abstractmethod 94 | def __call__(self, path): 95 | """Render. 96 | 97 | Args: 98 | path (str): absolute path to a run directory 99 | 100 | Returns: 101 | object: value to be displayed in a pretty-printed table. Should implement __str__ and __unicode__. 102 | """ 103 | raise NotImplementedError 104 | 105 | 106 | # Some renderers below are just functions, for simplicity. 107 | 108 | 109 | class JSONSelector(Renderer): 110 | def __init__(self, file_path, json_keys): 111 | """Select a value in a JSON file, or a HOCON file. 112 | 113 | Args: 114 | file_path (str): path to the JSON file, relative to run dir root. 115 | json_keys (list[str]): path from the root of the JSON tree to the target attribute 116 | """ 117 | self.file_path = file_path 118 | self.json_keys = json_keys 119 | 120 | def __call__(self, path): 121 | full_path = join(path, self.file_path) 122 | try: 123 | # try loading as JSON 124 | with open(full_path, 'r') as f: 125 | x = json.load(f) 126 | except ValueError: 127 | # try loading as HOCON 128 | x = Metadata.from_file(full_path, fmt='hocon') 129 | 130 | for key in self.json_keys: 131 | x = x[key] 132 | 133 | return x 134 | 135 | 136 | class Commit(Renderer): 137 | def __init__(self): 138 | self._commit = JSONSelector('metadata.txt', ['commit']) 139 | self._dirty = JSONSelector('metadata.txt', ['dirty_repo']) 140 | 141 | def __call__(self, path): 142 | c = self._commit(path)[:8] 143 | d = ' (dirty)' if self._dirty(path) else '' 144 | return '{}{}'.format(c, d) 145 | 146 | 147 | class NumSteps(Renderer): 148 | def __init__(self): 149 | self.json_selector = JSONSelector('metadata.txt', ['steps']) 150 | 151 | def __call__(self, path): 152 | try: 153 | steps = self.json_selector(path) # try looking in JSON 154 | except: 155 | # if that fails, look at the largest checkpoint 156 | ckpt_nums = checkpoint_numbers(join(path, 'checkpoints')) 157 | steps = max(ckpt_nums) if ckpt_nums else 0 158 | return steps 159 | 160 | 161 | class Owner(Renderer): 162 | def __init__(self, user_ids): 163 | self.user_ids = user_ids 164 | 165 | def __call__(self, path): 166 | stat_info = os.stat(path) 167 | uid = stat_info.st_uid 168 | try: 169 | user = pwd.getpwuid(uid)[0] 170 | except: 171 | # sometimes no name is associated with the ID 172 | user = self.user_ids.get(uid, uid) 173 | 174 | return str(user) 175 | 176 | 177 | def checkpoint_numbers(checkpoints_dir): 178 | """Return the train steps at which checkpoints 
were saved (sorted ascending).""" 179 | dirs = [d for d in listdir(checkpoints_dir) if d.endswith('.checkpoint')] 180 | return sorted([int(d[:-11]) for d in dirs]) 181 | 182 | 183 | def run_name(path): 184 | return basename(path) 185 | 186 | 187 | def num_checkpoints(path): 188 | ckpt_nums = checkpoint_numbers(join(path, 'checkpoints')) 189 | return len(ckpt_nums) 190 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/test_framework.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from collections import Sequence, Mapping 3 | from itertools import izip 4 | 5 | import numpy as np 6 | import pytest 7 | import tensorflow as tf 8 | from keras.engine import Input 9 | from keras.layers import Dense 10 | from numpy.testing import assert_array_almost_equal 11 | 12 | from gtd.ml.tf.framework import Feedable, KerasModel 13 | from gtd.ml.tf.utils import guarantee_initialized_variables, clean_session 14 | from gtd.utils import Bunch 15 | 16 | 17 | @pytest.yield_fixture 18 | def clean_test_session(): 19 | with clean_session() as sess: 20 | yield sess 21 | 22 | 23 | def assert_array_collections_equal(correct, test, decimal=7): 24 | """Assert that two collections of numpy arrays have the same values. 25 | 26 | Collections can be either a Sequence or a Mapping. 27 | """ 28 | if type(correct) != type(test): 29 | raise ValueError('correct ({}) and test ({}) must have the same type.'.format(type(correct), type(test))) 30 | 31 | assert_equal = lambda c, t: assert_array_almost_equal(c, t, decimal=decimal) 32 | 33 | if isinstance(correct, Sequence): 34 | assert len(correct) == len(test) 35 | for c, t in izip(correct, test): 36 | assert_equal(c, t) 37 | elif isinstance(correct, Mapping): 38 | # same keys 39 | assert set(test.keys()) == set(correct.keys()) 40 | # same values 41 | for key in test: 42 | assert_equal(correct[key], test[key]) 43 | else: 44 | raise TypeError('Inputs must be of type Mapping or Sequence, not {}.'.format(type(correct))) 45 | 46 | 47 | class FeedableTester(object): 48 | """A template for testing Feedable classes. 49 | 50 | Subclass this class and implement all of its abstractmethods. 51 | 52 | NOTE: 53 | You must decorate the implementation of each abstractmethod with a @pytest.fixture decorator. 54 | See the `TestFeedable` class below for an example. 55 | """ 56 | @abstractmethod 57 | def model(self): 58 | """The Model to be tested.""" 59 | pass 60 | 61 | @abstractmethod 62 | def inputs(self): 63 | """Inputs to the model. 64 | 65 | Returns: 66 | (list, dict): an args, kwargs pair 67 | """ 68 | pass 69 | 70 | @classmethod 71 | def as_args_kwargs(cls, *args, **kwargs): 72 | return args, kwargs 73 | 74 | @abstractmethod 75 | def feed_dict(self): 76 | """Return the correct result of the model's `feed_dict` method.""" 77 | pass 78 | 79 | @abstractmethod 80 | def output_tensors(self): 81 | """Output tensors to be fetched. 82 | 83 | Returns: 84 | list[np.array] 85 | """ 86 | pass 87 | 88 | @abstractmethod 89 | def outputs(self): 90 | """Return the correct results of running model.compute(fetch=output_tensors, ...) 
91 | 92 | Returns: 93 | list[np.array] 94 | """ 95 | pass 96 | 97 | @pytest.mark.usefixtures('clean_test_session') 98 | def test_inputs_to_feed_dict(self, model, inputs, feed_dict): 99 | """Test for correct feed_dict.""" 100 | args, kwargs = inputs 101 | test_feed_dict = model.inputs_to_feed_dict(*args, **kwargs) 102 | assert_array_collections_equal(feed_dict, test_feed_dict) 103 | 104 | @pytest.mark.usefixtures('clean_test_session') 105 | def test_outputs(self, model, inputs, output_tensors, outputs): 106 | """Test for correct output.""" 107 | sess = tf.get_default_session() 108 | guarantee_initialized_variables(sess) 109 | args, kwargs = inputs 110 | test_outputs = model.compute(output_tensors, *args, **kwargs) 111 | assert_array_collections_equal(outputs, test_outputs, decimal=4) 112 | 113 | 114 | class KerasModelTester(FeedableTester): 115 | @pytest.fixture 116 | def output_tensors(self, model): 117 | return model.output_tensors 118 | 119 | @pytest.mark.usefixtures('clean_test_session') 120 | def test_placeholders(self, model, feed_dict): 121 | """Test that model.placeholders matches the keys of feed_dict.""" 122 | assert set(model.placeholders) == set(feed_dict.keys()) 123 | 124 | 125 | class FeedableExample(Feedable): 126 | def __init__(self): 127 | x = tf.placeholder(tf.float32, shape=[], name='x') 128 | y = tf.get_variable('y', shape=[], initializer=tf.constant_initializer(2.0)) 129 | z = x * y 130 | 131 | self.x = x 132 | self.y = y 133 | self.z = z 134 | 135 | def inputs_to_feed_dict(self, batch): 136 | return {self.x: batch.x} 137 | 138 | 139 | class TestFeedableExample(FeedableTester): 140 | @pytest.fixture 141 | def model(self): 142 | return FeedableExample() 143 | 144 | @pytest.fixture 145 | def inputs(self): 146 | return self.as_args_kwargs(Bunch(x=5.0)) 147 | 148 | @pytest.fixture 149 | def feed_dict(self, model): 150 | return {model.x: 5.0} 151 | 152 | @pytest.fixture 153 | def output_tensors(self, model): 154 | return [model.z] 155 | 156 | @pytest.fixture 157 | def outputs(self): 158 | return [10.0] 159 | 160 | 161 | class KerasLayersModelExample(KerasModel): 162 | """A Model that is defined using Keras layers from beginning to end.""" 163 | def __init__(self): 164 | x = Input([1]) 165 | y = np.array([[2.0]]) 166 | b = np.array([0.0]) 167 | mult = Dense(1, weights=(y, b)) 168 | z = mult(x) 169 | 170 | self.x = x 171 | self.mult = mult 172 | self.z = z 173 | 174 | @property 175 | def placeholders(self): 176 | return [self.x] 177 | 178 | def inputs_to_feed_dict(self, batch): 179 | return {self.x: np.array([[batch.x]])} 180 | 181 | @property 182 | def output_tensors(self): 183 | return [self.z] 184 | 185 | 186 | class TestKerasLayersModel(KerasModelTester): 187 | @pytest.fixture 188 | def model(self): 189 | return KerasLayersModelExample() 190 | 191 | @pytest.fixture 192 | def inputs(self): 193 | return self.as_args_kwargs(Bunch(x=5.0)) 194 | 195 | @pytest.fixture 196 | def feed_dict(self, model): 197 | return {model.x: 5.0} 198 | 199 | @pytest.fixture 200 | def outputs(self): 201 | return [10.0] -------------------------------------------------------------------------------- /third-party/gtd/gtd/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | import numpy as np 4 | from gtd.utils import memoize 5 | 6 | 7 | @memoize 8 | def get_spacy(): 9 | """ 10 | Loads the spaCy english processor. 11 | 12 | Tokenizing, Parsing, and NER are enabled. All other features are disabled. 
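The FeedableTester template above turns each abstract method into a pytest fixture, so testing a new model only requires subclassing and overriding fixtures, as TestFeedableExample does. The same pattern, reduced to a TensorFlow-free sketch:

    import pytest

    class AdderTester(object):
        # template: subclasses override the fixtures, the tests come for free
        @pytest.fixture
        def inputs(self):
            raise NotImplementedError

        def test_sum(self, inputs):
            assert sum(inputs) == 6

    class TestMyAdder(AdderTester):
        @pytest.fixture
        def inputs(self):
            return [1, 2, 3]
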
13 | 14 | Returns: 15 | A spaCy Language object for English 16 | """ 17 | logging.info('Loading spaCy...') 18 | import spacy.en 19 | nlp = spacy.en.English(tagger=False, parser=True, matcher=False) 20 | return nlp 21 | 22 | 23 | class NER(object): 24 | def __init__(self): 25 | self.processor = get_spacy() 26 | 27 | def __call__(self, text): 28 | """Given a unicode string, return a tuple of the named entities found inside.""" 29 | if not isinstance(text, unicode): 30 | text = unicode(text) 31 | doc = self.processor(text) 32 | return doc.ents 33 | 34 | 35 | class Trie(object): 36 | 37 | def __init__(self, token, parent, sink=False): 38 | self.token = token 39 | self.parent = parent 40 | self.sink = sink 41 | self.children = {} 42 | 43 | def __contains__(self, phrase): 44 | if phrase[0] == self.token: 45 | if len(phrase) == 1: 46 | # On our last word. Must be a sink to match. 47 | return self.sink 48 | else: 49 | # doesn't match 50 | return False 51 | 52 | suffix = phrase[1:] 53 | for child in self.children.values(): 54 | if suffix in child: 55 | return True 56 | 57 | def ancestors(self): 58 | if self.parent is None: 59 | return [] 60 | anc = self.parent.ancestors() 61 | anc.append(self.token) 62 | return anc 63 | 64 | 65 | class PhraseMatcher(object): 66 | def __init__(self, phrases): 67 | """Construct a phrase matcher. 68 | 69 | Args: 70 | phrases (List[Tuple[str]]): a list of phrases to match, where each phrase is a tuple of strings 71 | """ 72 | # construct Trie 73 | root = Trie('ROOT', None) 74 | for phrase in phrases: 75 | current = root 76 | for token in phrase: 77 | if token not in current.children: 78 | current.children[token] = Trie(token, current) 79 | current = current.children[token] 80 | current.sink = True # mark last node as a sink 81 | 82 | self.root = root 83 | self.phrases = phrases 84 | 85 | def has_phrase(self, phrase): 86 | """Check if a particular phrase is matched by the matcher. 87 | 88 | Args: 89 | phrase (tuple[str]) 90 | """ 91 | return ['ROOT'] + phrase in self.root 92 | 93 | def match(self, tokens): 94 | """A list of matches. 95 | 96 | Args: 97 | tokens (list[str]): a list of tokens 98 | 99 | Returns: 100 | list[tuple[str, int, int]]: A list of (match, start, end) triples. Each `match` is a tuple of tokens. 101 | `start` and `end` are word offsets. 
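A small usage sketch of the trie-backed matcher above (phrases chosen for illustration); matches come back as (phrase, start, end) triples with word offsets:

    matcher = PhraseMatcher([('new', 'york'), ('york',)])
    assert matcher.has_phrase(['new', 'york'])
    matches = matcher.match(['i', 'love', 'new', 'york'])
    # -> [(('new', 'york'), 2, 4), (('york',), 3, 4)]
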
102 | """ 103 | root = self.root 104 | candidates = [root] 105 | 106 | matches = [] 107 | for i, token in enumerate(tokens): 108 | 109 | # extend candidates or prune failed candidates 110 | new_candidates = [] 111 | for cand in candidates: 112 | if token in cand.children: 113 | new_candidates.append(cand.children[token]) # move to child 114 | candidates = new_candidates 115 | candidates.append(root) # always add root 116 | 117 | for cand in candidates: 118 | if cand.sink: 119 | match = tuple(cand.ancestors()) 120 | end = i + 1 121 | start = end - len(match) 122 | matches.append((match, start, end)) 123 | 124 | return matches 125 | 126 | 127 | # first_cap_re = re.compile('(.)([A-Z][a-z]+)') 128 | first_cap_re = re.compile('([^-_])([A-Z][a-z]+)') 129 | all_cap_re = re.compile('([a-z0-9])([A-Z])') 130 | 131 | 132 | def camel_to_snake_case(name): 133 | """Convert camelCase to snake_case (Python).""" 134 | s1 = first_cap_re.sub(r'\1_\2', name) 135 | return all_cap_re.sub(r'\1_\2', s1).lower() 136 | 137 | 138 | def longest_common_subsequence(X, Y): 139 | # https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Longest_common_subsequence#Computing_the_length_of_the_LCS 140 | 141 | def LCS(X, Y): 142 | m = len(X) 143 | n = len(Y) 144 | # An (m+1) times (n+1) matrix 145 | C = [[0] * (n + 1) for _ in range(m + 1)] 146 | for i in range(1, m + 1): 147 | for j in range(1, n + 1): 148 | if X[i - 1] == Y[j - 1]: 149 | C[i][j] = C[i - 1][j - 1] + 1 150 | else: 151 | C[i][j] = max(C[i][j - 1], C[i - 1][j]) 152 | return C 153 | 154 | def backTrack(C, X, Y, i, j): 155 | if i == 0 or j == 0: 156 | return [] 157 | elif X[i - 1] == Y[j - 1]: 158 | return backTrack(C, X, Y, i - 1, j - 1) + [X[i - 1]] 159 | else: 160 | if C[i][j - 1] > C[i - 1][j]: 161 | return backTrack(C, X, Y, i, j - 1) 162 | else: 163 | return backTrack(C, X, Y, i - 1, j) 164 | 165 | m = len(X) 166 | n = len(Y) 167 | C = LCS(X, Y) 168 | return backTrack(C, X, Y, m, n) 169 | 170 | 171 | def get_ngrams(s, n): 172 | """Get n-grams for s. 173 | 174 | >>> s = [1, 2, 3, 4] 175 | >>> get_ngrams(s, 2) 176 | [(1, 2), (2, 3), (3, 4)] 177 | >>> get_ngrams(s, 1) 178 | [(1,), (2,), (3,), (4,)] 179 | >>> get_ngrams(s, 4) 180 | [(1, 2, 3, 4)] 181 | """ 182 | assert n <= len(s) 183 | assert n >= 1 184 | return [tuple(s[k:k + n]) for k in range(len(s) + 1 - n)] 185 | 186 | 187 | def ngram_precision_recall(reference, candidate, n=None): 188 | if n is None: 189 | # Take the average over 1 through 4 grams. 190 | prs = [] 191 | for m in [1, 2, 3, 4]: 192 | prs.append(ngram_precision_recall(reference, candidate, m)) 193 | ps, rs = zip(*prs) 194 | return np.mean(ps), np.mean(rs) 195 | 196 | ref_set = set(get_ngrams(reference, n)) 197 | can_set = set(get_ngrams(candidate, n)) 198 | correct = float(len(ref_set & can_set)) 199 | rec = correct / len(ref_set) 200 | prec = correct / len(can_set) 201 | return prec, rec -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from contextlib import contextmanager 3 | 4 | import numpy as np 5 | from numpy.testing import assert_array_almost_equal 6 | from torch import _TensorBase, torch 7 | from torch.autograd import Variable 8 | 9 | from gtd.utils import chunks 10 | 11 | 12 | def conditional(b, x, y): 13 | """Conditional operator for PyTorch. 
14 | 15 | Args: 16 | b (FloatTensor): with values that are equal to 0 or 1 17 | x (FloatTensor): of same shape as b 18 | y (FloatTensor): of same shape as b 19 | 20 | Returns: 21 | z (FloatTensor): of same shape as b. z[i] = x[i] if b[i] == 1 else y[i] 22 | """ 23 | return b * x + (1 - b) * y 24 | 25 | 26 | def to_numpy(x): 27 | if isinstance(x, Variable): 28 | x = x.data # unwrap Variable 29 | 30 | if isinstance(x, _TensorBase): 31 | x = x.cpu().numpy() 32 | return x 33 | 34 | 35 | def assert_tensor_equal(x, y, decimal=6): 36 | assert_array_almost_equal(to_numpy(x), to_numpy(y), decimal=decimal) 37 | 38 | 39 | def expand_dims_for_broadcast(low_tensor, high_tensor): 40 | """Expand the dimensions of a lower-rank tensor, so that its rank matches that of a higher-rank tensor. 41 | 42 | This makes it possible to perform broadcast operations between low_tensor and high_tensor. 43 | 44 | Args: 45 | low_tensor (Tensor): lower-rank Tensor with shape [s_0, ..., s_p] 46 | high_tensor (Tensor): higher-rank Tensor with shape [s_0, ..., s_p, ..., s_n] 47 | 48 | Note that the shape of low_tensor must be a prefix of the shape of high_tensor. 49 | 50 | Returns: 51 | Tensor: the lower-rank tensor, but with shape expanded to be [s_0, ..., s_p, 1, 1, ..., 1] 52 | """ 53 | low_size, high_size = low_tensor.size(), high_tensor.size() 54 | low_rank, high_rank = len(low_size), len(high_size) 55 | 56 | # verify that low_tensor shape is prefix of high_tensor shape 57 | assert low_size == high_size[:low_rank] 58 | 59 | new_tensor = low_tensor 60 | for _ in range(high_rank - low_rank): 61 | new_tensor = torch.unsqueeze(new_tensor, len(new_tensor.size())) 62 | 63 | return new_tensor 64 | 65 | 66 | def is_binary(t): 67 | """Check if values of t are binary. 68 | 69 | Args: 70 | t (Tensor|Variable) 71 | 72 | Returns: 73 | bool 74 | """ 75 | if isinstance(t, Variable): 76 | t = t.data # convert Variable to Tensor 77 | 78 | binary = (t == 0) | (t == 1) # ByteTensor, should be all 1's 79 | all_binary = torch.prod(binary) # int, should be 1 80 | return all_binary == 1 81 | 82 | 83 | def similar_size_batches(examples, batch_size, size=lambda x: len(x.target_words)): 84 | """Create similar-sized batches of EditExamples. 85 | 86 | By default, elements with similar len('source_words') are batched together. 87 | See editor.py / EditExample. 
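expand_dims_for_broadcast above only appends singleton dimensions to the lower-rank tensor; a shape-level sketch:

    import torch

    low = torch.ones(3, 2)            # shape [3, 2]
    high = torch.ones(3, 2, 5, 7)     # shape [3, 2, 5, 7]
    out = expand_dims_for_broadcast(low, high)
    assert out.size() == (3, 2, 1, 1) # ready to broadcast against `high`
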
88 | 
89 |     Args:
90 |         examples (list[EditExample])
91 |         batch_size (int)
92 |         size (Callable[[EditExample], int])
93 | 
94 |     Returns:
95 |         list[list[EditExample]]
96 |     """
97 |     assert batch_size >= 1
98 |     sorted_examples = sorted(examples, key=size)
99 |     batches = list(chunks(sorted_examples, batch_size))
100 |     random.shuffle(batches)  # in-place
101 | 
102 |     # report savings
103 |     suboptimal_batches = list(chunks(examples, batch_size))
104 | 
105 |     total_cost = lambda batches: batch_size * sum(max(size(b) for b in batch) for batch in batches)
106 |     naive_cost = total_cost(suboptimal_batches)
107 |     improved_cost = total_cost(batches)
108 |     optimal_cost = sum(size(ex) for ex in examples)
109 | 
110 |     print 'Optimized batches: reduced cost from {naive} (naive) to {improved} ({reduction:.1f}% reduction).\n' \
111 |           'Optimal (batch_size=1) would be {optimal}.'.format(naive=naive_cost, improved=improved_cost,
112 |                                                               reduction=100.0 * (naive_cost - improved_cost) / naive_cost,
113 |                                                               optimal=optimal_cost)
114 | 
115 |     return batches
116 | 
117 | 
118 | def print_module_parameters(m, depth=0):
119 |     """Print out all parameters of a module, recursing into its children."""
120 |     tabs = '\t' * depth
121 |     for p_name, p in m._parameters.items():
122 |         print tabs + p_name
123 |     for c_name, c in m.named_children():
124 |         print tabs + c_name
125 |         print_module_parameters(c, depth + 1)
126 | 
127 | 
128 | _GPUS_EXIST = True  # True by default
129 | 
130 | def try_gpu(x):
131 |     """Try to put a Variable/Tensor/Module on GPU."""
132 |     global _GPUS_EXIST
133 | 
134 |     if _GPUS_EXIST:
135 |         try:
136 |             return x.cuda()
137 |         except (AssertionError, RuntimeError):
138 |             # actually, GPUs don't exist
139 |             print 'No GPUs detected. Sticking with CPUs.'
140 |             _GPUS_EXIST = False
141 |             return x
142 |     else:
143 |         return x
144 | 
145 | 
146 | def GPUVariable(data):
147 |     return try_gpu(Variable(data, requires_grad=False))
148 | 
149 | 
150 | class RandomState(object):
151 |     def __init__(self):
152 |         """Take a snapshot of random number generator state at this point in time.
153 | 
154 |         Only covers random, numpy.random and torch (CPU).
155 |         """
156 |         self.py = random.getstate()
157 |         self.np = np.random.get_state()
158 |         self.torch = torch.get_rng_state()
159 | 
160 |     def set_global(self):
161 |         """Set all global random number generators to this state."""
162 |         random.setstate(self.py)
163 |         np.random.set_state(self.np)
164 |         torch.set_rng_state(self.torch)
165 | 
166 | 
167 | @contextmanager
168 | def random_state(state):
169 |     """Execute code inside this with-block by starting with the specified random state.
170 | 
171 |     Does not affect the state of random number generators outside this block.
172 |     Not thread-safe.
173 | 
174 |     Args:
175 |         state (RandomState)
176 |     """
177 |     old_state = RandomState()
178 |     state.set_global()
179 |     yield
180 |     old_state.set_global()
181 | 
182 | 
183 | @contextmanager
184 | def random_seed(seed):
185 |     """Execute code inside this with-block using the specified random seed.
186 | 
187 |     Sets the seed for random, numpy.random and torch (CPU).
188 | 
189 |     WARNING: torch GPU seeds are NOT set!
190 | 
191 |     Does not affect the state of random number generators outside this block.
192 |     Not thread-safe.
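
    Illustrative usage (not from the original docstring):

        with random_seed(0):
            x = np.random.rand()   # deterministic given the seed
        # outside the block, the previous RNG state has been restored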
193 | 194 | Args: 195 | seed (int) 196 | """ 197 | state = RandomState() 198 | random.seed(seed) # alter state 199 | np.random.seed(seed) 200 | torch.manual_seed(seed) 201 | yield 202 | state.set_global() 203 | 204 | 205 | class NamedTupleLike(object): 206 | __slots__ = [] 207 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/profile_imports.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2006, 2008, 2009, 2010 by Canonical Ltd 2 | # Written by John Arbash Meinel 3 | # 4 | # This program is free software; you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation; either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program; if not, write to the Free Software 16 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | 18 | """A custom importer and regex compiler which logs time spent.""" 19 | 20 | import sys 21 | import time 22 | 23 | 24 | import re 25 | 26 | 27 | _parent_stack = [] 28 | _total_stack = {} 29 | _info = {} 30 | _cur_id = 0 31 | _timer = time.time 32 | if sys.platform == 'win32': 33 | _timer = time.clock 34 | 35 | 36 | def stack_add(name, frame_name, frame_lineno, scope_name=None): 37 | """Start a new record on the stack""" 38 | global _cur_id 39 | _cur_id += 1 40 | this_stack = (_cur_id, name) 41 | 42 | if _parent_stack: 43 | _total_stack[_parent_stack[-1]].append(this_stack) 44 | _total_stack[this_stack] = [] 45 | _parent_stack.append(this_stack) 46 | _info[this_stack] = [len(_parent_stack)-1, frame_name, frame_lineno, scope_name] 47 | 48 | return this_stack 49 | 50 | 51 | def stack_finish(this, cost): 52 | """Finish a given entry, and record its cost in time""" 53 | global _parent_stack 54 | 55 | assert _parent_stack[-1] == this, \ 56 | 'import stack does not end with this %s: %s' % (this, _parent_stack) 57 | _parent_stack.pop() 58 | _info[this].append(cost) 59 | 60 | 61 | def log_stack_info(out_file, sorted=True, hide_fast=True): 62 | # Find all of the roots with import = 0 63 | out_file.write('%5s %5s %-40s @ %s:%s\n' 64 | % ('cum', 'inline', 'name', 'file', 'line')) 65 | todo = [(value[-1], key) for key,value in _info.iteritems() if value[0] == 0] 66 | 67 | if sorted: 68 | todo.sort() 69 | 70 | while todo: 71 | cum_time, cur = todo.pop() 72 | children = _total_stack[cur] 73 | 74 | c_times = [] 75 | 76 | info = _info[cur] 77 | if hide_fast and info[-1] < 0.0001: 78 | continue 79 | 80 | # Compute the module time by removing the children times 81 | mod_time = info[-1] 82 | for child in children: 83 | c_info = _info[child] 84 | mod_time -= c_info[-1] 85 | c_times.append((c_info[-1], child)) 86 | 87 | # indent, cum_time, mod_time, name, 88 | # scope_name, frame_name, frame_lineno 89 | out_file.write('%5.1f %5.1f %-40s @ %s:%d\n' 90 | % (info[-1]*1000., mod_time*1000., 91 | ('+'*info[0] + cur[1]), 92 | info[1], info[2])) 93 | 94 | if sorted: 95 | c_times.sort() 96 | else: 97 | c_times.reverse() 98 | todo.extend(c_times) 99 | 100 | 101 | _real_import = __import__ 
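

# Illustrative usage sketch (not part of the original module). install() swaps
# timed_import in for the builtin __import__ (and timed_compile in for
# re._compile), so that subsequent imports are timed and recorded:
#
#   import sys
#   import profile_imports
#   profile_imports.install()
#   import json                          # this import is now timed
#   profile_imports.uninstall()
#   profile_imports.log_stack_info(sys.stderr)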
102 | 103 | def timed_import(name, globals=None, locals=None, fromlist=None, level=None): 104 | """Wrap around standard importer to log import time""" 105 | # normally there are 4, but if this is called as __import__ eg by 106 | # /usr/lib/python2.6/email/__init__.py then there may be only one 107 | # parameter 108 | # level is only passed by python2.6 109 | 110 | if globals is None: 111 | # can't determine the scope name afaics; we could peek up the stack to 112 | # see where this is being called from, but it should be a rare case. 113 | scope_name = None 114 | else: 115 | scope_name = globals.get('__name__', None) 116 | if scope_name is None: 117 | scope_name = globals.get('__file__', None) 118 | if scope_name is None: 119 | scope_name = globals.keys() 120 | else: 121 | # Trim out paths before bzrlib 122 | loc = scope_name.find('bzrlib') 123 | if loc != -1: 124 | scope_name = scope_name[loc:] 125 | # For stdlib, trim out early paths 126 | loc = scope_name.find('python2.4') 127 | if loc != -1: 128 | scope_name = scope_name[loc:] 129 | 130 | # Figure out the frame that is doing the importing 131 | frame = sys._getframe(1) 132 | frame_name = frame.f_globals.get('__name__', '') 133 | extra = '' 134 | if frame_name.endswith('demandload'): 135 | # If this was demandloaded, we have 3 frames to ignore 136 | extra = '(demandload) ' 137 | frame = sys._getframe(4) 138 | frame_name = frame.f_globals.get('__name__', '') 139 | elif frame_name.endswith('lazy_import'): 140 | # If this was lazily imported, we have 3 frames to ignore 141 | extra = '[l] ' 142 | frame = sys._getframe(4) 143 | frame_name = frame.f_globals.get('__name__', '') 144 | if fromlist: 145 | extra += ' [%s]' % (', '.join(map(str, fromlist)),) 146 | frame_lineno = frame.f_lineno 147 | 148 | this = stack_add(extra + name, frame_name, frame_lineno, scope_name) 149 | 150 | tstart = _timer() 151 | try: 152 | # Do the import 153 | mod = _real_import(name, globals, locals, fromlist) 154 | finally: 155 | tload = _timer()-tstart 156 | stack_finish(this, tload) 157 | 158 | return mod 159 | 160 | 161 | _real_compile = re._compile 162 | 163 | 164 | def timed_compile(*args, **kwargs): 165 | """Log how long it takes to compile a regex""" 166 | 167 | # And who is requesting this? 
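    # (Illustrative note, not in the original: at this depth, frame 0 is
    # timed_compile itself and frame 1 is re.compile, so _getframe(2) lands on
    # the code that actually requested the regex.)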
168 | frame = sys._getframe(2) 169 | frame_name = frame.f_globals.get('__name__', '') 170 | 171 | extra = '' 172 | if frame_name.endswith('lazy_regex'): 173 | # If this was lazily compiled, we have 3 more frames to ignore 174 | extra = '[l] ' 175 | frame = sys._getframe(5) 176 | frame_name = frame.f_globals.get('__name__', '') 177 | frame_lineno = frame.f_lineno 178 | this = stack_add(extra+repr(args[0]), frame_name, frame_lineno) 179 | 180 | tstart = _timer() 181 | try: 182 | # Measure the compile time 183 | comp = _real_compile(*args, **kwargs) 184 | finally: 185 | tcompile = _timer() - tstart 186 | stack_finish(this, tcompile) 187 | 188 | return comp 189 | 190 | 191 | def install(): 192 | """Install the hooks for measuring import and regex compile time.""" 193 | __builtins__['__import__'] = timed_import 194 | re._compile = timed_compile 195 | 196 | 197 | def uninstall(): 198 | """Remove the import and regex compile timing hooks.""" 199 | __builtins__['__import__'] = _real_import 200 | re._compile = _real_compile 201 | 202 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from os.path import join 3 | 4 | import pytest 5 | from gtd.text import PhraseMatcher 6 | from gtd.utils import FileMemoized, SimpleExecutor, as_batches, Failure, NestedDict, EqualityMixinSlots, \ 7 | memoize_with_key_fxn, DictMemoized, ranks, truncated, ClassCounter 8 | 9 | 10 | def test_as_batches(): 11 | items = [0, 1, 2, 3, 4, 5, 6] 12 | assert list(as_batches(items, 2)) == [[0, 1], [2, 3], [4, 5], [6]] 13 | 14 | 15 | def test_file_memoized_represent_args(tmpdir): 16 | path = str(tmpdir.join('fxn')) 17 | 18 | fm = FileMemoized(None, path, None, None) 19 | key = fm._cache_key(['a', 'b'], {'c': 2, 'd': 'e'}) 20 | assert key == join(path, 'a_b_c=2_d=e.txt') 21 | key = fm._cache_key([], {'c': 2, 'd': 'e'}) 22 | assert key == join(path, 'c=2_d=e.txt') 23 | key = fm._cache_key([], dict()) 24 | assert key == join(path, 'NO_KEY.txt') 25 | 26 | 27 | def test_ranks(): 28 | scores = [10, -1, 0.3, 24, 11] 29 | assert ranks(scores, ascending=True) == [3, 1, 2, 5, 4] 30 | assert ranks(scores, ascending=False) == [3, 5, 4, 1, 2] 31 | 32 | 33 | class TestUtils(TestCase): 34 | 35 | def test_phrase_matcher(self): 36 | phrases = [[1, 2, 3], [1, ], [2, ], [2, 4]] 37 | not_phrases = [[1, 2], [4, ]] 38 | 39 | pm = PhraseMatcher(phrases) 40 | 41 | for phrase in phrases: 42 | self.assertTrue(pm.has_phrase(phrase)) 43 | 44 | for phrase in not_phrases: 45 | self.assertFalse(pm.has_phrase(phrase)) 46 | 47 | tokens = [1, 2, 1, 2, 3, 2, 3, 2, 4] 48 | 49 | matches = pm.match(tokens) 50 | 51 | correct = [((1,), 0, 1), 52 | ((2,), 1, 2), 53 | ((1,), 2, 3), 54 | ((2,), 3, 4), 55 | ((1, 2, 3), 2, 5), 56 | ((2,), 5, 6), 57 | ((2,), 7, 8), 58 | ((2, 4), 7, 9)] 59 | 60 | self.assertEqual(matches, correct) 61 | 62 | 63 | class TestSimpleExecutor(object): 64 | 65 | def test_context_manager(self): 66 | fxn = lambda x: 2 * x 67 | with SimpleExecutor(fxn, max_workers=2) as ex: 68 | for i, x in enumerate(range(10)): 69 | ex.submit(i, x) 70 | results = {k: v for k, v in ex.results()} 71 | 72 | correct = {k: 2 * k for k in range(10)} 73 | assert results == correct 74 | 75 | 76 | class TestFailure(object): 77 | def test_eq(self): 78 | f0 = Failure() 79 | f1 = Failure() 80 | f2 = Failure(uid=1) 81 | f3 = Failure(uid=1, message='different message') 82 | assert f0 != f1 # 
different id 83 | assert f1 != f2 # different id 84 | assert f2 == f3 # same id 85 | 86 | 87 | class TestNestedDict(object): 88 | @pytest.fixture 89 | def normal_dict(self): 90 | return { 91 | 'a': 1, 92 | 'b': { 93 | 'c': 2, 94 | 'd': 3, 95 | }, 96 | } 97 | 98 | @pytest.fixture 99 | def nested_dict(self, normal_dict): 100 | return NestedDict(normal_dict) 101 | 102 | def test_as_dict(self, nested_dict, normal_dict): 103 | assert nested_dict.as_dict() == normal_dict 104 | 105 | def test_iter(self, nested_dict): 106 | assert set(nested_dict) == {'a', 'b'} 107 | 108 | def test_len(self, nested_dict): 109 | assert len(nested_dict) == 3 110 | 111 | def test_nested(self): 112 | d = NestedDict() 113 | d.set_nested(('a', 'b', 'c'), 1) 114 | d.set_nested(('a', 'd'), 2) 115 | 116 | assert d.as_dict() == { 117 | 'a': { 118 | 'b': { 119 | 'c': 1 120 | }, 121 | 'd': 2, 122 | } 123 | } 124 | assert d.get_nested(('a', 'd')) == 2 125 | 126 | with pytest.raises(KeyError): 127 | d.get_nested(('a', 'd', 'e')) 128 | 129 | def test_leaves(self, nested_dict): 130 | assert set(nested_dict.leaves()) == {1, 2, 3} 131 | 132 | 133 | class DummySlotsObject(EqualityMixinSlots): 134 | __slots__ = ['a', 'b', 'c'] 135 | 136 | def __init__(self, a, b, c=None): 137 | self.a = a 138 | self.b = b 139 | 140 | if c: 141 | self.c = c 142 | 143 | 144 | class TestEqualityMixinSlot(object): 145 | def test_equality(self): 146 | d1 = DummySlotsObject(5, 10) 147 | d2 = DummySlotsObject(5, 10) 148 | assert d1 == d2 149 | 150 | d3 = DummySlotsObject(5, 10, 20) 151 | d4 = DummySlotsObject(5, 11) 152 | assert d1 != d3 153 | assert d1 != d4 154 | 155 | 156 | class MemoizedClass(object): 157 | def __init__(self): 158 | self.calls = 0 159 | 160 | @memoize_with_key_fxn(lambda self, a, b: b) # key fxn only uses b 161 | def fxn_to_memoize(self, a, b): 162 | self.calls += 1 163 | return a + b 164 | 165 | 166 | class MemoizedClass2(object): 167 | def __init__(self): 168 | self.calls = 0 169 | 170 | def fxn(self, a, b): 171 | self.calls += 1 172 | return a + b 173 | 174 | fxn_memoized = DictMemoized(fxn) 175 | 176 | 177 | class TestDictMemoized(object): 178 | def test(self): 179 | mc = MemoizedClass2() 180 | result = mc.fxn_memoized('a', 'b') 181 | assert result == 'ab' 182 | assert mc.calls == 1 183 | 184 | result2 = mc.fxn_memoized('a', 'b') 185 | assert result2 == 'ab' 186 | assert mc.calls == 1 187 | 188 | result2 = mc.fxn_memoized('b', 'b') 189 | assert result2 == 'bb' 190 | assert mc.calls == 2 191 | 192 | 193 | class TestMemoizeWithKey(object): 194 | def test_caching(self): 195 | mc = MemoizedClass() 196 | result = mc.fxn_to_memoize('hey', 'there') 197 | assert mc.calls == 1 198 | assert result == 'heythere' 199 | 200 | # returns cached result 201 | result2 = mc.fxn_to_memoize('hey', 'there') 202 | assert result2 == 'heythere' 203 | assert mc.calls == 1 204 | 205 | # computes new result 206 | result3 = mc.fxn_to_memoize('hey', 'what') 207 | assert mc.calls == 2 208 | 209 | # only caches on 2nd arg, 'there', not 'you' 210 | result4 = mc.fxn_to_memoize('you', 'there') 211 | assert result4 == 'heythere' 212 | assert mc.calls == 2 213 | 214 | 215 | def test_truncated(): 216 | items = [1, 3, 1, 2, 2, 4] 217 | assert list(truncated(items, 3)) == [1, 3, 1] 218 | assert list(truncated(items, 0)) == [] 219 | 220 | 221 | class Dummy(object): 222 | def __init__(self, x): 223 | self.x = x 224 | 225 | 226 | class TestClassCounter(object): 227 | def test_count(self): 228 | counter = ClassCounter(Dummy) 229 | assert counter.count() == 0 230 | 231 | b1 = 
Dummy('1') 232 | b2 = Dummy('2') 233 | assert counter.count() == 2 234 | 235 | b3 = Dummy('1') 236 | assert counter.count() == 3 237 | 238 | b1 = Dummy('3') 239 | assert counter.count() == 3 # lost a reference to 1, due to reassignment 240 | 241 | del b3 242 | assert counter.count() == 2 # lost one due to deletion 243 | 244 | # this line is here to prevent b1 and b2 from being garbage collected 245 | x = [b1, b2] 246 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import math 4 | import numbers 5 | import os 6 | import platform 7 | import resource 8 | import sys 9 | from collections import MutableMapping 10 | from contextlib import contextmanager 11 | from os.path import join 12 | 13 | from IPython.core.display import display, HTML 14 | from pyhocon import ConfigFactory 15 | from pyhocon import ConfigMissingException 16 | from pyhocon import ConfigTree 17 | from pyhocon import HOCONConverter 18 | 19 | from gtd.utils import NestedDict, Config 20 | 21 | 22 | def in_ipython(): 23 | try: 24 | __IPYTHON__ 25 | return True 26 | except NameError: 27 | return False 28 | 29 | 30 | def print_with_fonts(tokens, sizes, colors, background=None): 31 | 32 | def style(text, size=12, color='black'): 33 | return u'{}'.format(size, color, text) 34 | 35 | styled = [style(token, size, color) for token, size, color in zip(tokens, sizes, colors)] 36 | text = u' '.join(styled) 37 | 38 | if background: 39 | text = u'{}'.format(background, text) 40 | 41 | display(HTML(text)) 42 | 43 | 44 | def gb_used(): 45 | used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 46 | if platform.system() != 'Darwin': 47 | # on Linux, used is in terms of kilobytes 48 | power = 2 49 | else: 50 | # on Mac, used is in terms of bytes 51 | power = 3 52 | return float(used) / math.pow(1024, power) 53 | 54 | 55 | class Metadata(MutableMapping): 56 | """A wrapper around ConfigTree. 57 | 58 | Supports a name_scope contextmanager. 
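
    Illustrative usage (not from the original docstring):

        m = Metadata()
        with m.name_scope('optim'):
            m['lr'] = 0.001       # stored under the key 'optim.lr'
        lr = m['optim.lr']        # -> 0.001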
59 | """ 60 | def __init__(self, config_tree=None): 61 | if config_tree is None: 62 | config_tree = ConfigTree() 63 | 64 | self._config_tree = config_tree 65 | self._namestack = [] 66 | 67 | @contextmanager 68 | def name_scope(self, name): 69 | self._namestack.append(name) 70 | yield 71 | self._namestack.pop() 72 | 73 | def _full_key(self, key): 74 | return '.'.join(self._namestack + [key]) 75 | 76 | def __getitem__(self, key): 77 | try: 78 | val = self._config_tree.get(self._full_key(key)) 79 | except ConfigMissingException: 80 | raise KeyError(key) 81 | 82 | if isinstance(val, ConfigTree): 83 | return Metadata(val) 84 | return val 85 | 86 | def __setitem__(self, key, value): 87 | """Put a value (key is a dot-separated name).""" 88 | self._config_tree.put(self._full_key(key), value) 89 | 90 | def __delitem__(self, key): 91 | raise NotImplementedError() 92 | 93 | def __iter__(self): 94 | return iter(self._config_tree) 95 | 96 | def __len__(self): 97 | return len(self._config_tree) 98 | 99 | def __repr__(self): 100 | return self.to_str() 101 | 102 | def to_str(self, fmt='hocon'): 103 | return HOCONConverter.convert(self._config_tree, fmt) 104 | 105 | def to_file(self, path, fmt='hocon'): 106 | with open(path, 'w') as f: 107 | f.write(self.to_str(fmt)) 108 | 109 | @classmethod 110 | def from_file(cls, path, fmt='hocon'): 111 | if fmt == 'hocon': 112 | config_tree = ConfigFactory.parse_file(path) 113 | elif fmt == 'json': 114 | with open(path, 'r') as f: 115 | d = json.load(f) 116 | config_tree = ConfigFactory.from_dict(d) 117 | else: 118 | raise ValueError('Invalid format: {}'.format(fmt)) 119 | 120 | return cls(config_tree) 121 | 122 | 123 | class SyncedMetadata(Metadata): 124 | """A Metadata object which writes to file after every change.""" 125 | def __init__(self, path, fmt='hocon'): 126 | if os.path.exists(path): 127 | m = Metadata.from_file(path, fmt) 128 | else: 129 | m = Metadata() 130 | 131 | super(SyncedMetadata, self).__init__(m._config_tree) 132 | self._path = path 133 | self._fmt = fmt 134 | 135 | def __setitem__(self, key, value): 136 | super(SyncedMetadata, self).__setitem__(key, value) 137 | self.to_file(self._path, fmt=self._fmt) 138 | 139 | 140 | def print_list(l): 141 | for item in l: 142 | print item 143 | 144 | 145 | def print_no_newline(s): 146 | sys.stdout.write(s) 147 | sys.stdout.flush() 148 | 149 | 150 | def set_log_level(level): 151 | """Set the log-level of the root logger of the logging module. 
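
    For example, set_log_level('WARN') (equivalently, set_log_level(30)) silences
    INFO and DEBUG messages from every logger that defers to the root logger.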
152 | 153 | Args: 154 | level: can be an integer such as 30 (logging.WARN), or a string such as 'WARN' 155 | """ 156 | if isinstance(level, str): 157 | level = logging._levelNames[level] 158 | 159 | logger = logging.getLogger() # gets root logger 160 | logger.setLevel(level) 161 | 162 | 163 | def jupyter_no_margins(): 164 | """Cause Jupyter notebook to take up 100% of window width.""" 165 | display(HTML("")) 166 | 167 | 168 | class TraceSession(object): 169 | def __init__(self, tracer): 170 | self.tracer = tracer 171 | self._values = {} 172 | 173 | @property 174 | def values(self): 175 | return self._values 176 | 177 | def save(self, save_path): 178 | with open(save_path, 'w') as f: 179 | json.dump(self.values, f, indent=4, sort_keys=True) 180 | 181 | def __enter__(self): 182 | if self.tracer._current_session: 183 | raise RuntimeError('Already in the middle of a TraceSession') 184 | 185 | # register as the current session 186 | self.tracer._current_session = self 187 | return self 188 | 189 | def __exit__(self, exc_type, exc_val, exc_tb): 190 | # un-register 191 | self.tracer._current_session = None 192 | 193 | 194 | class Tracer(object): 195 | """Log values computed during program execution. 196 | 197 | Values are logged to the currently active TraceSession object. 198 | """ 199 | def __init__(self): 200 | self._current_session = None 201 | 202 | def session(self): 203 | return TraceSession(self) 204 | 205 | def log(self, logging_callback): 206 | """If we are in a TraceSession, execute the logging_callback. 207 | 208 | The logging_callback should take a `values` dict as its only argument, and modify `values` in some way. 209 | 210 | Args: 211 | logging_callback (Callable[dict]): a function which takes a `values` dict as its only argument. 212 | """ 213 | if self._current_session is None: 214 | return 215 | logging_callback(self._current_session.values) 216 | 217 | def log_put(self, name, value): 218 | """Log a value. 219 | 220 | Args: 221 | name (str): name of the variable 222 | value (object) 223 | """ 224 | def callback(values): 225 | if name in values: 226 | raise RuntimeError('{} already logged'.format(name)) 227 | values[name] = value 228 | 229 | return self.log(callback) 230 | 231 | def log_append(self, name, value): 232 | """Append a value. 
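
    Unlike log_put, repeated calls with the same name accumulate into a list:
    log_append('loss', 0.5) followed by log_append('loss', 0.4) leaves
    values['loss'] == [0.5, 0.4].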
233 | 234 | Args: 235 | name (str): name of the variable 236 | value (object): value to append 237 | """ 238 | def callback(values): 239 | if name not in values: 240 | values[name] = [] 241 | values[name].append(value) 242 | 243 | return self.log(callback) 244 | 245 | 246 | def indent(s, spaces=4): 247 | whitespace = u' ' * spaces 248 | return u'\n'.join(whitespace + line for line in s.split(u'\n')) 249 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/codalab.py: -------------------------------------------------------------------------------- 1 | """Tools for working with CodaLab.""" 2 | import cPickle as pickle 3 | import json 4 | import os 5 | import platform 6 | import shutil 7 | import sys 8 | import tempfile 9 | from contextlib import contextmanager 10 | 11 | import matplotlib.image as mpimg 12 | from gtd.io import shell 13 | 14 | __author__ = 'kelvinguu' 15 | 16 | 17 | # need to be specified by user 18 | worksheet = None 19 | site = None 20 | 21 | 22 | def get_uuids(): 23 | """List all bundle UUIDs in the worksheet.""" 24 | result = shell('cl ls -w {} -u'.format(worksheet)) 25 | uuids = result.split('\n') 26 | uuids = uuids[1:-1] # trim non uuids 27 | return uuids 28 | 29 | 30 | @contextmanager 31 | def open_file(uuid, path): 32 | """Get the raw file content within a particular bundle at a particular path. 33 | 34 | Path have no leading slash. 35 | """ 36 | # create temporary file just so we can get an unused file path 37 | f = tempfile.NamedTemporaryFile() 38 | f.close() # close and delete right away 39 | fname = f.name 40 | 41 | # download file to temporary path 42 | cmd ='cl down -o {} -w {} {}/{}'.format(fname, worksheet, uuid, path) 43 | try: 44 | shell(cmd) 45 | except RuntimeError: 46 | try: 47 | os.remove(fname) # if file exists, remove it 48 | except OSError: 49 | pass 50 | raise IOError('Failed to open file {}/{}'.format(uuid, path)) 51 | 52 | f = open(fname) 53 | yield f 54 | f.close() 55 | os.remove(fname) # delete temp file 56 | 57 | 58 | class Bundle(object): 59 | def __init__(self, uuid): 60 | self.uuid = uuid 61 | 62 | def __getattr__(self, item): 63 | """ 64 | Load attributes: history, meta on demand 65 | """ 66 | if item == 'history': 67 | try: 68 | with open_file(self.uuid, 'history.cpkl') as f: 69 | value = pickle.load(f) 70 | except IOError: 71 | value = {} 72 | 73 | elif item == 'meta': 74 | try: 75 | with open_file(self.uuid, 'meta.json') as f: 76 | value = json.load(f) 77 | except IOError: 78 | value = {} 79 | 80 | # load codalab info 81 | fields = ('uuid', 'name', 'bundle_type', 'state', 'time', 'remote') 82 | cmd = 'cl info -w {} -f {} {}'.format(worksheet, ','.join(fields), self.uuid) 83 | result = shell(cmd) 84 | info = dict(zip(fields, result.split())) 85 | value.update(info) 86 | 87 | elif item in ('stderr', 'stdout'): 88 | with open_file(self.uuid, item) as f: 89 | value = f.read() 90 | 91 | else: 92 | raise AttributeError(item) 93 | 94 | self.__setattr__(item, value) 95 | return value 96 | 97 | def __repr__(self): 98 | return self.uuid 99 | 100 | def load_img(self, img_path): 101 | """ 102 | Return an image object that can be immediately plotted with matplotlib 103 | """ 104 | with open_file(self.uuid, img_path) as f: 105 | return mpimg.imread(f) 106 | 107 | 108 | def download_logs(bundle, log_dir): 109 | if bundle.meta['bundle_type'] != 'run' or bundle.meta['state'] == 'queued': 110 | print 'Skipped {}\n'.format(bundle.uuid) 111 | return 112 | 113 | if isinstance(bundle, str): 114 | bundle = 
Bundle(bundle) 115 | 116 | uuid = bundle.uuid 117 | name = bundle.meta['name'] 118 | log_path = os.path.join(log_dir, '{}_{}'.format(name, uuid)) 119 | 120 | cmd ='cl down -o {} -w {} {}/logs'.format(log_path, worksheet, uuid) 121 | 122 | print uuid 123 | try: 124 | shell(cmd, verbose=True) 125 | except RuntimeError: 126 | print 'Failed to download', bundle.uuid 127 | print 128 | 129 | 130 | def report(render, uuids=None, reverse=True, limit=None): 131 | if uuids is None: 132 | uuids = get_uuids() 133 | 134 | if reverse: 135 | uuids = uuids[::-1] 136 | 137 | if limit is not None: 138 | uuids = uuids[:limit] 139 | 140 | for uuid in uuids: 141 | bundle = Bundle(uuid) 142 | try: 143 | render(bundle) 144 | except Exception: 145 | print 'Failed to render', bundle.uuid 146 | 147 | 148 | def monitor_jobs(logdir, uuids=None, reverse=True, limit=None): 149 | if os.path.exists(logdir): 150 | delete = raw_input('Overwrite existing logdir? ({})'.format(logdir)) 151 | if delete == 'y': 152 | shutil.rmtree(logdir) 153 | os.makedirs(logdir) 154 | else: 155 | os.makedirs(logdir) 156 | print 'Using logdir:', logdir 157 | 158 | report(lambda bd: download_logs(bd, logdir), uuids, reverse, limit) 159 | 160 | 161 | def tensorboard(logdir): 162 | print 'Run this in bash:' 163 | shell('tensorboard --logdir={}'.format(logdir), verbose=True, debug=True) 164 | print '\nGo to TensorBoard: http://localhost:6006/' 165 | 166 | 167 | def add_to_sys_path(path): 168 | """Add a path to the system PATH.""" 169 | sys.path.insert(0, path) 170 | 171 | 172 | def configure_matplotlib(): 173 | """Set Matplotlib backend to 'Agg', which is necessary on CodaLab docker image.""" 174 | import warnings 175 | import matplotlib 176 | with warnings.catch_warnings(): 177 | warnings.simplefilter('ignore') 178 | matplotlib.use('Agg') # needed when running from server 179 | 180 | 181 | def in_codalab(): 182 | """Check if we are running inside CodaLab Docker container or not.""" 183 | # TODO: below is a total hack. If the OS is not a Mac, we assume we're on CodaLab. 184 | return platform.system() != 'Darwin' 185 | 186 | 187 | def upload(full_path, bundle_name=None, excludes='*.ipynb .git .ipynb_checkpoints .ignore'): 188 | """ 189 | Upload a file or directory to the codalab worksheet 190 | Args: 191 | full_path: Path + filename of file to upload 192 | bundle_name: Name to upload file/directory as. I 193 | """ 194 | directory, filename = os.path.split(full_path) 195 | if bundle_name is None: 196 | bundle_name = filename 197 | shell('cl up -n {} -w {} {} -x {}'.format(bundle_name, worksheet, full_path, excludes), verbose=True) 198 | 199 | 200 | def launch_job(job_name, cmd, 201 | dependencies=tuple(), 202 | queue='john', image='kelvinguu/gtd:1.0', 203 | memory=None, cpus='5', 204 | network=False, 205 | debug=False, tail=False): 206 | """Launch a job on CodaLab (optionally upload code that the job depends on). 207 | 208 | Args: 209 | job_name: name of the job 210 | cmd: command to execute 211 | dependencies: list of other bundles that we depend on 212 | debug: if True, prints SSH commands, but does not execute them 213 | tail: show the streaming output returned by CodaLab once it launches the job 214 | """ 215 | print 'Remember to set up SSH tunnel and LOG IN through the command line before calling this.' 
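    # (Illustrative note, not in the original: the command assembled below has
    # the shape  cl run -v -n <job_name> -w <worksheet> ... <dep>:<dep> '<cmd>'
    # and is executed via gtd.io.shell.)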
216 | options = '-v -n {} -w {} --request-queue {} --request-docker-image {} --request-cpus {}'.format( 217 | job_name, worksheet, queue, image, cpus) 218 | 219 | if memory: 220 | options += ' --request-memory {}'.format(memory) 221 | if network: 222 | options += ' --request-network' 223 | 224 | dep_str = ' '.join(['{0}:{0}'.format(dep) for dep in dependencies]) 225 | full_cmd = "cl run {} {} '{}'".format(options, dep_str, cmd) 226 | if tail: 227 | full_cmd += ' -t' 228 | shell(full_cmd, verbose=True, debug=debug) 229 | 230 | 231 | if in_codalab(): 232 | configure_matplotlib() 233 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/torch/source_encoder.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod, abstractproperty 2 | from collections import namedtuple 3 | from itertools import izip 4 | 5 | import torch 6 | from gtd.ml.torch.recurrent import tile_state, gated_update 7 | from torch.nn import Module 8 | from torch.nn import Parameter 9 | 10 | from gtd.ml.torch.seq_batch import SequenceBatchElement 11 | 12 | 13 | class SourceEncoder(Module): 14 | __metaclass__ = ABCMeta 15 | 16 | @abstractproperty 17 | def hidden_dim(self): 18 | raise NotImplementedError 19 | 20 | @abstractmethod 21 | def forward(self, input_embeds_list): 22 | """Embed a source sequence. 23 | 24 | Args: 25 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 26 | 27 | Returns: 28 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 29 | """ 30 | raise NotImplementedError 31 | 32 | 33 | class SimpleSourceEncoder(SourceEncoder): 34 | def __init__(self, rnn_cell): 35 | """ 36 | 37 | Args: 38 | rnn_cell (DecoderCell) 39 | """ 40 | super(SimpleSourceEncoder, self).__init__() 41 | self.rnn_cell = rnn_cell 42 | hidden_dim = self.rnn_cell.hidden_size 43 | self.h0 = Parameter(torch.zeros(hidden_dim)) 44 | self.c0 = Parameter(torch.zeros(hidden_dim)) 45 | self._hidden_dim = hidden_dim 46 | 47 | @property 48 | def hidden_dim(self): 49 | return self._hidden_dim 50 | 51 | def forward(self, input_embeds_list): 52 | """ 53 | 54 | Args: 55 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 56 | 57 | Returns: 58 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 59 | """ 60 | batch_size = input_embeds_list[0].values.size()[0] 61 | 62 | h = tile_state(self.h0, batch_size) # (batch_size, hidden_dim) 63 | c = tile_state(self.c0, batch_size) # (batch_size, hidden_dim) 64 | 65 | hidden_states_list = [] 66 | 67 | for t, x in enumerate(input_embeds_list): 68 | # x.values has shape (batch_size, input_dim) 69 | # x.mask has shape (batch_size, 1) 70 | h_new, c_new = self.rnn_cell(x.values, (h, c)) 71 | h = gated_update(h, h_new, x.mask) 72 | c = gated_update(c, c_new, x.mask) 73 | hidden_states_list.append(SequenceBatchElement(h, x.mask)) 74 | 75 | return hidden_states_list 76 | 77 | 78 | class BidirectionalSourceEncoder(SourceEncoder): 79 | def __init__(self, input_dim, hidden_dim, rnn_cell_factory): 80 | super(BidirectionalSourceEncoder, self).__init__() 81 | 82 | if hidden_dim % 2 != 0: 83 | raise ValueError('hidden_dim must be even for BidirectionalSourceEncoder.') 84 | self._hidden_dim = hidden_dim 85 | 86 | build_encoder = lambda: SimpleSourceEncoder(rnn_cell_factory(input_dim, hidden_dim / 2)) 87 | 
self.forward_encoder = build_encoder() 88 | self.backward_encoder = build_encoder() 89 | 90 | @property 91 | def hidden_dim(self): 92 | return self._hidden_dim 93 | 94 | def forward(self, input_embeds_list): 95 | """Compute bidirectional RNN embeddings. 96 | 97 | Args: 98 | input_embeds_list (list[SequenceBatchElement]) 99 | 100 | Returns: 101 | forward_states (list[SequenceBatchElement]): ordered left to right 102 | backward_states (list[SequenceBatchElement]): ordered left to right 103 | """ 104 | reverse = lambda seq: list(reversed(seq)) 105 | forward_states = self.forward_encoder(input_embeds_list) 106 | backward_states = reverse(self.backward_encoder(reverse(input_embeds_list))) 107 | return BidirectionalEncoderOutput(forward_states, backward_states) 108 | 109 | 110 | class BidirectionalEncoderOutput(namedtuple('BidirectionalEncoderOutput', ['forward_states', 'backward_states'])): 111 | """ 112 | Attributes: 113 | forward_states (list[SequenceBatchElement]): ordered left to right 114 | backward_states (list[SequenceBatchElement]): ordered left to right 115 | """ 116 | @property 117 | def combined_states(self): 118 | """Concatenates forward and backward hidden states: [forward; backward]. 119 | 120 | Returns: 121 | combined_states (list[SequenceBatchElement]): ordered left to right 122 | """ 123 | combined_states = [SequenceBatchElement(torch.cat([f.values, b.values], 1), f.mask) 124 | for f, b in izip(self.forward_states, self.backward_states)] 125 | return combined_states 126 | 127 | @property 128 | def final_states(self): 129 | """Return the final forward and backward states. 130 | 131 | Returns: 132 | forward_state (Variable): right-most forward state, of shape (batch_size, hidden_dim) 133 | backward_state (Variable): left-most backward state, of shape (batch_size, hidden_dim) 134 | """ 135 | return self.forward_states[-1].values, self.backward_states[0].values 136 | 137 | 138 | # TODO(kelvin): test this 139 | class MultiLayerSourceEncoder(SourceEncoder): 140 | def __init__(self, input_dim, hidden_dim, num_layers, rnn_cell_factory): 141 | """ 142 | 143 | Args: 144 | input_dim (int) 145 | hidden_dim (int) 146 | num_layers (int) 147 | rnn_cell_factory (Callable[[int, int], RNNCell): takes input_dim and output_dim as arguments. 
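
        Illustrative example (not from the original docstring; hidden_dim must be
        even, since each layer is a BidirectionalSourceEncoder):

            from torch.nn import LSTMCell
            encoder = MultiLayerSourceEncoder(input_dim=100, hidden_dim=200,
                                              num_layers=2, rnn_cell_factory=LSTMCell)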
148 | """ 149 | super(MultiLayerSourceEncoder, self).__init__() 150 | self.layers = [] 151 | for layer in range(num_layers): 152 | in_dim = input_dim if layer == 0 else hidden_dim 153 | out_dim = hidden_dim 154 | encoder = BidirectionalSourceEncoder(in_dim, out_dim, rnn_cell_factory) 155 | self.add_module('encoder_layer_{}'.format(layer), encoder) 156 | self.layers.append(encoder) 157 | 158 | @property 159 | def hidden_dim(self): 160 | return self.layers[-1].hidden_dim 161 | 162 | def forward(self, input_embeds_list): 163 | """ 164 | 165 | Args: 166 | input_embeds_list (list[SequenceBatchElement]): where each element is of shape (batch_size, input_dim) 167 | 168 | Returns: 169 | hidden_states_list (list[SequenceBatchElement]) where each element is (batch_size, hidden_dim) 170 | """ 171 | for i, layer in enumerate(self.layers): 172 | if i == 0: 173 | prev_hidden_states = input_embeds_list 174 | else: 175 | prev_hidden_states = [SequenceBatchElement(torch.cat([f.values, b.values], 1), f.mask) 176 | for f, b in izip(forward_states, backward_states)] 177 | 178 | new_forward_states, new_backward_states = layer(prev_hidden_states) 179 | 180 | if i == 0: 181 | # no skip connections here, because dimensions don't match 182 | forward_states, backward_states = new_forward_states, new_backward_states 183 | else: 184 | # add residuals to previous hidden states 185 | add_residuals = lambda a_list, b_list: [SequenceBatchElement(a.values + b.values, a.mask) 186 | for a, b in izip(a_list, b_list)] 187 | 188 | forward_states = add_residuals(forward_states, new_forward_states) 189 | backward_states = add_residuals(backward_states, new_backward_states) 190 | 191 | return BidirectionalEncoderOutput(forward_states, backward_states) -------------------------------------------------------------------------------- /textmorph/edit_model/edit_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import Module, Linear, Parameter, Hardtanh 3 | from gtd.ml.torch.utils import GPUVariable 4 | from gtd.ml.torch.seq_batch import SequenceBatch 5 | import numpy as np 6 | 7 | class EditEncoder(Module): 8 | """ 9 | EditEncoder maps insert / delete embeddings into a single edit vector of dimensionality edit_dim 10 | """ 11 | def __init__(self, word_dim, edit_dim, kappa_init, norm_eps, norm_max): 12 | super(EditEncoder, self).__init__() 13 | self.linear = Linear(edit_dim, edit_dim) 14 | self.linear_prenoise = Linear(word_dim, edit_dim/2, bias=False) 15 | self.noise_scaler = kappa_init 16 | self.norm_eps = norm_eps 17 | self.norm_max = norm_max 18 | self.normclip = Hardtanh(0, self.norm_max - norm_eps) 19 | 20 | def forward(self, insert_embeds, insert_embeds_exact, delete_embeds, delete_embeds_exact, draw_samples = False, draw_p = False): 21 | """Create agenda vector. 22 | 23 | Args: 24 | insert_embeds (SequenceBatch): of shape (batch_size, max_edits, word_dim) 25 | insert_embeds_exact (SequenceBatch): of shape (batch_size, max_edits, word_dim) 26 | delete_embeds (SequenceBatch): of shape (batch_size, max_edits, word_dim) 27 | delete_embeds_exact (SequenceBatch): of shape (batch_size, max_edits, word_dim) 28 | draw_samples (bool) : flag for whether to add noise for variational approx. disable at test time. 
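            draw_p (bool): (not documented in the original) when True together
                with draw_samples, the computed edit vector is discarded and
                replaced by a random draw from the noise prior (see draw_p_noise);
                otherwise vMF noise is applied around the computed vector.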
29 | 
30 |         Returns:
31 |             edit_embed (Variable): of shape (batch_size, edit_vec_dim)
32 |         """
33 |         insert_embed = SequenceBatch.reduce_sum(insert_embeds)  # (batch_size, word_dim)
34 |         insert_embed += SequenceBatch.reduce_sum(insert_embeds_exact)  # (batch_size, word_dim)
35 |         delete_embed = SequenceBatch.reduce_sum(delete_embeds)  # (batch_size, word_dim)
36 |         delete_embed += SequenceBatch.reduce_sum(delete_embeds_exact)  # (batch_size, word_dim)
37 |         insert_set = self.linear_prenoise(insert_embed)
38 |         delete_set = self.linear_prenoise(delete_embed)
39 |         combined_map = torch.cat([insert_set, delete_set], 1)
40 |         if draw_samples:
41 |             if draw_p:
42 |                 batch_size, edit_dim = combined_map.size()
43 |                 combined_map = self.draw_p_noise(batch_size, edit_dim)
44 |             else:
45 |                 combined_map = self.sample_vMF(combined_map, self.noise_scaler)
46 |         edit_embed = combined_map
47 |         return edit_embed
48 | 
49 |     def seq_batch_noise(self, seq_batch, draw_noise):
50 |         """
51 |         Returns a noisy version of seq_batch, in which every vector is noisy and unit norm.
52 |         :param seq_batch (SequenceBatch): a sequence batch of elements
53 |         :return: noisy version of seq_batch
54 |         """
55 |         values = seq_batch.values
56 |         mask = seq_batch.mask
57 |         batch_size, max_edits, w_embed_size = values.size()
58 |         new_values = GPUVariable(torch.from_numpy(np.zeros((batch_size, max_edits, w_embed_size), dtype=np.float32)))
59 |         phint = self.sample_vMF(values[:, 0, :], self.noise_scaler)
60 |         prand = self.draw_p_noise(batch_size, w_embed_size)
61 |         m_expand = mask.expand(batch_size, w_embed_size)
62 |         new_values[:, 0, :] = phint * m_expand + prand * (1 - m_expand)
63 |         return SequenceBatch(values=new_values * draw_noise, mask=mask)
64 | 
65 |     def draw_p_noise(self, batch_size, edit_dim):
66 |         rand_draw = GPUVariable(torch.randn(batch_size, edit_dim))
67 |         rand_draw = rand_draw / torch.norm(rand_draw, p=2, dim=1).expand(batch_size, edit_dim)
68 |         rand_norms = (torch.rand(batch_size, 1) * self.norm_max).expand(batch_size, edit_dim)
69 |         return rand_draw * GPUVariable(rand_norms)
70 | 
71 | 
72 |     def add_norm_noise(self, munorm, eps):
73 |         """
74 |         KL loss is - log(maxvalue/eps)
75 |         cut at maxvalue-eps, and add [0, eps] noise.
76 |         """
77 |         trand = torch.rand(1).expand(munorm.size()) * eps
78 |         return (self.normclip(munorm) + GPUVariable(trand))
79 | 
80 |     def sample_vMF(self, mu, kappa):
81 |         """vMF sampler in pytorch.
82 | 
83 |         http://stats.stackexchange.com/questions/156729/sampling-from-von-mises-fisher-distribution-in-python
84 | 
85 |         Args:
86 |             mu (Tensor): of shape (batch_size, 2*word_dim)
87 |             kappa (Float): controls dispersion. kappa of zero is no dispersion.
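
        Returns:
            sampled_vec (Tensor): of shape (batch_size, 2*word_dim); each row is
                one vMF sample centered on the corresponding (norm-clipped,
                norm-noised) row of mu.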
88 | """ 89 | batch_size, id_dim = mu.size() 90 | result_list = [] 91 | for i in range(batch_size): 92 | munorm = mu[i].norm().expand(id_dim) 93 | munoise = self.add_norm_noise(munorm, self.norm_eps) 94 | if float(mu[i].norm().data.cpu().numpy()) > 1e-10: 95 | # sample offset from center (on sphere) with spread kappa 96 | w = self._sample_weight(kappa, id_dim) 97 | wtorch = GPUVariable(w*torch.ones(id_dim)) 98 | 99 | # sample a point v on the unit sphere that's orthogonal to mu 100 | v = self._sample_orthonormal_to(mu[i]/munorm, id_dim) 101 | 102 | # compute new point 103 | scale_factr = torch.sqrt(GPUVariable(torch.ones(id_dim)) - torch.pow(wtorch,2)) 104 | orth_term = v * scale_factr 105 | muscale = mu[i] * wtorch / munorm 106 | sampled_vec = (orth_term + muscale)*munoise 107 | else: 108 | rand_draw = GPUVariable(torch.randn(id_dim)) 109 | rand_draw = rand_draw / torch.norm(rand_draw, p=2).expand(id_dim) 110 | rand_norms = (torch.rand(1) * self.norm_eps).expand(id_dim) 111 | sampled_vec = rand_draw*GPUVariable(rand_norms)#mu[i] 112 | result_list.append(sampled_vec) 113 | 114 | return torch.stack(result_list,0) 115 | 116 | def _sample_weight(self, kappa, dim): 117 | """Rejection sampling scheme for sampling distance from center on 118 | surface of the sphere. 119 | """ 120 | dim = dim - 1 # since S^{n-1} 121 | b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) # b= 1/(sqrt(4.* kdiv**2 + 1) + 2 * kdiv) 122 | x = (1. - b) / (1. + b) 123 | c = kappa * x + dim * np.log(1 - x ** 2) # dim * (kdiv *x + np.log(1-x**2)) 124 | 125 | while True: 126 | z = np.random.beta(dim / 2., dim / 2.) #concentrates towards 0.5 as d-> inf 127 | w = (1. - (1. + b) * z) / (1. - (1. - b) * z) 128 | u = np.random.uniform(low=0, high=1) 129 | if kappa * w + dim * np.log(1. - x * w) - c >= np.log(u): #thresh is dim *(kdiv * (w-x) + log(1-x*w) -log(1-x**2)) 130 | return w 131 | 132 | def _sample_orthonormal_to(self, mu, dim): 133 | """Sample point on sphere orthogonal to mu. 134 | """ 135 | v = GPUVariable(torch.randn(dim)) 136 | rescale_value = mu.dot(v) / mu.norm() 137 | proj_mu_v = mu * rescale_value.expand(dim) 138 | ortho = v - proj_mu_v 139 | ortho_norm = torch.norm(ortho) 140 | return ortho / ortho_norm.expand_as(ortho) 141 | 142 | def test_sample_weight(kappa, dim): 143 | """Rejection sampling scheme for sampling distance from center on 144 | surface of the sphere. 145 | """ 146 | dim = dim - 1 # since S^{n-1} 147 | b = dim / (np.sqrt(4. * kappa ** 2 + dim ** 2) + 2 * kappa) 148 | x = (1. - b) / (1. + b) 149 | c = kappa * x + dim * np.log(1 - x ** 2) 150 | 151 | while True: 152 | z = np.random.beta(dim / 2., dim / 2.) 153 | w = (1. - (1. + b) * z) / (1. - (1. - b) * z) 154 | u = np.random.uniform(low=0, high=1) 155 | if kappa * w + dim * np.log(1. 
- x * w) - c >= np.log(u): 156 | return w 157 | 158 | def get_ev(kappa,dim,nsamp): 159 | samp_in = np.array([test_sample_weight(kappa,dim) for i in xrange(nsamp)]) 160 | return np.mean(samp_in), np.std(samp_in), np.percentile(samp_in, np.arange(0,100,10)) 161 | 162 | def get_mode(kappa,dim): 163 | return np.sqrt(4*(kappa**2.0)+dim**2.0+6*dim+9)/(2*kappa) - (dim+3.0)/(2*kappa) -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/vocab.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from abc import ABCMeta, abstractmethod 3 | from collections import Mapping 4 | 5 | import numpy as np 6 | 7 | from gtd.chrono import verboserate 8 | from gtd.io import num_lines 9 | from gtd.utils import EqualityMixin, random_seed 10 | 11 | 12 | class Vocab(object): 13 | __metaclass__ = ABCMeta 14 | 15 | @abstractmethod 16 | def word2index(self, w): 17 | pass 18 | 19 | @abstractmethod 20 | def index2word(self, i): 21 | pass 22 | 23 | 24 | class SimpleVocab(Vocab, EqualityMixin): 25 | """A simple vocabulary object.""" 26 | 27 | def __init__(self, tokens): 28 | """Create a vocab. 29 | 30 | Args: 31 | tokens (list[unicode]): a unique list of unicode tokens 32 | 33 | If t = tokens[i], this vocab will map token t to the integer i. 34 | """ 35 | if not isinstance(tokens, list): 36 | raise ValueError('tokens must be a list') 37 | 38 | # build mapping 39 | word2index = {} 40 | for i, tok in enumerate(tokens): 41 | word2index[tok] = i 42 | 43 | if len(tokens) != len(word2index): 44 | raise ValueError('tokens must be unique') 45 | 46 | self._index2word = list(tokens) # make a copy 47 | self._word2index = word2index 48 | 49 | @property 50 | def tokens(self): 51 | """Return the full list of tokens sorted by their index.""" 52 | return self._index2word 53 | 54 | def __iter__(self): 55 | """Iterate through the full list of tokens.""" 56 | return iter(self._index2word) 57 | 58 | def __len__(self): 59 | """Total number of tokens indexed.""" 60 | return len(self._index2word) 61 | 62 | def __contains__(self, w): 63 | """Check if a token has been indexed by this vocab.""" 64 | return w in self._word2index 65 | 66 | def word2index(self, w): 67 | return self._word2index[w] 68 | 69 | def index2word(self, i): 70 | return self._index2word[i] 71 | 72 | def words2indices(self, words): 73 | return map(self.word2index, words) 74 | 75 | def indices2words(self, indices): 76 | return [self.index2word(i) for i in indices] 77 | 78 | def save(self, path): 79 | """Save SimpleVocab to file path. 80 | 81 | Args: 82 | path (str) 83 | """ 84 | with open(path, 'w') as f: 85 | for word in self._index2word: 86 | f.write(word) 87 | f.write('\n') 88 | 89 | @classmethod 90 | def load(cls, path): 91 | """Load SimpleVocab from file path. 92 | 93 | Args: 94 | path (str) 95 | 96 | Returns: 97 | SimpleVocab 98 | """ 99 | strip_newline = lambda s: s[:-1] 100 | with open(path, 'r') as f: 101 | tokens = [strip_newline(line) for line in f] 102 | return cls(tokens) 103 | 104 | 105 | class WordVocab(SimpleVocab): 106 | """WordVocab. 107 | 108 | IMPORTANT NOTE: WordVocab is blind to casing! All words are converted to lower-case. 109 | 110 | A WordVocab is required to have the following special tokens: UNK, START, STOP. 
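
    Illustrative example (not from the original docstring):

        vocab = WordVocab([WordVocab.UNK, WordVocab.START, WordVocab.STOP, u'the', u'cat'])
        vocab.word2index(u'The')   # 3  (case-insensitive lookup)
        vocab.word2index(u'dog')   # 0  (unknown words map to UNK)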
111 |     """
112 |     UNK = u'<unk>'
113 |     START = u'<start>'
114 |     STOP = u'<stop>'
115 |     SPECIAL_TOKENS = (UNK, START, STOP)
116 | 
117 |     def __init__(self, tokens):
118 |         super(WordVocab, self).__init__([t.lower() for t in tokens])
119 | 
120 |         # make sure all special tokens are present
121 |         for special in self.SPECIAL_TOKENS:
122 |             if special not in self:
123 |                 raise ValueError('All special tokens must be present in tokens. Missing {}'.format(special))
124 | 
125 |     def word2index(self, w):
126 |         """Map a word to an integer.
127 | 
128 |         Automatically lower-cases the word before mapping it.
129 | 
130 |         If the word is not known to the vocab, return the index for UNK.
131 |         """
132 |         sup = super(WordVocab, self)
133 |         try:
134 |             return sup.word2index(w.lower())
135 |         except KeyError:
136 |             return sup.word2index(self.UNK)
137 | 
138 | 
139 | class SimpleEmbeddings(Mapping):
140 |     def __init__(self, array, vocab):
141 |         """Create embeddings object.
142 | 
143 |         Args:
144 |             array (np.array): has shape (vocab_size, embed_dim)
145 |             vocab (SimpleVocab): a Vocab object
146 |         """
147 |         assert len(array.shape) == 2
148 |         assert array.shape[0] == len(vocab)  # entries line up
149 | 
150 |         self.array = array
151 |         self.vocab = vocab
152 | 
153 |     def __contains__(self, w):
154 |         return w in self.vocab
155 | 
156 |     def __getitem__(self, w):
157 |         idx = self.vocab.word2index(w)
158 |         return np.copy(self.array[idx])
159 | 
160 |     def __iter__(self):
161 |         return iter(self.vocab)
162 | 
163 |     def __len__(self):
164 |         return len(self.vocab)
165 | 
166 |     @property
167 |     def embed_dim(self):
168 |         return self.array.shape[1]
169 | 
170 |     @classmethod
171 |     def from_file(cls, file_path, embed_dim, vocab_size=None):
172 |         """Load word embeddings.
173 | 
174 |         Args:
175 |             file_path (str)
176 |             embed_dim (int): expected embed_dim
177 |             vocab_size (int): max # of words in the vocab. If not specified, uses all available vectors in file.
178 |         """
179 |         if vocab_size is None:
180 |             vocab_size = num_lines(file_path)
181 | 
182 |         words = []
183 |         embeds = []
184 |         with codecs.open(file_path, 'r', encoding='utf-8') as f:
185 |             lines = verboserate(f, desc='Loading embeddings from {}'.format(file_path), total=vocab_size)
186 |             for i, line in enumerate(lines):
187 |                 if i == vocab_size: break
188 |                 tokens = line.split()
189 |                 word, embed = tokens[0], np.array([float(tok) for tok in tokens[1:]], dtype=np.float32)
190 |                 if len(embed) != embed_dim:
191 |                     raise ValueError('expected {} dims, got {} dims'.format(embed_dim, len(embed)))
192 |                 words.append(word)
193 |                 embeds.append(embed)
194 | 
195 |         vocab = SimpleVocab(words)
196 |         embed_matrix = np.stack(embeds)
197 |         embed_matrix = embed_matrix.astype(np.float32)
198 |         assert embed_matrix.shape == (vocab_size, embed_dim)
199 |         return cls(embed_matrix, vocab)
200 | 
201 |     def to_file(self, file_path):
202 |         array = self.array
203 |         with codecs.open(file_path, 'w', encoding='utf-8') as f:
204 |             for i, word in enumerate(self.vocab):
205 |                 vec_str = u' '.join(str(x) for x in array[i])
206 |                 f.write(u'{} {}'.format(word, vec_str))
207 |                 f.write('\n')
208 | 
209 |     def with_special_tokens(self, random_seed=0):
210 |         """Return a new SimpleEmbeddings object with special tokens inserted at the front of the vocab.
211 | 
212 |         In the new vocab, special tokens will occupy indices 0, 1, ..., len(special_tokens) - 1.
213 |         The special tokens will have randomly generated embeddings.
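        The random embeddings are drawn (via emulate_distribution below) to match
        the mean and standard deviation of the existing embedding matrix.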
214 | 215 | Args: 216 | random_seed (int) 217 | 218 | Returns: 219 | SimpleEmbeddings 220 | """ 221 | special_tokens = list(WordVocab.SPECIAL_TOKENS) 222 | _, embed_dim = self.array.shape 223 | special_tokens_array_shape = (len(special_tokens), embed_dim) 224 | special_tokens_array = emulate_distribution(special_tokens_array_shape, self.array, seed=random_seed) 225 | special_tokens_array = special_tokens_array.astype(np.float32) 226 | 227 | new_array = np.concatenate((special_tokens_array, self.array), axis=0) 228 | new_vocab = WordVocab(special_tokens + self.vocab.tokens) 229 | 230 | return SimpleEmbeddings(new_array, new_vocab) 231 | 232 | 233 | def emulate_distribution(shape, target_samples, seed=None): 234 | m = np.mean(target_samples) 235 | s = np.std(target_samples) 236 | 237 | with random_seed(seed): 238 | samples = np.random.normal(m, s, size=shape) 239 | 240 | return samples 241 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/graph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter, deque 2 | import numpy as np 3 | import random 4 | 5 | from gtd import utils 6 | 7 | 8 | # defines whether an edge is inverted or not 9 | inverted = lambda r: r[:2] == '**' 10 | invert = lambda r: r[2:] if inverted(r) else '**' + r 11 | 12 | 13 | class Graph(object): 14 | def __init__(self, triples): 15 | self.triples = triples 16 | neighbors = defaultdict(lambda: defaultdict(set)) 17 | relation_args = defaultdict(lambda: defaultdict(set)) 18 | 19 | for s, r, t in triples: 20 | relation_args[r]['s'].add(s) 21 | relation_args[r]['t'].add(t) 22 | neighbors[s][r].add(t) 23 | neighbors[t][invert(r)].add(s) 24 | 25 | def freeze(d): 26 | frozen = {} 27 | for key, subdict in d.iteritems(): 28 | frozen[key] = {} 29 | for subkey, set_val in subdict.iteritems(): 30 | frozen[key][subkey] = tuple(set_val) 31 | return frozen 32 | 33 | # WARNING: both neighbors and relation_args must not have default initialization. 34 | # Default init is dangerous, because we sometimes perform uniform sampling over 35 | # all keys in the dictionary. This distribution will get altered if a user asks about 36 | # entities or relations that weren't present. 37 | 38 | # self.neighbors[start][relation] = (end1, end2, ...) 39 | # self.relation_args[relation][position] = (ent1, ent2, ...) 
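        # (Illustrative, not in the original comments: for triples = [('a', 'r', 'b')],
        #  neighbors['a'] == {'r': ('b',)} and neighbors['b'] == {'**r': ('a',)},
        #  since invert('r') == '**r'.)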
40 | # position is either 's' (domain) or 't' (range) 41 | self.neighbors = freeze(neighbors) 42 | self.relation_args = freeze(relation_args) 43 | self.random_entities = [] 44 | 45 | # cpp_graph = graph_traversal.Graph() 46 | # for s, r, t in triples: 47 | # cpp_graph.add_edge(s, r, t) 48 | # cpp_graph.add_edge(t, invert(r), s) 49 | # self.cpp_graph = cpp_graph 50 | cpp_graph = None 51 | 52 | def shortest_path(self, source, target): 53 | # use breadth-first search 54 | 55 | queue = deque() 56 | explored = {} # stores backpointers 57 | 58 | def enqueue(node, backpointer): 59 | queue.appendleft(node) 60 | explored[node] = backpointer 61 | 62 | def path(node): 63 | current = node 64 | path = [current] 65 | while True: 66 | backpointer = explored[current] 67 | if backpointer: 68 | rel, current = backpointer 69 | path.extend((rel, current)) 70 | else: 71 | break # we've hit the source 72 | return path[::-1] # reverse 73 | 74 | enqueue(source, None) 75 | 76 | while len(queue) != 0: 77 | current = queue.pop() 78 | for rel, nbrs in self.neighbors[current].iteritems(): 79 | for nbr in nbrs: 80 | if nbr not in explored: 81 | enqueue(nbr, (rel, current)) 82 | if nbr == target: 83 | return path(nbr) 84 | 85 | 86 | def random_walk_probs(self, start, path): 87 | return self.cpp_graph.exact_random_walk_probs(start, list(path)) 88 | 89 | def walk_all(self, start, path, positive_branch_factor=float('inf')): 90 | if positive_branch_factor == 0: 91 | return set() 92 | 93 | approx = positive_branch_factor != float('inf') 94 | 95 | if approx: 96 | return set(self.cpp_graph.approx_path_traversal(start, list(path), positive_branch_factor)) 97 | else: 98 | return set(self.cpp_graph.path_traversal(start, list(path))) 99 | 100 | def is_trivial_query(self, start, path): 101 | return self.cpp_graph.is_trivial_query(start, list(path)) 102 | 103 | def type_matching_entities(self, path, position): 104 | if position == 's': 105 | r = path[0] 106 | elif position == 't': 107 | r = path[-1] 108 | else: 109 | raise ValueError(position) 110 | 111 | try: 112 | if not inverted(r): 113 | return self.relation_args[r][position] 114 | else: 115 | inv_pos = 's' if position == 't' else 't' 116 | return self.relation_args[invert(r)][inv_pos] 117 | except KeyError: 118 | # nothing type-matches 119 | return tuple() 120 | 121 | # TODO: test this 122 | def random_walk(self, start, length, no_return=False): 123 | """ 124 | If no_return, the random walk never revisits the same node. Can sometimes return None, None. 125 | """ 126 | max_attempts = 1000 127 | for i in range(max_attempts): 128 | 129 | sampled_path = [] 130 | visited = set() 131 | current = start 132 | for k in range(length): 133 | visited.add(current) 134 | 135 | r = random.choice(self.neighbors[current].keys()) 136 | sampled_path.append(r) 137 | 138 | candidates = self.neighbors[current][r] 139 | 140 | if no_return: 141 | current = utils.sample_excluding(candidates, visited) 142 | else: 143 | current = random.choice(candidates) 144 | 145 | # no viable next step 146 | if current is None: 147 | break 148 | 149 | # failed to find a viable walk. Try again. 150 | if current is None: 151 | continue 152 | 153 | return tuple(sampled_path), current 154 | 155 | return None, None 156 | 157 | def random_walk_constrained(self, start, path): 158 | """ 159 | Warning! Can sometimes return None. 
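        Follows the relations in `path` from `start`, choosing uniformly at random
        among matching neighbors at each step; returns the entity reached, or None
        if `start` is unknown or some relation cannot be followed.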
160 | """ 161 | 162 | # if start node isn't present we can't take this walk 163 | if start not in self.neighbors: 164 | return None 165 | 166 | current = start 167 | for r in path: 168 | rels = self.neighbors[current] 169 | if r not in rels: 170 | # no viable next steps 171 | return None 172 | current = random.choice(rels[r]) 173 | return current 174 | 175 | def random_entity(self): 176 | if len(self.random_entities) == 0: 177 | self.random_entities = list(np.random.choice(self.neighbors.keys(), size=20000, replace=True)) 178 | return self.random_entities.pop() 179 | 180 | def relation_stats(self): 181 | stats = defaultdict(dict) 182 | rel_counts = Counter(r for s, r, t in self.triples) 183 | 184 | for r, args in self.relation_args.iteritems(): 185 | out_degrees, in_degrees = [], [] 186 | for s in args['s']: 187 | out_degrees.append(len(self.neighbors[s][r])) 188 | for t in args['t']: 189 | in_degrees.append(len(self.neighbors[t][invert(r)])) 190 | 191 | domain = float(len(args['s'])) 192 | range = float(len(args['t'])) 193 | out_degree = np.mean(out_degrees) 194 | in_degree = np.mean(in_degrees) 195 | stat = {'avg_out_degree': out_degree, 196 | 'avg_in_degree': in_degree, 197 | 'min_degree': min(in_degree, out_degree), 198 | 'in/out': in_degree / out_degree, 199 | 'domain': domain, 200 | 'range': range, 201 | 'r/d': range / domain, 202 | 'total': rel_counts[r], 203 | 'log(total)': np.log(rel_counts[r]) 204 | } 205 | 206 | # include inverted relation 207 | inv_stat = {'avg_out_degree': in_degree, 208 | 'avg_in_degree': out_degree, 209 | 'min_degree': stat['min_degree'], 210 | 'in/out': out_degree / in_degree, 211 | 'domain': range, 212 | 'range': domain, 213 | 'r/d': domain / range, 214 | 'total': stat['total'], 215 | 'log(total)': stat['log(total)'] 216 | } 217 | 218 | stats[r] = stat 219 | stats[invert(r)] = inv_stat 220 | 221 | return stats -------------------------------------------------------------------------------- /textmorph/turk/turk.py: -------------------------------------------------------------------------------- 1 | import json 2 | from string import Template 3 | 4 | import boto 5 | from boto.mturk.question import Overview, QuestionContent, SelectionAnswer, Question, AnswerSpecification, QuestionForm 6 | 7 | from gtd.turk import Task, get_mturk_connection, standard_quals 8 | from gtd.utils import Config 9 | from textmorph import data 10 | 11 | 12 | """ 13 | To review completed HITs: 14 | - Go to: https://requester.mturk.com/mturk/manageHITs 15 | 16 | To do a HIT: 17 | - Go to: https://worker.mturk.com/ 18 | - Search for "percy liang" 19 | - Click "Accept & Work" 20 | - For some reason, I had trouble viewing these HITs on Google Chrome (invalid URL parameter error). 21 | - On Firefox, things are fine. 
22 | """ 23 | 24 | config = Config.from_file(data.workspace.config) 25 | mtc = get_mturk_connection(config.aws_access_key_id, 26 | config.aws_secret_access_key, sandbox=False) 27 | 28 | 29 | class SimilarityTask(Task): 30 | 31 | def __init__(self, debug): 32 | # load from configuration 33 | conf = Config.from_file(data.workspace.turk.similarity.config.txt) 34 | self.title = conf.title 35 | self.description = conf.description 36 | self.keywords = conf.keywords 37 | self.price = conf.price 38 | self.duration = eval(conf.duration) 39 | self.approval_delay = eval(conf.approval_delay) 40 | 41 | # store form specification as JSON, to be built automatically on launch 42 | with open(data.workspace.turk.similarity.form.json) as form_json: 43 | self.form_json = form_json.read() 44 | 45 | price_per_hit = 0.0 if debug else self.price 46 | 47 | quals = standard_quals(debug) 48 | 49 | hit_type_ids = mtc.register_hit_type(title=self.title, description=self.description, reward=price_per_hit, 50 | duration=self.duration, 51 | keywords=self.keywords, approval_delay=self.approval_delay, qual_req=quals) 52 | hit_type_id = hit_type_ids[0].HITTypeId 53 | 54 | super(SimilarityTask, self).__init__(hit_type_id, mtc) 55 | 56 | def launch(self, data={}): 57 | qf = QuestionForm() 58 | form_json = BotoFormGenerator.inject_data(self.form_json, data) 59 | BotoFormGenerator.from_json(qf, form_json) 60 | return self.create_hit(qf) 61 | 62 | 63 | class CoherenceTask(Task): 64 | 65 | def __init__(self, debug): 66 | # load from configuration 67 | conf = Config.from_file(data.workspace.turk.coherence.config.txt) 68 | self.title = conf.title 69 | self.description = conf.description 70 | self.keywords = conf.keywords 71 | self.price = conf.price 72 | self.duration = eval(conf.duration) 73 | self.approval_delay = eval(conf.approval_delay) 74 | 75 | # store form specification as JSON, to be built automatically on launch 76 | with open(data.workspace.turk.coherence.form.json) as form_json: 77 | self.form_json = form_json.read() 78 | 79 | price_per_hit = 0.0 if debug else self.price 80 | 81 | quals = standard_quals(debug) 82 | 83 | hit_type_ids = mtc.register_hit_type(title=self.title, description=self.description, reward=price_per_hit, 84 | duration=self.duration, 85 | keywords=self.keywords, approval_delay=self.approval_delay, qual_req=quals) 86 | hit_type_id = hit_type_ids[0].HITTypeId 87 | 88 | super(CoherenceTask, self).__init__(hit_type_id, mtc) 89 | 90 | def launch(self, data={}): 91 | qf = QuestionForm() 92 | form_json = BotoFormGenerator.inject_data(self.form_json, data) 93 | BotoFormGenerator.from_json(qf, form_json) 94 | return self.create_hit(qf) 95 | 96 | 97 | class BotoFormGenerator(object): 98 | 99 | form_types = {'Overview', 'QuestionContent', 'SelectionAnswer', 'Question', 'AnswerSpecification', 'QuestionForm', 'FormattedContent'} 100 | 101 | @staticmethod 102 | def from_json(question_form, json_data): 103 | """ 104 | Construct a QuestionForm from a JSON specification 105 | """ 106 | 107 | form_data = json.loads(json_data, strict=False) 108 | 109 | # construct objects and build QuestionForm 110 | for obj_data in form_data['form']: 111 | obj = BotoFormGenerator._from_data(obj_data) 112 | question_form.append(obj) 113 | 114 | @staticmethod 115 | def _from_data(form_data): 116 | """ 117 | Generates and populates boto.mturk.question objects from a specification. 118 | """ 119 | 120 | if type(form_data) is not dict: 121 | return form_data 122 | 123 | """ 124 | Functions for creating form objects. 
125 |         args_dict is a dictionary containing a mapping from names to arguments.
126 |         Positional and keyword arguments pertaining to the particular object
127 |         are extracted from args_dict and passed appropriately to the object
128 |         constructor.
129 | 
130 |         To extend this scheme, add the class name to `form_types` and define a
131 |         matching make_{} function whose `args_` parameter lists the required
132 |         positional arguments; the new type can then be used in the JSON spec.
133 |         """
134 |         def make_args(args_dict, args_):
135 |             # positional arguments
136 |             args = [args_dict[k] for k in args_]
137 |             # keyword arguments
138 |             kwargs = {k: v for k, v in args_dict.iteritems() if k not in args_}
139 |             return args, kwargs
140 | 
141 |         def add_field(obj, field):
142 |             (fl_name, fl_value) = next(field.iteritems())
143 |             obj.append_field(fl_name, fl_value)
144 | 
145 |         def add_append(obj, append):
146 |             obj.append(append)
147 | 
148 |         def make_Overview(args_dict, args_=[]):
149 |             args, kwargs = make_args(args_dict, args_)
150 |             return boto.mturk.question.Overview(*args, **kwargs)
151 | 
152 |         def make_Question(args_dict, args_=['identifier', 'content', 'answer_spec']):
153 |             args, kwargs = make_args(args_dict, args_)
154 |             return boto.mturk.question.Question(*args, **kwargs)
155 | 
156 |         def make_QuestionContent(args_dict, args_=[]):
157 |             args, kwargs = make_args(args_dict, args_)
158 |             return boto.mturk.question.QuestionContent(*args, **kwargs)
159 | 
160 |         def make_SelectionAnswer(args_dict, args_=[]):
161 |             args, kwargs = make_args(args_dict, args_)
162 |             return boto.mturk.question.SelectionAnswer(*args, **kwargs)
163 | 
164 |         def make_AnswerSpecification(args_dict, args_=['spec']):
165 |             args, kwargs = make_args(args_dict, args_)
166 |             return boto.mturk.question.AnswerSpecification(*args, **kwargs)
167 | 
168 |         def make_FormattedContent(args_dict, args_=['content']):
169 |             args, kwargs = make_args(args_dict, args_)
170 |             return boto.mturk.question.FormattedContent(*args, **kwargs)
171 | 
172 |         k, v = next(form_data.iteritems())
173 |         if k in BotoFormGenerator.form_types:
174 |             make_fn = eval("make_{}".format(k))  # constrained: k was just validated against form_types
175 |             args = {}  # constructor arguments; values may themselves be form objects
176 |             # form objects to be appended (Field-type or otherwise)
177 |             fields = []
178 | 
179 |             for arg_k, arg_v in v.iteritems():  # iterate over arguments to the form object
180 |                 # Fields _or_ form objects to be appended (e.g.
181 |                 # FormattedContent)
182 |                 if arg_k == "fields":
183 |                     fields = arg_v
184 |                 else:  # recurse and build form object argument
185 |                     args[arg_k] = BotoFormGenerator._from_data(arg_v)
186 | 
187 |             obj = make_fn(args)
188 |             for fl in fields:
189 |                 fl_k, fl_v = next(fl.iteritems())
190 |                 if fl_k == "field":
191 |                     add_field(obj, fl_v)
192 |                 elif fl_k == "append":
193 |                     ap = BotoFormGenerator._from_data(fl_v)
194 |                     add_append(obj, ap)
195 |             return obj
196 | 
197 |         return None
198 | 
199 |     @staticmethod
200 |     def inject_data(json_data, data):
201 |         """
202 |         Insert data into the JSON format specification.
203 |         This is used to dynamically create forms with different questions using
204 |         the same specification.
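        Placeholders use string.Template syntax: e.g. a "${sentence}" placeholder
        (a made-up field name) would be replaced by data['sentence'].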
205 | """ 206 | return Template(json_data).substitute(**data) 207 | -------------------------------------------------------------------------------- /textmorph/edit_model/attention_decoder.py: -------------------------------------------------------------------------------- 1 | from itertools import izip 2 | 3 | import numpy as np 4 | import torch 5 | from torch.nn import LSTMCell, Linear, Parameter, Softmax 6 | 7 | from collections import namedtuple 8 | from gtd.ml.torch.attention import Attention, AttentionOutput, DummyAttention 9 | from gtd.ml.torch.decoder_cell import DecoderCell, DecoderCellOutput, RNNState, RNNInput 10 | from gtd.ml.torch.recurrent import gated_update, tile_state 11 | from gtd.ml.torch.utils import GPUVariable 12 | from gtd.utils import UnicodeMixin 13 | from gtd.ml.torch.decoder import RNNContextCombiner 14 | 15 | 16 | class AttentionContextCombiner(RNNContextCombiner): 17 | def __call__(self, encoder_output, x): 18 | return AttentionRNNInput(x=x, agenda=encoder_output.agenda, source_embeds=encoder_output.source_embeds, insert_embeds=encoder_output.insert_embeds, delete_embeds=encoder_output.delete_embeds) 19 | 20 | class AttentionDecoderCell(DecoderCell): 21 | def __init__(self, token_embedder, agenda_dim, decoder_dim, encoder_dim, attn_dim, no_insert_delete_attn, num_layers): 22 | super(AttentionDecoderCell, self).__init__() 23 | 24 | input_dim = token_embedder.embed_dim 25 | self.num_layers = num_layers 26 | 27 | # see definition of `x_augment` in `forward` method 28 | # we augment the input to each RNN layer with 3 attention contexts + the agenda 29 | augment_dim = encoder_dim + input_dim + input_dim + agenda_dim 30 | 31 | self.rnn_cells = [] 32 | for layer in range(num_layers): 33 | in_dim = input_dim if layer == 0 else decoder_dim # first layer takes word vectors 34 | out_dim = decoder_dim 35 | rnn_cell = LSTMCell(in_dim + augment_dim, out_dim) 36 | self.add_module('decoder_layer_{}'.format(layer), rnn_cell) 37 | self.rnn_cells.append(rnn_cell) 38 | 39 | # see definition of `z` in `forward` method 40 | # to predict words, we condition on the hidden state h + 3 attention contexts 41 | z_dim = decoder_dim + encoder_dim + 2 * input_dim 42 | if no_insert_delete_attn: 43 | z_dim = decoder_dim + encoder_dim 44 | 45 | self.vocab_projection_pos = Linear(z_dim, input_dim) # TODO(kelvin): these big params may need regularization 46 | self.vocab_projection_neg = Linear(z_dim, input_dim) 47 | self.relu = torch.nn.ReLU() 48 | 49 | self.h0 = Parameter(torch.zeros(decoder_dim)) 50 | self.c0 = Parameter(torch.zeros(decoder_dim)) 51 | self.vocab_softmax = Softmax() 52 | 53 | self.source_attention = Attention(encoder_dim, decoder_dim, attn_dim) 54 | if not no_insert_delete_attn: 55 | self.insert_attention = Attention(input_dim, decoder_dim, attn_dim) 56 | self.delete_attention = Attention(input_dim, decoder_dim, attn_dim) 57 | else: 58 | self.insert_attention = DummyAttention(input_dim, decoder_dim, attn_dim) 59 | self.delete_attention = DummyAttention(input_dim, decoder_dim, attn_dim) 60 | 61 | self.token_embedder = token_embedder 62 | self.no_insert_delete_attn = no_insert_delete_attn 63 | 64 | def initialize(self, batch_size): 65 | h = tile_state(self.h0, batch_size) 66 | c = tile_state(self.c0, batch_size) 67 | 68 | # no initial weights, context is just zero vector 69 | init_attn = lambda attention: AttentionOutput(None, GPUVariable(torch.zeros(batch_size, attention.memory_dim))) 70 | 71 | return AttentionRNNState([h] * self.num_layers, [c] * self.num_layers, 
init_attn(self.source_attention), 72 | init_attn(self.insert_attention), init_attn(self.delete_attention)) 73 | 74 | def forward(self, rnn_state, decoder_cell_input, advance): 75 | dci = decoder_cell_input 76 | mask = advance 77 | 78 | # this will be concatenated to x at every layer 79 | # we are conditioning on the attention from the previous time step and the agenda from the encoder 80 | x_augment = torch.cat([rnn_state.source_attn.context, 81 | rnn_state.insert_attn.context, 82 | rnn_state.delete_attn.context, 83 | dci.agenda], 1) 84 | 85 | hs, cs = [], [] 86 | x = dci.x # input word vector 87 | for layer in range(self.num_layers): 88 | rnn_cell = self.rnn_cells[layer] 89 | old_h, old_c = rnn_state.hs[layer], rnn_state.cs[layer] 90 | rnn_input = torch.cat([x, x_augment], 1) 91 | h, c = rnn_cell(rnn_input, (old_h, old_c)) 92 | h = gated_update(old_h, h, mask) 93 | c = gated_update(old_c, c, mask) 94 | hs.append(h) 95 | cs.append(c) 96 | 97 | if layer == 0: 98 | x = h # no skip connection on the first layer 99 | else: 100 | x = x + h 101 | 102 | # compute attention using bottom layer 103 | source_attn = self.source_attention(dci.source_embeds, hs[0]) 104 | insert_attn = self.insert_attention(dci.insert_embeds, hs[0]) 105 | delete_attn = self.delete_attention(dci.delete_embeds, hs[0]) 106 | if not self.no_insert_delete_attn: 107 | z = torch.cat([x, source_attn.context, insert_attn.context, delete_attn.context], 1) 108 | else: 109 | z = torch.cat([x, source_attn.context], 1) 110 | 111 | # has shape (batch_size, decoder_dim + encoder_dim + input_dim + input_dim) 112 | 113 | vocab_query_pos = self.vocab_projection_pos(z) 114 | vocab_query_neg = self.vocab_projection_neg(z) 115 | word_vocab = self.token_embedder.vocab 116 | word_embeds = self.token_embedder.embeds 117 | vocab_logit_pos = self.relu(torch.mm(vocab_query_pos, word_embeds.t())) # (batch_size, vocab_size) 118 | vocab_logit_neg = self.relu(torch.mm(vocab_query_neg, word_embeds.t())) # (batch_size, vocab_size) 119 | vocab_probs = self.vocab_softmax(vocab_logit_pos - vocab_logit_neg) 120 | # TODO(kelvin): prevent model from putting probability on UNK 121 | 122 | rnn_state = AttentionRNNState(hs, cs, source_attn, insert_attn, delete_attn) 123 | 124 | return DecoderCellOutput(rnn_state, vocab=word_vocab, vocab_probs=vocab_probs) 125 | 126 | def rnn_state_type(self): 127 | return AttentionRNNState 128 | 129 | def rnn_input_type(self): 130 | return AttentionRNNInput 131 | 132 | class AttentionRNNState(namedtuple('AttentionRNNState', ['hs','cs','source_attn','insert_attn','delete_attn']), RNNState): 133 | """ 134 | Attributes: 135 | hs (list[Variable]): a list of the hidden states for each layer of a multi-layer RNN. 136 | Each Variable has shape (batch_size, hidden_dim). 137 | cs (list[Variable]): a list of the cell states for each layer of a multi-layer RNN 138 | Each Variable has shape (batch_size, hidden_dim). 
139 | source_attn (AttentionOutput) 140 | insert_attn (AttentionOutput) 141 | delete_attn (AttentionOutput) 142 | """ 143 | pass 144 | 145 | class AttentionRNNInput(namedtuple('AttentionRNNInput', ['x','agenda','source_embeds','insert_embeds','delete_embeds']), RNNInput): 146 | """ 147 | Attributes: 148 | x (Variable): of shape (batch_size, word_dim), embedding of word generated at previous time step 149 | agenda (Variable): of shape (batch_size, agenda_dim) 150 | source_embeds (SequenceBatch): of shape (batch_size, source_seq_length, hidden_size) 151 | insert_embeds (SequenceBatch): of shape (batch_size, max_edits, embed_dim) 152 | delete_embeds (SequenceBatch): of shape (batch_size, max_edits, embed_dim) 153 | """ 154 | pass 155 | 156 | 157 | 158 | class AttentionTrace(UnicodeMixin): 159 | __slots__ = ['name', 'tokens', 'attention_weights'] 160 | 161 | def __init__(self, name, tokens, attention_weights): 162 | """Construct AttentionTrace. 163 | 164 | Args: 165 | name (unicode): name of attention mechanism 166 | tokens (list[unicode]) 167 | attention_weights (np.ndarray): a 1D array. May be longer than len(tokens) due to batching. 168 | """ 169 | assert len(attention_weights.shape) == 1 170 | 171 | # any attention weights exceeding length of tokens should be zero 172 | for i in range(len(tokens), len(attention_weights)): 173 | assert attention_weights[i] == 0 174 | 175 | self.name = name 176 | self.tokens = tokens 177 | self.attention_weights = attention_weights 178 | 179 | def __unicode__(self): 180 | total_mass = np.sum(self.attention_weights) 181 | s = u' '.join(u'{}[{:.2f}]'.format(t, w) for t, w in izip(self.tokens, self.attention_weights)) 182 | return u'{:10}[{:.2f}]: {}'.format(self.name, total_mass, s) 183 | -------------------------------------------------------------------------------- /third-party/gtd/gtd/ml/tf/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | 4 | import numpy as np 5 | import pytest 6 | import tensorflow as tf 7 | from numpy.testing import assert_array_equal, assert_array_almost_equal 8 | from tensorflow.python.framework.errors import InvalidArgumentError 9 | 10 | from gtd.ml.tf.utils import TensorDebugger, clean_session, expand_dims_for_broadcast, broadcast, Saver, \ 11 | guarantee_initialized_variables, gather_2d 12 | from gtd.ml.tf.tests.test_framework import clean_test_session 13 | 14 | 15 | class TestTensorDebugger(TestCase): 16 | def test_tensor_debugger_deps(self): 17 | tdb = TensorDebugger() 18 | 19 | x = tf.constant(3, name='x') 20 | z = tf.mul(x, 3, name='z') 21 | with tf.control_dependencies([x]): 22 | y = tf.constant(8, name='y') 23 | 24 | deps = tdb.dependency_graph 25 | 26 | # control dependencies depend on x's output 27 | self.assertEqual(deps['y'], {'x:0'}) 28 | 29 | # each output depends on its op 30 | self.assertEqual(deps['y:0'], {'y'}) 31 | 32 | # downstream ops depend on the output of earlier ops 33 | self.assertTrue('x:0' in deps['z']) 34 | 35 | def test_tensor_debugger_multiple(self): 36 | tdb = TensorDebugger() 37 | 38 | x = tf.constant([1, 2]) 39 | tdb.register('x', x) 40 | zs = [] 41 | for k in range(3): 42 | y = tf.constant(k) 43 | z = tf.reduce_sum(x * y) 44 | # register multiple nodes under the same name 45 | tdb.register('y', y) 46 | zs.append(z) 47 | 48 | # 0, (1 + 2), (2 + 4) 49 | final = tf.pack(zs) 50 | 51 | with tf.Session() as sess: 52 | results, bp_results = tdb.debug(sess, final, {}) 53 | 54 | def test(a, b): 55 | 
self.assertTrue(np.array_equal(a, b)) 56 | 57 | # result correctly passed back 58 | test(results, [0, 3, 6]) 59 | # values in for loop accumulated as list 60 | test(bp_results['y'], [0, 1, 2]) 61 | 62 | def test_tensor_debugger_exec_path(self): 63 | tdb = TensorDebugger() 64 | 65 | x = tf.constant(5, name='x') 66 | y = tf.placeholder(tf.int32, name='y') 67 | 68 | z = tf.mul(x, y, 'z') 69 | w = tf.constant(4, name='w') 70 | 71 | f = tf.mul(z, w, 'f') 72 | g = tf.constant(3, name='g') 73 | 74 | with tf.control_dependencies([f]): 75 | h = tf.constant(11, name='h') 76 | 77 | # don't register x 78 | tdb.register('y', y) 79 | tdb.register('z', z) 80 | tdb.register('w', w) 81 | tdb.register('f', f) 82 | tdb.register('g', g, force_run=True) 83 | tdb.register('h', h) 84 | 85 | with tf.Session() as sess: 86 | result, bp_results = tdb.debug(sess, f, {y: 2}) 87 | # result is a single value, not a list 88 | self.assertEqual(result, 40) 89 | # excludes x, because not registered. excludes h, because not on execution path. 90 | # includes g, because of force_run 91 | self.assertEqual(bp_results, {'y': 2, 'z': 10, 'w': 4, 'g': 3}) 92 | 93 | results, bp_results = tdb.debug(sess, [h, g], {y: 2}) 94 | # returns a list 95 | self.assertEqual(results, [11, 3]) 96 | # includes y, z, w and f because h depends on them through control_dependencies 97 | # includes g because of force_run 98 | self.assertEqual(bp_results, {'y': 2, 'z': 10, 'f': 40, 'w': 4, 'g': 3}) 99 | 100 | 101 | def test_expand_dims_for_broadcast(): 102 | with clean_session(): 103 | arr = tf.constant([ 104 | [ 105 | [1, 2, 3], 106 | [4, 5, 6], 107 | [4, 5, 6], 108 | ], 109 | [ 110 | [1, 2, 3], 111 | [4, 5, 6], 112 | [4, 5, 6], 113 | ], 114 | ], dtype=tf.float32) 115 | weights = tf.constant([1, 2], dtype=tf.float32) 116 | 117 | assert arr.get_shape().as_list() == [2, 3, 3] 118 | assert weights.get_shape().as_list() == [2] 119 | 120 | new_weights = expand_dims_for_broadcast(weights, arr) 121 | assert new_weights.eval().shape == (2, 1, 1) 122 | 123 | bad_weights = tf.constant([1, 2, 3], dtype=tf.float32) 124 | bad_new_weights = expand_dims_for_broadcast(bad_weights, arr) 125 | 126 | with pytest.raises(InvalidArgumentError): 127 | bad_new_weights.eval() 128 | 129 | 130 | class TestGather2D(object): 131 | @pytest.fixture 132 | def x(self): 133 | x = tf.constant([ 134 | [[1, 2], [2, 2], [3, 3]], 135 | [[4, 5], [5, 4], [6, 6]], 136 | [[7, 7], [8, 7], [9, 9]], 137 | [[0, 8], [1, 1], [2, 2]] 138 | ], dtype=tf.int32) 139 | return x 140 | 141 | @pytest.mark.usefixtures('clean_test_session') 142 | def test(self, x): 143 | i = tf.constant([[0, 2], 144 | [3, 0]], 145 | dtype=tf.int32) 146 | j = tf.constant([[1, 1], 147 | [0, 2]], 148 | dtype=tf.int32) 149 | vals = gather_2d(x, i, j) 150 | 151 | correct = np.array([ 152 | [[2, 2], [8, 7]], 153 | [[0, 8], [3, 3]], 154 | ], dtype=np.int32) 155 | 156 | assert_array_almost_equal(correct, vals.eval()) 157 | 158 | assert vals.get_shape().as_list() == [2, 2, 2] 159 | 160 | @pytest.mark.usefixtures('clean_test_session') 161 | def test_broadcast(self, x): 162 | i = tf.constant([[0, 2], 163 | [3, 0]], 164 | dtype=tf.int32) 165 | j = tf.constant([[1, 2]], dtype=tf.int32) # needs to be broadcast up 166 | vals = gather_2d(x, i, j) 167 | 168 | correct = np.array([ 169 | [[2, 2], [9, 9]], 170 | [[1, 1], [3, 3]], 171 | ], dtype=np.int32) 172 | 173 | assert_array_almost_equal(correct, vals.eval()) 174 | 175 | 176 | def test_broadcast(): 177 | with clean_session(): 178 | values = tf.constant([ 179 | [ 180 | [1, 2], 181 | [1, 2], 
182 |             ],
183 |             [
184 |                 [1, 2],
185 |                 [3, 4],
186 |             ],
187 |             [
188 |                 [5, 6],
189 |                 [7, 8],
190 |             ]
191 |         ], dtype=tf.float32)
192 | 
193 |         mask = tf.constant([
194 |             [1, 0],
195 |             [1, 1],
196 |             [0, 1],
197 |         ], dtype=tf.float32)
198 | 
199 |         correct = np.array([
200 |             [
201 |                 [1, 1],
202 |                 [0, 0],
203 |             ],
204 |             [
205 |                 [1, 1],
206 |                 [1, 1],
207 |             ],
208 |             [
209 |                 [0, 0],
210 |                 [1, 1],
211 |             ]
212 |         ], dtype=np.float32)
213 | 
214 |         assert values.get_shape().as_list() == [3, 2, 2]
215 |         assert mask.get_shape().as_list() == [3, 2]
216 | 
217 |         mask = expand_dims_for_broadcast(mask, values)
218 |         assert mask.get_shape().as_list() == [3, 2, 1]
219 | 
220 |         mask = broadcast(mask, values)
221 |         assert mask.get_shape().as_list() == [3, 2, 2]
222 | 
223 |         mask_val = mask.eval()
224 | 
225 |         assert_array_equal(mask_val, correct)
226 | 
227 | 
228 | class TestSaver(object):
229 |     @pytest.fixture
230 |     def v(self):
231 |         return tf.get_variable('v', shape=[], initializer=tf.constant_initializer(5))
232 | 
233 |     @pytest.mark.usefixtures('clean_test_session')
234 |     def test_restore(self, tmpdir, v):
235 |         save_100_path = str(tmpdir.join('weights-100'))
236 |         save_10_path = str(tmpdir.join('weights-10'))
237 | 
238 |         saver = Saver(str(tmpdir))
239 |         assign_op = tf.assign(v, 12)
240 | 
241 |         sess = tf.get_default_session()
242 |         guarantee_initialized_variables(sess)
243 | 
244 |         assert v.eval() == 5
245 |         saver.save(100)  # save as step 100
246 | 
247 |         sess.run(assign_op)
248 |         assert v.eval() == 12
249 |         saver.save(10)  # save as step 10
250 | 
251 |         saver.restore()  # restores from the largest step number by default (100)
252 |         assert v.eval() == 5  # restored
253 | 
254 |         saver.restore(10)  # force restore of step 10
255 |         assert v.eval() == 12
256 | 
257 |         saver.restore(save_100_path)  # can also restore directly from a path
258 |         assert v.eval() == 5
259 | 
260 |         # latest should be the largest step number, not necessarily last saved
261 |         assert saver.latest_checkpoint == save_100_path
262 |         assert os.path.exists(save_100_path)
263 | 
264 |         assert saver.checkpoint_paths == {
265 |             10: save_10_path,
266 |             100: save_100_path,
267 |         }
--------------------------------------------------------------------------------
/third-party/gtd/gtd/ml/torch/tests/test_seq_batch.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | import torch
4 | from gtd.ml.torch.utils import GPUVariable
5 | from gtd.ml.torch.utils import assert_tensor_equal
6 | 
7 | from gtd.ml.torch.seq_batch import SequenceBatch, SequenceBatchElement
8 | from gtd.ml.vocab import SimpleVocab
9 | 
10 | 
11 | class TestSequenceBatch(object):
12 |     @pytest.fixture
13 |     def sequences(self):
14 |         return [
15 |             ['a', 'b', 'b', 'c'],
16 |             ['c'],
17 |             [],
18 |         ]
19 | 
20 |     @pytest.fixture
21 |     def vocab(self):
22 |         return SimpleVocab(['<pad>', 'a', 'b', 'c', '<start>', '<stop>'])  # index 0 is the padding token
23 | 
24 |     def test_from_sequences(self, sequences, vocab):
25 |         seq_batch = SequenceBatch.from_sequences(sequences, vocab)
26 | 
27 |         assert_tensor_equal(seq_batch.values,
28 |                             np.array([
29 |                                 [1, 2, 2, 3],
30 |                                 [3, 0, 0, 0],
31 |                                 [0, 0, 0, 0],
32 |                             ], dtype=np.int32))
33 | 
34 |         assert_tensor_equal(seq_batch.mask,
35 |                             np.array([
36 |                                 [1, 1, 1, 1],
37 |                                 [1, 0, 0, 0],
38 |                                 [0, 0, 0, 0],
39 |                             ], dtype=np.float32))
40 | 
41 |     def test_min_seq_length(self, vocab):
42 |         seq_batch = SequenceBatch.from_sequences([[], [], []], vocab, min_seq_length=2)
43 |         assert_tensor_equal(seq_batch.values, np.zeros((3, 2)))
44 |         assert_tensor_equal(seq_batch.mask, np.zeros((3, 2)))
45 | 
46 |     def
test_mask_validation(self): 47 | mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 0], 48 | [1, 1, 0, 0], 49 | [1, 1, 1, 0]])) 50 | 51 | values = mask # just set values = mask, since it doesn't matter 52 | 53 | # should not raise any errors 54 | SequenceBatch(values, mask) 55 | 56 | non_binary_mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 0], 57 | [1, 1.2, 0, 0], 58 | [1, 1, 1, 0]])) 59 | 60 | with pytest.raises(ValueError): 61 | SequenceBatch(mask, non_binary_mask) 62 | 63 | non_left_justified_mask = GPUVariable(torch.FloatTensor([[1, 0, 0, 1], 64 | [1, 1, 0, 0], 65 | [1, 1, 1, 0]])) 66 | 67 | with pytest.raises(ValueError): 68 | SequenceBatch(mask, non_left_justified_mask) 69 | 70 | def test_split(self): 71 | input_embeds = GPUVariable(torch.LongTensor([ 72 | # batch item 1 73 | [ 74 | [1, 2], [2, 3], [5, 6] 75 | ], 76 | # batch item 2 77 | [ 78 | [4, 8], [3, 5], [0, 0] 79 | ], 80 | ])) 81 | 82 | input_mask = GPUVariable(torch.FloatTensor([ 83 | [1, 1, 1], 84 | [1, 1, 0], 85 | ])) 86 | 87 | sb = SequenceBatch(input_embeds, input_mask) 88 | 89 | elements = sb.split() 90 | input_list = [e.values for e in elements] 91 | mask_list = [e.mask for e in elements] 92 | 93 | assert len(input_list) == 3 94 | assert_tensor_equal(input_list[0], [[1, 2], [4, 8]]) 95 | assert_tensor_equal(input_list[1], [[2, 3], [3, 5]]) 96 | assert_tensor_equal(input_list[2], [[5, 6], [0, 0]]) 97 | 98 | assert len(mask_list) == 3 99 | assert_tensor_equal(mask_list[0], [[1], [1]]) 100 | assert_tensor_equal(mask_list[1], [[1], [1]]) 101 | assert_tensor_equal(mask_list[2], [[1], [0]]) 102 | 103 | def test_cat(self): 104 | x1 = SequenceBatchElement( 105 | GPUVariable(torch.FloatTensor([ 106 | [[1, 2], [3, 4]], 107 | [[8, 2], [9, 0]]])), 108 | GPUVariable(torch.FloatTensor([ 109 | [1], 110 | [1] 111 | ]))) 112 | x2 = SequenceBatchElement( 113 | GPUVariable(torch.FloatTensor([ 114 | [[-1, 20], [3, 40]], 115 | [[-8, 2], [9, 10]]])), 116 | GPUVariable(torch.FloatTensor([ 117 | [1], 118 | [0] 119 | ]))) 120 | x3 = SequenceBatchElement( 121 | GPUVariable(torch.FloatTensor([ 122 | [[-1, 20], [3, 40]], 123 | [[-8, 2], [9, 10]]])), 124 | GPUVariable(torch.FloatTensor([ 125 | [0], 126 | [0] 127 | ]))) 128 | 129 | result = SequenceBatch.cat([x1, x2, x3]) 130 | 131 | assert_tensor_equal(result.values, 132 | [ 133 | [[[1, 2], [3, 4]], [[-1, 20], [3, 40]], [[-1, 20], [3, 40]]], 134 | [[[8, 2], [9, 0]], [[-8, 2], [9, 10]], [[-8, 2], [9, 10]]], 135 | ]) 136 | 137 | assert_tensor_equal(result.mask, 138 | [ 139 | [1, 1, 0], 140 | [1, 0, 0] 141 | ]) 142 | 143 | @pytest.fixture 144 | def some_seq_batch(self): 145 | values = GPUVariable(torch.FloatTensor([ 146 | [[1, 2], [4, 5], [4, 4]], 147 | [[0, 4], [43, 5], [-1, 20]], 148 | [[-1, 20], [43, 5], [0, 0]], 149 | ])) 150 | mask = GPUVariable(torch.FloatTensor([ 151 | [1, 1, 0], 152 | [1, 0, 0], 153 | [0, 0, 0], 154 | ])) 155 | return SequenceBatch(values, mask) 156 | 157 | def test_weighted_sum(self, some_seq_batch): 158 | weights = GPUVariable(torch.FloatTensor([ 159 | [0.5, 0.3, 0], 160 | [0.8, 0.2, 0], 161 | [0, 0, 0], 162 | ])) 163 | result = SequenceBatch.weighted_sum(some_seq_batch, weights) 164 | 165 | # [1, 2] * 0.5 + [4, 5] * 0.3 = [0.5 + 1.2, 1 + 1.5] = [1.7, 2.5] 166 | # [0, 4] * 0.8 = [0, 3.2] 167 | # 0 168 | 169 | # Weights on entries where mask[i, j] = 0 get ignored, as desired. 
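        # (The third row's mask is all zeros, so its weighted sum is the zero vector.)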
170 | assert_tensor_equal(result, [ 171 | [1.7, 2.5], 172 | [0, 3.2], 173 | [0, 0], 174 | ]) 175 | 176 | def test_reduce_sum(self, some_seq_batch): 177 | result = SequenceBatch.reduce_sum(some_seq_batch) 178 | 179 | assert_tensor_equal(result, [ 180 | [5, 7], 181 | [0, 4], 182 | [0, 0], 183 | ]) 184 | 185 | def test_reduce_mean(self, some_seq_batch): 186 | result = SequenceBatch.reduce_mean(some_seq_batch, allow_empty=True) 187 | 188 | assert_tensor_equal(result, [ 189 | [2.5, 3.5], 190 | [0, 4], 191 | [0, 0] 192 | ]) 193 | 194 | with pytest.raises(ValueError): 195 | SequenceBatch.reduce_mean(some_seq_batch, allow_empty=False) 196 | 197 | def test_reduce_prod(self, some_seq_batch): 198 | result = SequenceBatch.reduce_prod(some_seq_batch) 199 | assert_tensor_equal(result, [ 200 | [4, 10], 201 | [0, 4], 202 | [1, 1] 203 | ]) 204 | 205 | def test_reduce_max(self, some_seq_batch): 206 | 207 | with pytest.raises(ValueError): 208 | # should complain about empty sequence 209 | SequenceBatch.reduce_max(some_seq_batch) 210 | 211 | values = GPUVariable(torch.FloatTensor([ 212 | [[1, 2], [4, 5], [4, 4]], # actual max is in later elements, but shd be suppressed by mask 213 | [[0, -4], [43, -5], [-1, -20]], # note that all elements in 2nd dim are negative 214 | ])) 215 | mask = GPUVariable(torch.FloatTensor([ 216 | [1, 0, 0], 217 | [1, 1, 0], 218 | ])) 219 | seq_batch = SequenceBatch(values, mask) 220 | result = SequenceBatch.reduce_max(seq_batch) 221 | 222 | assert_tensor_equal(result, [ 223 | [1, 2], 224 | [43, -4], 225 | ]) 226 | 227 | def test_embed(self): 228 | sequences = [ 229 | [], 230 | [1, 2, 3], 231 | [3, 3], 232 | [2] 233 | ] 234 | 235 | vocab = SimpleVocab([0, 1, 2, 3, 4]) 236 | indices = SequenceBatch.from_sequences(sequences, vocab) 237 | 238 | embeds = GPUVariable(torch.FloatTensor([ 239 | [0, 0], 240 | [2, 2], # 1 241 | [3, 4], # 2 242 | [-10, 1], # 3 243 | [11, -1] # 4 244 | ])) 245 | 246 | embedded = SequenceBatch.embed(indices, embeds) 247 | 248 | correct = np.array([ 249 | [[0, 0], [0, 0], [0, 0]], 250 | [[2, 2], [3, 4], [-10, 1]], 251 | [[-10, 1], [-10, 1], [0, 0]], 252 | [[3, 4], [0, 0], [0, 0]] 253 | ], dtype=np.float32) 254 | assert_tensor_equal(embedded.values, correct) --------------------------------------------------------------------------------