├── .gitignore ├── README.md ├── figures ├── char2wav.png └── tacotron-model.png ├── lib └── utils.py ├── modules ├── sampleRNN.py └── seq2seq.py ├── neural_vocoder ├── __init__.py ├── neural_vocoder.py └── recursive_dilation.py ├── scratch └── rnn_sample.py ├── text_to_features ├── __init__.py └── text_to_feature.py └── word_seq_2_seq ├── .gitignore ├── Language Pair Scraper.ipynb ├── README.md ├── TODO.md ├── Translation via Sequence to Sequence Model with Attention.ipynb ├── __init__.py ├── crawl.out ├── data.py ├── download_anki.py ├── evaluate.py ├── language.py ├── metric.py ├── model.py ├── model_eval.spec.py ├── scraper.py ├── train.py ├── training-data ├── afr-eng.zip ├── ara-eng.zip ├── aze-eng.zip ├── bel-eng.zip ├── ben-eng.zip ├── ber-eng.zip ├── bul-eng.zip ├── cat-eng.zip ├── cat-eng │ ├── _about.txt │ └── cat.txt ├── cbk-eng.zip ├── cbk-eng │ ├── _about.txt │ └── cbk.txt ├── ces-eng.zip ├── ces-eng │ ├── _about.txt │ └── ces.txt ├── cmn-eng.zip ├── cmn-eng │ └── _about.txt ├── dan-eng.zip ├── dan-eng │ ├── _about.txt │ └── dan.txt ├── deu-eng.zip ├── ell-eng.zip ├── eng-cmn.txt ├── eng-eng.zip ├── eng-fra.txt ├── est-eng.zip ├── fin-eng.zip ├── fra-eng.zip ├── heb-eng.zip ├── hin-eng.zip ├── hrv-eng.zip ├── hun-eng.zip ├── hun-eng │ ├── _about.txt │ └── hun.txt ├── ind-eng.zip ├── isl-eng.zip ├── ita-eng.zip ├── jpn-eng.zip ├── kha-eng.zip ├── khm-eng.zip ├── kor-eng.zip ├── lit-eng.zip ├── lvs-eng.zip ├── mal-eng.zip ├── mar-eng.zip ├── mkd-eng.zip ├── nds-eng.zip ├── nld-eng.zip ├── nob-eng.zip ├── pes-eng.zip ├── pol-eng.zip ├── por-eng.zip ├── ron-eng.zip ├── rus-eng.zip ├── slk-eng.zip ├── spa-eng.zip ├── srp-eng.zip ├── swe-eng.zip ├── tat-eng.zip ├── tgl-eng.zip ├── tur-eng.zip ├── ukr-eng.zip ├── urd-eng.zip ├── vie-eng.zip ├── yue-eng.zip └── yue-eng │ ├── _about.txt │ └── yue.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | .DS_Store 4 | *.pyc 5 | 6 | char2wav_original 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # char2wav_pytorch 2 | 3 | link to paper: https://openreview.net/pdf?id=B1VWyySKx 4 | 5 | ## TODOs 6 | 7 | - [ ] Implement Model 8 | - [ ] Reader 9 | - [x] Encoder 10 | - [ ] make encoder bi-directional 11 | - [x] Decoder 12 | - [ ] Add Attention to decoder 13 | - [x] SampleRNN 14 | - [x] overall architecture 15 | - [x] perforated RNN module 16 | - [ ] Unit Test 17 | 18 | ## Model Architecture 19 | 20 | ![char2wav](./figures/char2wav.png) -------------------------------------------------------------------------------- /figures/char2wav.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/figures/char2wav.png -------------------------------------------------------------------------------- /figures/tacotron-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/figures/tacotron-model.png -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from termcolor import cprint as _cprint, colored as c 3 | from pprint import pprint 4 | import 
traceback 5 | 6 | 7 | class Ledger(): 8 | def __init__(self, debug=True): 9 | self.is_debug = debug 10 | pass 11 | 12 | def p(self, *args, **kwargs): 13 | self.print(*args, **kwargs) 14 | 15 | def print(self, *args, **kwargs): 16 | """use stdout.flush to allow streaming to file when used by IPython. IPython doesn't have -u option.""" 17 | print(*args, **kwargs) 18 | sys.stdout.flush() 19 | 20 | def cp(self, *args, **kwargs): 21 | self.cprint(*args, **kwargs) 22 | 23 | def cprint(self, *args, sep=' ', color='white', **kwargs): 24 | """use stdout.flush to allow streaming to file when used by IPython. IPython doesn't have -u option.""" 25 | _cprint(sep.join([str(a) for a in args]), color, **kwargs) 26 | sys.stdout.flush() 27 | 28 | def pp(self, *args, **kwargs): 29 | self.pprint(*args, **kwargs) 30 | 31 | def pprint(self, *args, **kwargs): 32 | pprint(*args, **kwargs) 33 | sys.stdout.flush() 34 | 35 | def log(self, *args, **kwargs): 36 | """use stdout.flush to allow streaming to file when used by IPython. IPython doesn't have -u option.""" 37 | self.print(*args, **kwargs) 38 | 39 | # TODO: take a look at https://gist.github.com/FredLoney/5454553 40 | def debug(self, *args, **kwargs): 41 | # DONE: current call stack instead of last traceback instead of. 42 | if self.is_debug: 43 | stacks = traceback.extract_stack() 44 | last_caller = stacks[-2] 45 | path = last_caller.filename.split('/') 46 | self.white(path[-2], end='/') 47 | self.green(path[-1], end=' ') 48 | self.white('L', end='') 49 | self.red('{}:'.format(last_caller.lineno), end=' ') 50 | self.grey(last_caller.line) 51 | self.white('----------------------') 52 | self.print(*args, **kwargs) 53 | 54 | def refresh(self, *args, **kwargs): 55 | """allow keyword override of end='\r', so that only last print refreshes the console.""" 56 | # to prevent from creating new line 57 | # default new end to single space. 58 | if 'end' not in kwargs: 59 | kwargs['end'] = ' ' 60 | self.print('\r', *args, **kwargs) 61 | 62 | def info(self, *args, **kwargs): 63 | self.cprint(*args, color='blue', **kwargs) 64 | 65 | def error(self, *args, sep='', **kwargs): 66 | self.cprint(*args, color='red', **kwargs) 67 | 68 | def warn(self, *args, **kwargs): 69 | self.cprint(*args, color='yellow', **kwargs) 70 | 71 | def highlight(self, *args, **kwargs): 72 | self.cprint(*args, color='green', **kwargs) 73 | 74 | def green(self, *args, **kwargs): 75 | self.cprint(*args, color='green', **kwargs) 76 | 77 | def grey(self, *args, **kwargs): 78 | self.cprint(*args, color='grey', **kwargs) 79 | 80 | def red(self, *args, **kwargs): 81 | self.cprint(*args, color='red', **kwargs) 82 | 83 | def yellow(self, *args, **kwargs): 84 | self.cprint(*args, color='yellow', **kwargs) 85 | 86 | def blue(self, *args, **kwargs): 87 | self.cprint(*args, color='blue', **kwargs) 88 | 89 | def magenta(self, *args, **kwargs): 90 | self.cprint(*args, color='magenta', **kwargs) 91 | 92 | def cyan(self, *args, **kwargs): 93 | self.cprint(*args, color='cyan', **kwargs) 94 | 95 | def white(self, *args, **kwargs): 96 | self.cprint(*args, color='white', **kwargs) 97 | 98 | # def assert(self, statement, warning): 99 | # if not statement: 100 | # self.error(warning) 101 | # 102 | 103 | def raise_(self, exception, *args, **kwargs): 104 | self.error(*args, **kwargs) 105 | raise exception 106 | 107 | 108 | class Struct(): 109 | def __init__(self, **d): 110 | """Features: 111 | 0. Take in a list of keyword arguments in constructor, and assign them as attributes 112 | 1. 
Correctly handles `dir` command, so shows correct auto-completion in editors. 113 | 2. Correctly handles `vars` command, and returns a dictionary version of self. 114 | 115 | When recursive is set to False, 116 | """ 117 | # double underscore variables are mangled by python, so we use keyword argument dictionary instead. 118 | # Otherwise you will have to use __Struct_recursive = False instead. 119 | if '__recursive' in d: 120 | __recursive = d['__recursive'] 121 | del d['__recursive'] 122 | else: 123 | __recursive = True 124 | self.__is_recursive = __recursive 125 | # keep the input as a reference. Destructuring breaks this reference. 126 | self.__d = d 127 | 128 | def __dir__(self): 129 | return self.__dict__.keys() 130 | 131 | def __str__(self): 132 | return str(self.__dict__) 133 | 134 | def __getattr__(self, key): 135 | value = self.__d[key] 136 | if type(value) == type({}) and self.__is_recursive: 137 | return Struct(**value) 138 | else: 139 | return value 140 | 141 | def __getattribute__(self, key): 142 | if key == "_Struct__d" or key == "__dict__": 143 | return super().__getattribute__("__d") 144 | elif key in ["_Struct__is_recursive", "__is_recursive"]: 145 | return super().__getattribute__("__is_recursive") 146 | else: 147 | return super().__getattr__(key) 148 | 149 | def __setattr__(self, key, value): 150 | if key == "_Struct__d": 151 | super().__setattr__("__d", value) 152 | elif key == "_Struct__is_recursive": 153 | super().__setattr__("__is_recursive", value) 154 | else: 155 | self.__d[key] = value 156 | 157 | 158 | ledger = Ledger() 159 | 160 | if __name__ == "__main__": 161 | import time 162 | 163 | # print('running test as main script...') 164 | # ledger.log('blah_1', 'blah_2') 165 | # for i in range(10): 166 | # ledger.refresh('{}: hahahaha'.format(i)) 167 | # ledger.green('hahaha', end=" ") 168 | # time.sleep(0.5) 169 | 170 | # test dictionary to object 171 | test_dict = { 172 | 'a': 0, 173 | 'b': 1 174 | } 175 | 176 | test_args = Struct(**test_dict) 177 | assert test_args.a == 0 178 | assert test_args.b == 1 179 | test_args.haha = 0 180 | assert test_args.haha == 0 181 | test_args.haha = {'a': 1} 182 | assert test_args.haha != {'a': 1} 183 | assert vars(test_args.haha) == {'a': 1} 184 | assert test_args.haha.a == 1 185 | assert test_args.__dict__['haha']['a'] == 1 186 | assert vars(test_args)['haha']['a'] == 1 187 | print(test_args) 188 | 189 | test_args = Struct(__recursive=False, **test_dict) 190 | assert test_args.__is_recursive == False 191 | assert test_args.a == 0 192 | assert test_args.b == 1 193 | test_args.haha = {'a': 1} 194 | assert test_args.haha['a'] == 1 195 | assert test_args.haha == {'a': 1} 196 | 197 | ledger.green('*Struct* tests have passed.') 198 | 199 | # Some other usage patterns 200 | test_args = Struct(**test_dict, **{'ha': 'ha', 'no': 'no'}) 201 | print(test_args.ha) 202 | -------------------------------------------------------------------------------- /modules/sampleRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | 6 | # NOTE: Considering removing this class, since the only thing it does is to add up-sampling. 7 | # done: add up-sampling 8 | # todo: add multiple layers 9 | class PerforatedRNN(nn.Module): 10 | """This fun little module up-samples its output by `k` in a way similar to *perforated* up-sampling done in CNN. 11 | NOTE: 12 | - Does NOT support multiple layers. 
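- Up-sampling is *perforated*: each real GRU output is written to every k-th step of the output and the steps in between are zero-filled, so a length-T input is intended to yield a length-T*k output (see `forward` below).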
13 | - Does NOT support batch_first, always assumes seq_length to be first index. 14 | 15 | Args: 16 | input_size: The number of expected features in the input x 17 | hidden_size: The number of features in the hidden state h 18 | k: the up-sampling factor within each layer 19 | bidirectional: If True, becomes a bidirectional RNN. Default: False 20 | 21 | Inputs: input, h_0 22 | - **input** (seq_len, batch, input_size): **Same as RNN**. tensor containing the features of the input sequence. 23 | The input can also be a packed variable length sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` 24 | for details. 25 | - **h_0** (num_layers * num_directions, batch, hidden_size): **Same as RNN** tensor containing the initial 26 | hidden state for each element in the batch. 27 | 28 | Outputs: output, h_n 29 | - **output** (seq_len, batch, hidden_size * num_directions): tensor containing the output features h_t from 30 | the last layer of the RNN, for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been given as the 31 | input, the output will also be a packed sequence. 32 | - **h_n** (num_layers * num_directions, batch, hidden_size): tensor containing the hidden state for t=seq_len 33 | """ 34 | 35 | def __init__(self, input_size, output_size, k, bidirectional=False, activation=nn.ReLU): 36 | """k is the up-sampling factor.""" 37 | super(PerforatedRNN, self).__init__() 38 | self.char_set = input_size 39 | self.hidden_size = output_size 40 | self.k = k 41 | self.n_layers = 1 42 | self.bi_multiplier = 2 if bidirectional else 1 43 | 44 | self.gru = nn.GRU(input_size, output_size) # sequence-first input with input_size features, as documented above 45 | self.activation = activation() 46 | 47 | def forward(self, x, hidden): 48 | """ 49 | Perforated Up-sampling: add zeros in-between real outputs. 50 | 51 | NOTE: 52 | - sequence index goes first: (seq_len, batch, features). 53 | - no support for `batch_first` yet. 54 | """ 55 | output, hidden = self.gru(x, hidden) 56 | seq_len, batch_size, feature_n = output.size() 57 | # done: up-sampling 58 | output_perforated = Variable( 59 | torch.zeros( 60 | int(seq_len * self.k), 61 | batch_size, 62 | feature_n 63 | ) 64 | ) 65 | # NOTE: no support for sequence index advanced indexing. 66 | # output_perforated[::self.k] = output 67 | for i in range(seq_len): 68 | output_perforated[i * self.k] = output[i] 69 | return output_perforated, hidden 70 | 71 | def init_hidden(self, batch_size, rand: bool = False): 72 | """remember, hidden layer always has batch_size at index = 1, regardless of the batch_first flag.""" 73 | if rand: 74 | return Variable( 75 | torch.randn( 76 | self.bi_multiplier * self.n_layers, 77 | batch_size, 78 | self.hidden_size)) 79 | else: 80 | return Variable( 81 | torch.zeros( 82 | self.bi_multiplier * self.n_layers, 83 | batch_size, 84 | self.hidden_size)) 85 | 86 | 87 | class SampleRNN(nn.Module): 88 | """The sample RNN module, built on top of perforated RNN.
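Stacking PerforatedRNN layers multiplies the sequence length by the product of their up-sampling factors. A rough usage sketch (example sizes only; shapes follow the layer code below):

    net = SampleRNN(input_size=80, output_size=256, hidden_size=512, k=[2, 4], n_layers=2)
    hidden = net.init_hidden(batch_size=16)
    x = Variable(torch.randn(10, 16, 80))   # (seq_len, batch, input_size)
    frames, hidden = net(x, hidden)         # seq_len grows to 10 * 2 * 4 = 80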
89 | params: 90 | - input_size 91 | - output_size (one-hot) 92 | - ks: an array of per-layer up-sampling factors 93 | input: 94 | - x: features to up-sample, shape (seq_len, batch, input_size) 95 | """ 96 | 97 | def __init__(self, input_size, output_size, hidden_size, k, n_layers=1, bidirectional=False): 98 | super(SampleRNN, self).__init__() 99 | self.input_size = input_size 100 | self.output_size = output_size 101 | self.hidden_size = hidden_size 102 | self.n_layers = n_layers 103 | # NOTE: accept either a single k or a list of per-layer factors: 104 | self.ks = k if isinstance(k, list) else [k] * self.n_layers 105 | assert len(self.ks) == n_layers 106 | self.layers = nn.ModuleList() # ModuleList so the sub-layers register their parameters 107 | self.layers.append(PerforatedRNN(input_size, hidden_size if n_layers > 1 else output_size, self.ks[0])) 108 | for i in range(1, self.n_layers - 1): 109 | self.layers.append(PerforatedRNN(hidden_size, hidden_size, self.ks[i])) 110 | if self.n_layers > 1: 111 | self.layers.append(PerforatedRNN(hidden_size, output_size, self.ks[-1])) 112 | 113 | self.bidirectional = bidirectional 114 | 115 | def init_hidden(self, batch_size, *args, **kwargs): 116 | """takes in the batch_size of the input.""" 117 | return [layer.init_hidden(batch_size, *args, **kwargs) for layer in self.layers] 118 | 119 | # todo: add teacher_forcing to the wavelet output? 120 | def forward(self, input, hidden, target=None): 121 | """ 122 | hidden is a list where each item is the hidden vector for each layer. 123 | ``` 124 | hidden = [ 125 | Size(num_layers * num_directions, batch_size, hidden_size) 126 | ] 127 | ``` 128 | We get these parameters from the layer array. 129 | """ 130 | # NOTE: hidden always has second index being the batch index. 131 | 132 | x = input 133 | hidden_updated = [] 134 | for ind, (layer, h) in enumerate(zip(self.layers, hidden if isinstance(hidden, list) else [hidden])): 135 | x, _hidden = layer(x, h) 136 | hidden_updated.append(_hidden) 137 | return x, hidden_updated 138 | 139 | # todo: need to evaluate this method. 140 | def setup_training(self, learning_rate): 141 | self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 142 | self.optimizer.zero_grad() 143 | self.loss_fn = nn.CrossEntropyLoss() 144 | # hidden state is created per batch via init_hidden(batch_size) 145 | 146 | def load(self, fn): 147 | # TODO: load the input and output language as well, to maintain the char list. 148 | checkpoint = torch.load(fn) 149 | self.load_state_dict(checkpoint['state_dict']) 150 | self.input_lang.load_dict(checkpoint['input_lang']) 151 | self.output_lang.load_dict(checkpoint['output_lang']) 152 | return checkpoint 153 | 154 | def save(self, fn="seq-to-seq.cp", meta=None, **kwargs): 155 | # TODO: save input and output language as well, to maintain the char list.
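# Checkpoint layout (mirrors `load` above): any extra keyword arguments are stored as-is,
# plus 'state_dict' (module parameters), 'input_lang' / 'output_lang' (vars() of the
# language objects, so the char list survives a reload) and an optional 'meta' entry.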
156 | d = {k: kwargs[k] for k in kwargs} 157 | d["state_dict"] = self.state_dict() 158 | d["input_lang"] = vars(self.input_lang) 159 | d["output_lang"] = vars(self.output_lang) 160 | if meta is not None: 161 | d['meta'] = meta 162 | torch.save(d, fn) 163 | -------------------------------------------------------------------------------- /modules/seq2seq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | import utils 6 | 7 | # todo: change word seq2seq to char seq2seq 8 | # todo: add attention to decoder 9 | 10 | 11 | class EncoderRNN(nn.Module): 12 | """""" 13 | def __init__(self, n_chars, embedding_size, n_layers=1, bidirectional=False): 14 | super(EncoderRNN, self).__init__() 15 | self.char_set = n_chars 16 | self.hidden_size = embedding_size 17 | self.n_layers = n_layers 18 | if bidirectional: 19 | self.bi_multiplier = 2 20 | else: 21 | self.bi_multiplier = 1 22 | 23 | self.embedding = nn.Embedding(n_chars, embedding_size) 24 | self.gru = nn.GRU(embedding_size, embedding_size, batch_first=True) 25 | 26 | def forward(self, input, hidden): 27 | """batch index goes first.""" 28 | batch_size = input.size()[0] 29 | embeded = self.embedding(input).view(batch_size, -1, self.hidden_size) 30 | output, hidden = self.gru(embeded, hidden) 31 | return output, hidden 32 | 33 | def init_hidden(self, batch_size, random=False): 34 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 35 | if random: 36 | return Variable( 37 | torch.randn( 38 | self.bi_multiplier * self.n_layers, 39 | batch_size, 40 | self.hidden_size)) 41 | else: 42 | return Variable( 43 | torch.zeros( 44 | self.bi_multiplier * self.n_layers, 45 | batch_size, 46 | self.hidden_size)) 47 | 48 | 49 | class DecoderRNN(nn.Module): 50 | def __init__(self, n_chars, embedding_size, n_layers=1, bidirectional=False): 51 | """Need to impedance match input and output. Input is class_index, output is class_index, 52 | but we also need the softmax raw during training, since it contains more information.""" 53 | super(DecoderRNN, self).__init__() 54 | self.n_chars = n_chars 55 | self.hidden_size = embedding_size 56 | self.n_layers = n_layers 57 | if bidirectional: 58 | self.bi_multiplier = 2 59 | else: 60 | self.bi_multiplier = 1 61 | 62 | self.embedding = nn.Embedding(self.n_chars, embedding_size) 63 | # add dropout 64 | self.gru = nn.GRU(embedding_size, embedding_size, batch_first=True) 65 | self.output_embedding = nn.Linear(self.hidden_size, n_chars) 66 | self.output_softmax = nn.Softmax() 67 | 68 | def embed(self, input_char_indexes): 69 | """char index to embedding""" 70 | batch_size = input_char_indexes.size()[0] 71 | embeded = self.embedding(input_char_indexes).view(batch_size, -1, self.hidden_size) 72 | return embeded 73 | 74 | def forward(self, embeded, hidden): 75 | """batch index goes first. 
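The embedded input carries hidden_size features per step; the GRU output is projected through `output_embedding`, giving one score per character at every step.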
Input and output are both size <,,n_chars>""" 76 | batch_size = embeded.size()[0] 77 | output, hidden = self.gru(embeded, hidden) 78 | output_embeded = self.output_embedding(output.view(-1, self.hidden_size)).view(batch_size, -1, self.n_chars) 79 | return output_embeded, hidden 80 | 81 | def extract(self, output_embeded): 82 | """char embedding to class indexes""" 83 | b_size, seq_len, n_chars = output_embeded.size() 84 | output_softmax = self.output_softmax(output_embeded.view(-1, n_chars)) 85 | # TODO: alternatively: output_chars = output_softmax.multinomial(1).view(batch_size, -1) 86 | _, output_char_indexes = output_softmax.topk(1, dim=1) # .view(batch_size, -1) 87 | return output_char_indexes.view(b_size, seq_len) 88 | 89 | 90 | def init_hidden(self, batch_size, random=False): 91 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 92 | if random: 93 | return Variable( 94 | torch.randn( 95 | self.bi_multiplier * self.n_layers, 96 | batch_size, 97 | self.hidden_size)) 98 | else: 99 | return Variable( 100 | torch.zeros( 101 | self.bi_multiplier * self.n_layers, 102 | batch_size, 103 | self.hidden_size)) # TODO: attention 104 | 105 | 106 | import random 107 | 108 | 109 | class VanillaSequenceToSequence(nn.Module): 110 | def __init__(self, input_lang, output_lang, hidden_size, n_layers=1, bidirectional=False): 111 | super(VanillaSequenceToSequence, self).__init__() 112 | self.input_lang = input_lang 113 | self.output_lang = output_lang 114 | self.encoder = EncoderRNN(input_lang.n_chars, hidden_size, n_layers, bidirectional) 115 | self.decoder = DecoderRNN(output_lang.n_chars, hidden_size, n_layers, bidirectional) 116 | 117 | def init_hidden(self, batch_size): 118 | return self.encoder.init_hidden(batch_size) 119 | 120 | def get_SOS_vec(self, batch_size): 121 | return Variable(torch.LongTensor([[self.output_lang.SOS_ind]] * batch_size)) 122 | 123 | def forward(self, input, hidden, target=None, teacher_r=0, max_output_length=None): 124 | # DONE: Should really use Module.train and Module.eval to set the training flag, then handle the different logic inside the forward function. This separate evaluation function is repetitive and will not needed in that case. 125 | # NOTE: hidden always has second index being the batch index. 126 | batch_size = hidden.size()[1] 127 | target_size = 0 if target is None else target.size()[1] 128 | encoder_output, encoded = self.encoder(input, hidden) 129 | slices = [] 130 | embeded_outputs = [] 131 | hidden = encoded 132 | output_char_batch = self.get_SOS_vec(batch_size) 133 | # TODO: make it so end of sentence for all elements in batch trigger end of while loop. 134 | eos_flags = list(range(batch_size)) 135 | while len(slices) < (max_output_length or self.args.MAX_OUTPUT_LEN) \ 136 | and len(eos_flags) != 0: 137 | # char_slice size(b_size, 1), is correct 138 | ## This is where you add teacher forcing 139 | index = len(slices) 140 | i_vec = Variable(torch.LongTensor([index])) 141 | # TODO: use tensor combine/add_with_mask operator here instead. 142 | output_slice_forced = output_char_batch \ 143 | if random.random() > teacher_r or index >= target_size \ 144 | else target.index_select(1, i_vec) 145 | output_embedded, hidden = self.decoder(self.decoder.embed(output_slice_forced), hidden) 146 | # convert embedded to class_index here 147 | output_char_batch = self.decoder.extract(output_embedded) 148 | 149 | # Now add the slices to the output stack. 
char_slice(b_size, 1) -> size(b_size) 150 | slices.append(output_char_batch.view(batch_size)) 151 | embeded_outputs.append(output_embedded.view(batch_size, -1)) 152 | 153 | for ind, s in enumerate(output_char_batch): 154 | s_index = int(s.data.numpy()[0]) 155 | if ind in eos_flags: 156 | if s_index == self.output_lang.EOS_ind: 157 | eos_flags.remove(ind) 158 | 159 | # TODO: fix mismatch output between evaluate and forward. 160 | return torch.stack(slices, dim=1), hidden, torch.stack(embeded_outputs, dim=1) 161 | 162 | def setup_training(self, learning_rate): 163 | self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 164 | self.optimizer.zero_grad() 165 | self.loss_fn = nn.CrossEntropyLoss() 166 | self.init_hidden_() 167 | 168 | def load(self, fn): 169 | # TODO: load the input and output language as well, to maintain the char list. 170 | checkpoint = torch.load(fn) 171 | self.load_state_dict(checkpoint['state_dict']) 172 | self.input_lang.load_dict(checkpoint['input_lang']) 173 | self.output_lang.load_dict(checkpoint['output_lang']) 174 | return checkpoint 175 | 176 | def save(self, fn="seq-to-seq.cp", meta=None, **kwargs): 177 | # TODO: save input and output language as well, to maintain the char list. 178 | d = {k: kwargs[k] for k in kwargs} 179 | d["state_dict"] = self.state_dict() 180 | d["input_lang"] = vars(self.input_lang) 181 | d["output_lang"] = vars(self.output_lang) 182 | if meta is not None: 183 | d['meta'] = meta 184 | torch.save(d, fn) 185 | -------------------------------------------------------------------------------- /neural_vocoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/neural_vocoder/__init__.py -------------------------------------------------------------------------------- /neural_vocoder/neural_vocoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn as nn 4 | import torchvision.datasets as dsets 5 | import torchvision.transforms as transforms 6 | from torch.autograd import Variable 7 | 8 | class NeuralVocoder(nn.Module): 9 | def __init__(self): 10 | super(NeuralVocoder, self).__init__() 11 | -------------------------------------------------------------------------------- /neural_vocoder/recursive_dilation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | import torch.nn as nn 4 | import torchvision.datasets as dsets 5 | import torchvision.transforms as transforms 6 | from torch.autograd import Variable 7 | 8 | class RecursiveDilation(nn.Module): 9 | def __init__(self): 10 | super(RecursiveDilation, self).__init__() 11 | -------------------------------------------------------------------------------- /scratch/rnn_sample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | 5 | # test script 6 | rnn = nn.GRU(input_size=10, hidden_size=200, num_layers=2, bidirectional=True) 7 | 8 | x0 = Variable(torch.randn(15, 100, 10)) 9 | h0 = Variable(torch.randn(2 * 200, 100, 200)) 10 | x, h = rnn(x0, h0) 11 | -------------------------------------------------------------------------------- /text_to_features/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/text_to_features/__init__.py -------------------------------------------------------------------------------- /text_to_features/text_to_feature.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn as nn 4 | 5 | import utils 6 | 7 | 8 | class EncoderRNN(nn.Module): 9 | def __init__(self, n_words, embedding_size, n_layers=1, bidirectional=False): 10 | super(EncoderRNN, self).__init__() 11 | self.char_set = n_words 12 | self.hidden_size = embedding_size 13 | self.n_layers = n_layers 14 | if bidirectional: 15 | self.bi_multiplier = 2 16 | else: 17 | self.bi_multiplier = 1 18 | 19 | self.embedding = nn.Embedding(n_words, embedding_size) 20 | self.gru = nn.GRU(embedding_size, embedding_size, batch_first=True) 21 | 22 | def forward(self, input, hidden): 23 | """batch index goes first.""" 24 | batch_size = input.size()[0] 25 | embeded = self.embedding(input).view(batch_size, -1, self.hidden_size) 26 | output, hidden = self.gru(embeded, hidden) 27 | return output, hidden 28 | 29 | def init_hidden(self, batch_size, random=False): 30 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 31 | if random: 32 | return Variable( 33 | torch.randn( 34 | self.bi_multiplier * self.n_layers, 35 | batch_size, 36 | self.hidden_size)) 37 | else: 38 | return Variable( 39 | torch.zeros( 40 | self.bi_multiplier * self.n_layers, 41 | batch_size, 42 | self.hidden_size)) 43 | 44 | 45 | class DecoderRNN(nn.Module): 46 | def __init__(self, n_words, embedding_size, n_layers=1, bidirectional=False): 47 | """Need to impedance match input and output. Input is class_index, output is class_index, 48 | but we also need the softmax raw during training, since it contains more information.""" 49 | super(DecoderRNN, self).__init__() 50 | self.n_words = n_words 51 | self.hidden_size = embedding_size 52 | self.n_layers = n_layers 53 | if bidirectional: 54 | self.bi_multiplier = 2 55 | else: 56 | self.bi_multiplier = 1 57 | 58 | self.embedding = nn.Embedding(self.n_words, embedding_size) 59 | # add dropout 60 | self.gru = nn.GRU(embedding_size, embedding_size, batch_first=True) 61 | self.output_embedding = nn.Linear(self.hidden_size, n_words) 62 | self.output_softmax = nn.Softmax() 63 | 64 | def embed(self, input_word_indexes): 65 | """word index to embedding""" 66 | batch_size = input_word_indexes.size()[0] 67 | embeded = self.embedding(input_word_indexes).view(batch_size, -1, self.hidden_size) 68 | return embeded 69 | 70 | def forward(self, embeded, hidden): 71 | """batch index goes first. 
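The embedded input carries hidden_size features per step; the GRU output is projected through `output_embedding`, giving one score per word at every step.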
Input and output are both size <,,n_words>""" 72 | batch_size = embeded.size()[0] 73 | output, hidden = self.gru(embeded, hidden) 74 | output_embeded = self.output_embedding(output.view(-1, self.hidden_size)).view(batch_size, -1, self.n_words) 75 | return output_embeded, hidden 76 | 77 | def extract(self, output_embeded): 78 | """word embedding to class indexes""" 79 | b_size, seq_len, n_words = output_embeded.size() 80 | output_softmax = self.output_softmax(output_embeded.view(-1, n_words)) 81 | # TODO: alternatively: output_words = output_softmax.multinomial(1).view(batch_size, -1) 82 | _, output_word_indexes = output_softmax.topk(1, dim=1) # .view(batch_size, -1) 83 | return output_word_indexes.view(b_size, seq_len) 84 | 85 | 86 | def init_hidden(self, batch_size, random=False): 87 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 88 | if random: 89 | return Variable( 90 | torch.randn( 91 | self.bi_multiplier * self.n_layers, 92 | batch_size, 93 | self.hidden_size)) 94 | else: 95 | return Variable( 96 | torch.zeros( 97 | self.bi_multiplier * self.n_layers, 98 | batch_size, 99 | self.hidden_size)) # TODO: attention 100 | 101 | 102 | # 1. [ ] get training hooked up 103 | # 1. [ ] 104 | 105 | import random 106 | 107 | 108 | class VanillaSequenceToSequence(nn.Module): 109 | def __init__(self, input_lang, output_lang, hidden_size, n_layers=1, bidirectional=False): 110 | super(VanillaSequenceToSequence, self).__init__() 111 | self.input_lang = input_lang 112 | self.output_lang = output_lang 113 | self.encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers, bidirectional) 114 | self.decoder = DecoderRNN(output_lang.n_words, hidden_size, n_layers, bidirectional) 115 | 116 | def init_hidden(self, batch_size): 117 | return self.encoder.init_hidden(batch_size) 118 | 119 | def get_SOS_vec(self, batch_size): 120 | return Variable(torch.LongTensor([[self.output_lang.SOS_ind]] * batch_size)) 121 | 122 | def forward(self, input, hidden, target=None, teacher_r=0, max_output_length=None): 123 | # DONE: Should really use Module.train and Module.eval to set the training flag, then handle the different logic inside the forward function. This separate evaluation function is repetitive and will not needed in that case. 124 | # NOTE: hidden always has second index being the batch index. 125 | batch_size = hidden.size()[1] 126 | target_size = 0 if target is None else target.size()[1] 127 | encoder_output, encoded = self.encoder(input, hidden) 128 | slices = [] 129 | embeded_outputs = [] 130 | hidden = encoded 131 | output_word_batch = self.get_SOS_vec(batch_size) 132 | # TODO: make it so end of sentence for all elements in batch trigger end of while loop. 133 | eos_flags = list(range(batch_size)) 134 | while len(slices) < (max_output_length or self.args.MAX_OUTPUT_LEN) \ 135 | and len(eos_flags) != 0: 136 | # word_slice size(b_size, 1), is correct 137 | ## This is where you add teacher forcing 138 | index = len(slices) 139 | i_vec = Variable(torch.LongTensor([index])) 140 | # TODO: use tensor combine/add_with_mask operator here instead. 141 | output_slice_forced = output_word_batch \ 142 | if random.random() > teacher_r or index >= target_size \ 143 | else target.index_select(1, i_vec) 144 | output_embedded, hidden = self.decoder(self.decoder.embed(output_slice_forced), hidden) 145 | # convert embedded to class_index here 146 | output_word_batch = self.decoder.extract(output_embedded) 147 | 148 | # Now add the slices to the output stack. 
word_slice(b_size, 1) -> size(b_size) 149 | slices.append(output_word_batch.view(batch_size)) 150 | embeded_outputs.append(output_embedded.view(batch_size, -1)) 151 | 152 | for ind, s in enumerate(output_word_batch): 153 | s_index = int(s.data.numpy()[0]) 154 | if ind in eos_flags: 155 | if s_index == self.output_lang.EOS_ind: 156 | eos_flags.remove(ind) 157 | 158 | # TODO: fix mismatch output between evaluate and forward. 159 | return torch.stack(slices, dim=1), hidden, torch.stack(embeded_outputs, dim=1) 160 | 161 | def setup_training(self, learning_rate): 162 | self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 163 | self.optimizer.zero_grad() 164 | self.loss_fn = nn.CrossEntropyLoss() 165 | self.init_hidden_() 166 | 167 | def load(self, fn): 168 | # TODO: load the input and output language as well, to maintain the word list. 169 | checkpoint = torch.load(fn) 170 | self.load_state_dict(checkpoint['state_dict']) 171 | self.input_lang.load_dict(checkpoint['input_lang']) 172 | self.output_lang.load_dict(checkpoint['output_lang']) 173 | return checkpoint 174 | 175 | def save(self, fn="seq-to-seq.cp", meta=None, **kwargs): 176 | # TODO: save input and output language as well, to maintain the word list. 177 | d = {k: kwargs[k] for k in kwargs} 178 | d["state_dict"] = self.state_dict() 179 | d["input_lang"] = vars(self.input_lang) 180 | d["output_lang"] = vars(self.output_lang) 181 | if meta is not None: 182 | d['meta'] = meta 183 | torch.save(d, fn) 184 | -------------------------------------------------------------------------------- /word_seq_2_seq/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .idea 3 | 4 | DS_Store 5 | 6 | -------------------------------------------------------------------------------- /word_seq_2_seq/Language Pair Scraper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true, 8 | "run_control": { 9 | "frozen": false, 10 | "read_only": false 11 | } 12 | }, 13 | "source": [ 14 | "# Simple Downloader\n", 15 | "\n", 16 | "Scraping and downloading stuff from the internet is commonly the first step for every experiment. here is a simple Page class that has bunch of helper methods that makes this type of work much much simpler.\n", 17 | "\n", 18 | "\n", 19 | "**Async and Multi-process crawing is much much faster**. I initially wrote the engadget crawer as a single threaded class. Because the python `requests` library is synchronous, the crawler spent virtually all time waiting for the `GET` requests.\n", 20 | " \n", 21 | " This could be made a *lot* faster by parallelizing the crawling, or use proper async pattern. \n", 22 | "\n", 23 | " This thought came to me pretty late during the second crawl so I did not implement it. But for future work, parallel and async crawler is going to be on the todo list.\n", 24 | " \n", 25 | " \n", 26 | "## TODO\n", 27 | "\n", 28 | "- [ ] use async pattern for the requests, so that we don't spend 90% of the time waiting for `GET` request to finish.\n", 29 | "- [ ] use multiple-threads to craw." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%load_ext autoreload\n", 41 | "%autoreload 2" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%%bash \n", 53 | "mkdir data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false, 61 | "scrolled": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "from download_links import Page\n", 66 | "\n", 67 | "p = Page('http://www.manythings.org/anki/', debug=True)\n", 68 | "p.set_mask('(.*).zip')\n", 69 | "p.request()\n", 70 | "\n", 71 | "for m, n in p.get_anchors():\n", 72 | " n_p = Page(p.url + m[0], debug=True)\n", 73 | " n_p.download('./data/' + m[0], 'wb', chunk_size=4096*2**4)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 96, 79 | "metadata": { 80 | "collapsed": false, 81 | "deletable": true, 82 | "editable": true, 83 | "run_control": { 84 | "frozen": false, 85 | "read_only": false 86 | } 87 | }, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "[NbConvertApp] Converting notebook Language Pair Scraper.ipynb to script\n", 94 | "[NbConvertApp] Writing 1632 bytes to Language Pair Scraper.py\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "!jupyter nbconvert --to script \"Language Pair Scraper.ipynb\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 97, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "Process is interrupted.\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "%%bash\n", 119 | "source activate deep-learning\n", 120 | "rm crawl.out\n", 121 | "ipython 'Language Pair Scraper.py' > crawl.out" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "doesn't work." 
129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "%%bash\n", 140 | "tail -f crawl.out" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "" 152 | ] 153 | } 154 | ], 155 | "metadata": { 156 | "anaconda-cloud": {}, 157 | "hide_input": false, 158 | "kernelspec": { 159 | "display_name": "Python [conda env:deep-learning]", 160 | "language": "python", 161 | "name": "conda-env-deep-learning-py" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3.0 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.6.0" 174 | }, 175 | "toc": { 176 | "colors": { 177 | "hover_highlight": "#DAA520", 178 | "running_highlight": "#FF0000", 179 | "selected_highlight": "#FFD700" 180 | }, 181 | "moveMenuLeft": true, 182 | "nav_menu": { 183 | "height": "305px", 184 | "width": "280px" 185 | }, 186 | "navigate_menu": true, 187 | "number_sections": true, 188 | "sideBar": true, 189 | "threshold": 4.0, 190 | "toc_cell": false, 191 | "toc_section_display": "block", 192 | "toc_window_display": false, 193 | "widenNotebook": false 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 0 198 | } -------------------------------------------------------------------------------- /word_seq_2_seq/README.md: -------------------------------------------------------------------------------- 1 | # Deep Machine Translation [Work-In-Progress 2017-04-02] 2 | [![](https://img.shields.io/badge/link_on-GitHub-brightgreen.svg?style=flat-square)](https://github.com/episodeyang/deep_machine_translation/tree/master#deep-machine-translation-work-in-progress-2017-04-02) 3 | 4 | 5 | This is a fun week-long project I did to implement a sequence to sequence model in PyTorch. The project uses language pairs from the Anki project as the training set. 6 | 7 | ## Usage 8 | 9 | #### work-in-progress 10 | 11 | 1. First unzip a language pair. use `eng-cmn.txt` from the [training-data](training-data/) folder for example. 12 | 13 | 2. 
Then run the script below (this will eventually move into the Makefile): 14 | 15 | ```bash 16 | python train.py -cf=checkpoints 17 | ``` 18 | 19 | ### Command Line Options 20 | 21 | ```bash 22 | usage: train.py [-h] [-d DEBUG] [-cf CHECKPOINT_FOLDER] [-cp CHECKPOINT] 23 | [--checkpoint-batch-stamp CHECKPOINT_BATCH_STAMP] 24 | [-il INPUT_LANG] [-ol OUTPUT_LANG] 25 | [--max-data-len MAX_DATA_LEN] [--dash-id DASH_ID] 26 | [--batch-size BATCH_SIZE] [--learning-rate LEARNING_RATE] 27 | [--n-epoch N_EPOCH] [-e EVAL_INTERVAL] 28 | [--teacher-forcing-r TEACHER_FORCING_R] [-s SAVE_INTERVAL] 29 | [--n-layers N_LAYERS] [--bi-directional BI_DIRECTIONAL] 30 | 31 | Sequence-To-Sequence Model in PyTorch 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | -d DEBUG, --debug DEBUG 36 | debug mode prints more info 37 | -cf CHECKPOINT_FOLDER, --checkpoint-folder CHECKPOINT_FOLDER 38 | folder where it saves checkpoint files 39 | -cp CHECKPOINT, --checkpoint CHECKPOINT 40 | the checkpoint to load 41 | --checkpoint-batch-stamp CHECKPOINT_BATCH_STAMP 42 | the checkpoint to load 43 | -il INPUT_LANG, --input-lang INPUT_LANG 44 | code name for the input language 45 | -ol OUTPUT_LANG, --output-lang OUTPUT_LANG 46 | code name for the output language 47 | --max-data-len MAX_DATA_LEN 48 | maximum length for input output pairs (words) 49 | --dash-id DASH_ID maximum length for input output pairs 50 | --batch-size BATCH_SIZE 51 | maximum length for input output pairs 52 | --learning-rate LEARNING_RATE 53 | maximum length for input output pairs 54 | --n-epoch N_EPOCH number of epochs to train 55 | -e EVAL_INTERVAL, --eval-interval EVAL_INTERVAL 56 | evaluate model on validation set 57 | --teacher-forcing-r TEACHER_FORCING_R 58 | Float for the teacher-forcing ratio 59 | -s SAVE_INTERVAL, --save-interval SAVE_INTERVAL 60 | evaluate model on validation set 61 | --n-layers N_LAYERS maximum length for input output pairs 62 | --bi-directional BI_DIRECTIONAL 63 | whether use bi-directional module for the model 64 | ``` 65 | 66 | ## Key Learning 67 | 68 | - **Good demos train fast**. Sean Robertson's demos are very nice for teaching, partially because the training converges very quickly. Most of his demos converge in less than half an hour on a MacBook Pro. 69 | 70 | - **Write Evaluation Functions Early**. You can't manually check whether the results make sense until you have written the evaluation functions. 71 | 72 | When everything is done correctly, the evaluation gives sensible results very quickly. 73 | 74 | - **Teacher forcing can be a hyper-parameter**. During training, we can tune the teacher-forcing ratio between 0 and 1. 75 | 76 | - **Mini-batching hugely improves training speed**. Here I used a mini-batch of 128 pairs. For the small English-to-Chinese Anki dataset, the training takes about 1 min 30 s on an i7 PC, and the translated results are acceptable within a few epochs. 77 | 78 | - **Training and the loss function need more work**. The loss function used here feels a bit unsatisfactory. 79 | 80 | ## TODO List 81 | 82 | - [x] polish demo 83 | - [x] write sequence to sequence example 84 | - [x] get both training and evaluation to work 85 | - [ ] Add BLEU as an accuracy metric 86 | - [ ] Add confusion matrix to the demo.
87 | - [ ] Add unzip script for languages 88 | - [ ] polish repo 89 | - [ ] Compare results with attention model 90 | 91 | ## DONE 92 | - [x] write data scraper, download zip files from anki 93 | - [x] convert zip to text file 94 | - [x] write evaluation function 95 | -------------------------------------------------------------------------------- /word_seq_2_seq/TODO.md: -------------------------------------------------------------------------------- 1 | # TODO List 2 | - [ ] write sequence to sequence example 3 | 4 | ## DONE 5 | - [x] write data scraper, download zip files from anki 6 | - [ ] convert zip to text file 7 | -------------------------------------------------------------------------------- /word_seq_2_seq/Translation via Sequence to Sequence Model with Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "%load_ext autoreload\n", 14 | "%autoreload 2" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "source": [ 24 | "## Main Objectives\n", 25 | "\n", 26 | "0. [ ] write a language translation model for English to Chinese\n", 27 | "1. [ ] Congusion Matrix\n", 28 | "2. [ ] Add Attention\n", 29 | "\n", 30 | "## Todo\n", 31 | "\n", 32 | "1. [ ] add evaluate method\n", 33 | "\n", 34 | "1. [ ] use torch.utils.dataloader and stick with a standard api for data.\n", 35 | "2. [ ] make input and output language configurable\n", 36 | "3. [ ] train multiple models in parallel, with different language, batch_size etc\n", 37 | "4. [ ] how does batch size affect loss?\n", 38 | "5. [ ] add evaluation print. What is the metric to use for evaluating translation? Take a look at Roberson's notes." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "source": [ 48 | "## Usage of this Training script\n", 49 | "\n", 50 | "### Output Figures are saved by hyper parameter id.\n", 51 | "\n", 52 | "for example, batch_size etc. \n", 53 | "\n", 54 | "loss vs time vs sample." 
55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "collapsed": false, 61 | "deletable": true, 62 | "editable": true 63 | }, 64 | "source": [ 65 | "%%bash\n", 66 | "source activate deep-learning\n", 67 | "python -u train.py --batch-size=100 --n-epoch=10" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 2, 73 | "metadata": { 74 | "collapsed": false, 75 | "deletable": true, 76 | "editable": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from utils import ledger, Struct\n", 81 | "import data\n", 82 | "from language import get_language_pairs\n", 83 | "from visdom_helper import visdom_helper\n", 84 | "from model import VanillaSequenceToSequence" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": { 91 | "collapsed": false, 92 | "deletable": true, 93 | "editable": true 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "****************\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "args = Struct(**{'BATCH_SIZE': 10,\n", 106 | " 'BI_DIRECTIONAL': False,\n", 107 | " 'DASH_ID': 'seq-to-seq-experiment',\n", 108 | " 'DEBUG': True,\n", 109 | " 'EVAL_INTERVAL': 10,\n", 110 | " 'TEACHER_FORCING_R': 0.5,\n", 111 | " 'INPUT_LANG': 'eng',\n", 112 | " 'LEARNING_RATE': 0.001,\n", 113 | " 'MAX_DATA_LEN': 10,\n", 114 | " 'MAX_OUTPUT_LEN': 100,\n", 115 | " 'N_EPOCH': 5,\n", 116 | " 'N_LAYERS': 1,\n", 117 | " 'OUTPUT_LANG': 'cmn',\n", 118 | " 'SAVE_INTERVAL': 100})" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "metadata": { 125 | "collapsed": false, 126 | "deletable": true, 127 | "editable": true 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "****************\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "from train import Session" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": { 146 | "collapsed": false, 147 | "deletable": true, 148 | "editable": true 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "{'d': {'BATCH_SIZE': 10,\n", 156 | " 'BI_DIRECTIONAL': False,\n", 157 | " 'DASH_ID': 'seq-to-seq-experiment',\n", 158 | " 'DEBUG': True,\n", 159 | " 'EVAL_INTERVAL': 10,\n", 160 | " 'INPUT_LANG': 'eng',\n", 161 | " 'LEARNING_RATE': 0.001,\n", 162 | " 'MAX_DATA_LEN': 10,\n", 163 | " 'N_EPOCH': 5,\n", 164 | " 'N_LAYERS': 1,\n", 165 | " 'OUTPUT_LANG': 'cmn',\n", 166 | " 'SAVE_INTERVAL': 100}}\n", 167 | "\u001b[34mReading Lines... 
\u001b[0m\n", 168 | "\u001b[32mNumber of sentence pairs after filtering: 15811\u001b[0m\n", 169 | "\u001b[32meng\u001b[0m has \u001b[32m5982\u001b[0m words.\n", 170 | "\u001b[32mcmn\u001b[0m has \u001b[32m3206\u001b[0m words.\n", 171 | "Sequence to sequence model graph:\n", 172 | " VanillaSequenceToSequence (\n", 173 | " (encoder): EncoderRNN (\n", 174 | " (embedding): Embedding(5982, 200)\n", 175 | " (gru): GRU(200, 200, batch_first=True)\n", 176 | " )\n", 177 | " (decoder): DecoderRNN (\n", 178 | " (embedding): Embedding(3206, 200)\n", 179 | " (gru): GRU(200, 200, batch_first=True)\n", 180 | " (output_embedding): Linear (200 -> 3206)\n", 181 | " (softmax): Softmax ()\n", 182 | " )\n", 183 | ")\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "sess = Session(args)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": { 195 | "collapsed": false, 196 | "deletable": true, 197 | "editable": true, 198 | "scrolled": false 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "1582it [01:47, 14.77it/s]" 206 | ] 207 | }, 208 | { 209 | "name": "stdout", 210 | "output_type": "stream", 211 | "text": [ 212 | "\u001b[32mepoch 0 is complete\u001b[0m\n" 213 | ] 214 | }, 215 | { 216 | "name": "stderr", 217 | "output_type": "stream", 218 | "text": [ 219 | "\n", 220 | "1582it [01:51, 10.45it/s]" 221 | ] 222 | }, 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "\u001b[32mepoch 1 is complete\u001b[0m\n" 228 | ] 229 | }, 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "\n", 235 | "1582it [01:50, 10.85it/s]" 236 | ] 237 | }, 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "\u001b[32mepoch 2 is complete\u001b[0m\n" 243 | ] 244 | }, 245 | { 246 | "name": "stderr", 247 | "output_type": "stream", 248 | "text": [ 249 | "\n", 250 | "1582it [01:49, 14.51it/s]" 251 | ] 252 | }, 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "\u001b[32mepoch 3 is complete\u001b[0m\n" 258 | ] 259 | }, 260 | { 261 | "name": "stderr", 262 | "output_type": "stream", 263 | "text": [ 264 | "\n", 265 | "1582it [01:49, 14.39it/s]" 266 | ] 267 | }, 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "\u001b[32mepoch 4 is complete\u001b[0m\n" 273 | ] 274 | }, 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "for i in range(args.N_EPOCH):\n", 285 | " sess.train()\n", 286 | " sess.ledger.green('epoch {} is complete'.format(i))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 7, 292 | "metadata": { 293 | "collapsed": true, 294 | "deletable": true, 295 | "editable": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "# TODO: somehow the evaluation always stops." 
300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 8, 305 | "metadata": { 306 | "collapsed": false, 307 | "deletable": true, 308 | "editable": true 309 | }, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "\u001b[37mattention-networks\u001b[0m/\u001b[32mlanguage.py\u001b[0m \u001b[37mL\u001b[0m\u001b[31m68:\u001b[0m \u001b[30mledger.debug(indexes[0])\u001b[0m\n", 316 | "\u001b[37m----------------------\u001b[0m\n", 317 | "1\n", 318 | "\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "sentence = sess.evaluate('This is a job.')\n", 324 | "print(sentence)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "collapsed": true, 332 | "deletable": true, 333 | "editable": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "anaconda-cloud": {}, 343 | "kernelspec": { 344 | "display_name": "deep-learning-python3", 345 | "language": "python", 346 | "name": "deep-learning" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3.0 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.6.0" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 0 363 | } -------------------------------------------------------------------------------- /word_seq_2_seq/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/__init__.py -------------------------------------------------------------------------------- /word_seq_2_seq/crawl.out: -------------------------------------------------------------------------------- 1 | ]0;IPython: pytorch_playground/attention-networks******* download_links Module is reloaded ******* 2 | Request to http://www.manythings.org/anki/afr-eng.zip has been made.. File ./data/afr-eng.zip has been saved. 3 | Request to http://www.manythings.org/anki/ara-eng.zip has been made..... File ./data/ara-eng.zip has been saved. 4 | Request to http://www.manythings.org/anki/aze-eng.zip has been made.. File ./data/aze-eng.zip has been saved. 5 | Request to http://www.manythings.org/anki/bel-eng.zip has been made.. File ./data/bel-eng.zip has been saved. 6 | Request to http://www.manythings.org/anki/ben-eng.zip has been made... File ./data/ben-eng.zip has been saved. 7 | Request to http://www.manythings.org/anki/ber-eng.zip has been made............ File ./data/ber-eng.zip has been saved. 8 | Request to http://www.manythings.org/anki/bul-eng.zip has been made....... File ./data/bul-eng.zip has been saved. 9 | Request to http://www.manythings.org/anki/yue-eng.zip has been made... File ./data/yue-eng.zip has been saved. 10 | Request to http://www.manythings.org/anki/cat-eng.zip has been made.. File ./data/cat-eng.zip has been saved. 11 | Request to http://www.manythings.org/anki/cbk-eng.zip has been made.. File ./data/cbk-eng.zip has been saved. 12 | Request to http://www.manythings.org/anki/cmn-eng.zip has been made......... File ./data/cmn-eng.zip has been saved. 13 | Request to http://www.manythings.org/anki/hrv-eng.zip has been made.. File ./data/hrv-eng.zip has been saved. 14 | Request to http://www.manythings.org/anki/ces-eng.zip has been made.... 
File ./data/ces-eng.zip has been saved. 15 | Request to http://www.manythings.org/anki/dan-eng.zip has been made...... File ./data/dan-eng.zip has been saved. 16 | Request to http://www.manythings.org/anki/nld-eng.zip has been made.......... File ./data/nld-eng.zip has been saved. 17 | Request to http://www.manythings.org/anki/eng-eng.zip has been made.... File ./data/eng-eng.zip has been saved. 18 | Request to http://www.manythings.org/anki/est-eng.zip has been made.. File ./data/est-eng.zip has been saved. 19 | Request to http://www.manythings.org/anki/fin-eng.zip has been made................. File ./data/fin-eng.zip has been saved. 20 | Request to http://www.manythings.org/anki/fra-eng.zip has been made............................................. File ./data/fra-eng.zip has been saved. 21 | Request to http://www.manythings.org/anki/deu-eng.zip has been made........................................................ File ./data/deu-eng.zip has been saved. 22 | Request to http://www.manythings.org/anki/ell-eng.zip has been made..... File ./data/ell-eng.zip has been saved. 23 | Request to http://www.manythings.org/anki/heb-eng.zip has been made................................ File ./data/heb-eng.zip has been saved. 24 | Request to http://www.manythings.org/anki/hin-eng.zip has been made... File ./data/hin-eng.zip has been saved. 25 | Request to http://www.manythings.org/anki/hun-eng.zip has been made................. File ./data/hun-eng.zip has been saved. 26 | Request to http://www.manythings.org/anki/isl-eng.zip has been made.... File ./data/isl-eng.zip has been saved. 27 | Request to http://www.manythings.org/anki/ind-eng.zip has been made.... File ./data/ind-eng.zip has been saved. 28 | Request to http://www.manythings.org/anki/ita-eng.zip has been made.................................................. File ./data/ita-eng.zip has been saved. 29 | Request to http://www.manythings.org/anki/jpn-eng.zip has been made.................... File ./data/jpn-eng.zip has been saved. 30 | Request to http://www.manythings.org/anki/kha-eng.zip has been made.. File ./data/kha-eng.zip has been saved. 31 | Request to http://www.manythings.org/anki/khm-eng.zip has been made.. File ./data/khm-eng.zip has been saved. 32 | Request to http://www.manythings.org/anki/kor-eng.zip has been made.. File ./data/kor-eng.zip has been saved. 33 | Request to http://www.manythings.org/anki/lvs-eng.zip has been made.. File ./data/lvs-eng.zip has been saved. 34 | Request to http://www.manythings.org/anki/lit-eng.zip has been made.. File ./data/lit-eng.zip has been saved. 35 | Request to http://www.manythings.org/anki/nds-eng.zip has been made... File ./data/nds-eng.zip has been saved. 36 | Request to http://www.manythings.org/anki/mkd-eng.zip has been made...................... File ./data/mkd-eng.zip has been saved. 37 | Request to http://www.manythings.org/anki/mal-eng.zip has been made.. File ./data/mal-eng.zip has been saved. 38 | Request to http://www.manythings.org/anki/mar-eng.zip has been made.......... File ./data/mar-eng.zip has been saved. 39 | Request to http://www.manythings.org/anki/nob-eng.zip has been made... File ./data/nob-eng.zip has been saved. 40 | Request to http://www.manythings.org/anki/pes-eng.zip has been made.. File ./data/pes-eng.zip has been saved. 41 | Request to http://www.manythings.org/anki/pol-eng.zip has been made.............. File ./data/pol-eng.zip has been saved. 42 | Request to http://www.manythings.org/anki/por-eng.zip has been made................................ 
File ./data/por-eng.zip has been saved. 43 | Request to http://www.manythings.org/anki/ron-eng.zip has been made.... File ./data/ron-eng.zip has been saved. 44 | Request to http://www.manythings.org/anki/rus-eng.zip has been made...................................................................................... File ./data/rus-eng.zip has been saved. 45 | Request to http://www.manythings.org/anki/srp-eng.zip has been made.... File ./data/srp-eng.zip has been saved. 46 | Request to http://www.manythings.org/anki/slk-eng.zip has been made.. File ./data/slk-eng.zip has been saved. 47 | Request to http://www.manythings.org/anki/spa-eng.zip has been made.......................................... File ./data/spa-eng.zip has been saved. 48 | Request to http://www.manythings.org/anki/swe-eng.zip has been made...... File ./data/swe-eng.zip has been saved. 49 | Request to http://www.manythings.org/anki/tgl-eng.zip has been made... File ./data/tgl-eng.zip has been saved. 50 | Request to http://www.manythings.org/anki/tat-eng.zip has been made.. File ./data/tat-eng.zip has been saved. 51 | Request to http://www.manythings.org/anki/tur-eng.zip has been made.................................................................................................................................... File ./data/tur-eng.zip has been saved. 52 | Request to http://www.manythings.org/anki/ukr-eng.zip has been made......... File ./data/ukr-eng.zip has been saved. 53 | Request to http://www.manythings.org/anki/urd-eng.zip has been made.. File ./data/urd-eng.zip has been saved. 54 | Request to http://www.manythings.org/anki/vie-eng.zip has been made... File ./data/vie-eng.zip has been saved. 55 | -------------------------------------------------------------------------------- /word_seq_2_seq/data.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import language 4 | from utils import ledger 5 | 6 | 7 | def normalize_strings(s): 8 | s = s.lower().strip() 9 | # http://stackoverflow.com/a/518232/2809427 10 | # disable below since it can be learned. 11 | # s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn') 12 | s = re.sub(r"([.!?;:@])", r" \1", s) 13 | # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 14 | return s 15 | 16 | 17 | def read_language(l1, l2, normalize_fn=None): 18 | ledger.info('Reading Lines... 
') 19 | with open('./training-data/{}-{}.txt'.format(l1, l2)) as f: 20 | lines = f.read().strip().split('\n') 21 | for l in lines: 22 | if normalize_fn is None: 23 | yield l.split('\t') 24 | else: 25 | yield [normalize_fn(s) for s in l.split('\t')] 26 | 27 | 28 | def trim_by_length(length, token_sep=' '): 29 | def trim(p): 30 | if length <= 0: 31 | return True 32 | elif len(p[0].split(token_sep)) > length or len(p[1].split(token_sep)) > length: 33 | return False 34 | return True 35 | 36 | return trim 37 | 38 | 39 | import math 40 | 41 | 42 | def get_batch(pairs, batch_size): 43 | n_pairs = len(pairs) 44 | n_batch = math.ceil(n_pairs / batch_size) 45 | for i in range(n_batch): 46 | sent_1 = [] 47 | sent_2 = [] 48 | # for batch_size 1 no padding is needed 49 | if batch_size == 1: 50 | p = pairs[i] 51 | sent_1.append(p[0]) 52 | sent_2.append(p[1]) 53 | yield sent_1, sent_2 54 | else: 55 | input_max_length = 0 56 | output_max_length = 0 57 | for p in pairs[i * batch_size:min((i + 1) * batch_size, n_pairs)]: 58 | input_max_length = max(len(p[0]), input_max_length) 59 | output_max_length = max(len(p[1]), output_max_length) 60 | 61 | for p in pairs[i * batch_size:min((i + 1) * batch_size, n_pairs)]: 62 | # [2] is [], [0] [] 63 | sent_1.append([1] + p[0] + [2] + [0] * (input_max_length - len(p[0]))) 64 | sent_2.append([1] + p[1] + [2] + [0] * (output_max_length - len(p[1]))) 65 | yield sent_1, sent_2 66 | 67 | 68 | if __name__ == "__main__": 69 | ledger.green('Number of sentence pairs: {}'.format(len(list(read_language('eng', 'fra', normalize_strings))))) 70 | max_len = 10 71 | 72 | ps = filter(trim_by_length(max_len), read_language('eng', 'fra', normalize_strings)) 73 | ps = list(ps) 74 | ledger.green('Number of sentence pairs after filtering: {}'.format(len(ps))) 75 | 76 | l1, l2 = language.get_language_pairs('eng', 'fra', ps) 77 | -------------------------------------------------------------------------------- /word_seq_2_seq/download_anki.py: -------------------------------------------------------------------------------- 1 | from scraper import Page 2 | 3 | p = Page('http://www.manythings.org/anki/', debug=True) 4 | p.set_mask('(.*).zip') 5 | p.request() 6 | 7 | for m, n in p.get_anchors(): 8 | n_p = Page(p.url + m[0], debug=True) 9 | n_p.download('./data/' + m[0], 'wb', chunk_size=4096*2**4) -------------------------------------------------------------------------------- /word_seq_2_seq/evaluate.py: -------------------------------------------------------------------------------- 1 | # Evaluate 2 | 3 | 4 | def print_random_pairs(model): 5 | input, output = data_loader.get_pair() 6 | hidden = model.init_hidden() 7 | model(input, output, hidden) 8 | 9 | -------------------------------------------------------------------------------- /word_seq_2_seq/language.py: -------------------------------------------------------------------------------- 1 | # Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427 2 | from utils import ledger 3 | 4 | 5 | def unicodeToAscii(s): 6 | return ''.join( 7 | c for c in unicodedata.normalize('NFD', s) 8 | if unicodedata.category(c) != 'Mn' 9 | ) 10 | 11 | 12 | # Lowercase, trim, and remove non-letter characters 13 | def normalizeString(s): 14 | s = unicodeToAscii(s.lower().strip()) 15 | s = re.sub(r"([.!?])", r" \1", s) 16 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 17 | return s 18 | 19 | 20 | import utils 21 | 22 | CJK_LANGUAGES = ['cmn', 'jpn'] # , 'kor'] 23 | 24 | 25 | class Language: 26 | def __init__(self, name): 27 | self.name = 
name 28 | self.word2index = {} 29 | self.word2count = {} 30 | self.NULL = "" 31 | self.NULL_ind = 0 32 | self.SOS = '' 33 | self.SOS_ind = 1 34 | self.EOS = '' 35 | self.EOS_ind = 2 36 | self.index2word = { 37 | self.NULL_ind: self.NULL, 38 | self.SOS_ind: self.SOS, 39 | self.EOS_ind: self.EOS 40 | } 41 | self.n_words = 3 42 | self.CJK_LANGUAGES = CJK_LANGUAGES 43 | 44 | def add_sentence(self, sentence): 45 | for word in self.tokenize(sentence): 46 | self.add_word(word) 47 | 48 | def add_word(self, word): 49 | if word not in self.word2index: 50 | self.word2index[word] = self.n_words 51 | self.word2count[word] = 1 52 | self.index2word[self.n_words] = word 53 | self.n_words += 1 54 | else: 55 | self.word2count[word] += 1 56 | 57 | def tokenize(self, sentence): 58 | is_cjk = self.name in self.CJK_LANGUAGES 59 | if is_cjk: 60 | # ledger.warn('is CJK language!', lang.name) 61 | return [char for seg in sentence.split(' ') for char in seg] 62 | return sentence.split(' ') 63 | 64 | def sentence_to_indexes(self, sentence): 65 | return [self.word2index[w] for w in self.tokenize(sentence)] 66 | 67 | def indexes_to_sentence(self, indexes): 68 | ledger.debug(indexes[0]) 69 | is_cjk = self.name in self.CJK_LANGUAGES 70 | if is_cjk: 71 | return ''.join([self.index2word[i] for i in indexes]) 72 | return ' '.join([self.index2word[i] for i in indexes]) 73 | 74 | def summarize(self): 75 | utils.ledger.green('{}'.format(self.name), end=' ') 76 | utils.ledger.print('has ', end=' ') 77 | utils.ledger.green(self.n_words, end=' ') 78 | utils.ledger.print('words', end='.\n') 79 | 80 | def __getattribute__(self, key): 81 | if key == "__dict__": 82 | return { 83 | "name": self.name, 84 | "word2index": self.word2index, 85 | "word2count": self.word2count, 86 | "NULL": self.NULL, 87 | "NULL_ind": self.NULL_ind, 88 | "SOS": self.SOS, 89 | "SOS_ind": self.SOS_ind, 90 | "EOS": self.EOS, 91 | "EOS_ind": self.EOS_ind, 92 | "index2word": self.index2word, 93 | "n_words": self.n_words 94 | } 95 | else: 96 | return super().__getattribute__(key) 97 | 98 | def load_dict(self, state_dict): 99 | for k in state_dict: 100 | setattr(self, k, state_dict[k]) 101 | 102 | 103 | def get_language_pairs(name_1, name_2, sentence_pairs): 104 | l1 = Language(name_1) 105 | l2 = Language(name_2) 106 | for p in sentence_pairs: 107 | l1.add_sentence(p[0]) 108 | l2.add_sentence(p[1]) 109 | return l1, l2 110 | 111 | 112 | if __name__ == "__main__": 113 | l = Language('eng') 114 | print(vars(l)) 115 | -------------------------------------------------------------------------------- /word_seq_2_seq/metric.py: -------------------------------------------------------------------------------- 1 | from utils import ledger 2 | from visdom import Visdom 3 | 4 | def plot_accuracy(): 5 | ledger.log('') 6 | -------------------------------------------------------------------------------- /word_seq_2_seq/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn as nn 4 | 5 | import utils 6 | 7 | 8 | class EncoderRNN(nn.Module): 9 | def __init__(self, n_words, embedding_size, n_layers=1, bidirectional=False): 10 | super(EncoderRNN, self).__init__() 11 | self.char_set = n_words 12 | self.hidden_size = embedding_size 13 | self.n_layers = n_layers 14 | if bidirectional: 15 | self.bi_multiplier = 2 16 | else: 17 | self.bi_multiplier = 1 18 | 19 | self.embedding = nn.Embedding(n_words, embedding_size) 20 | self.gru = nn.GRU(embedding_size, embedding_size, 
batch_first=True) 21 | 22 | def forward(self, input, hidden): 23 | """batch index goes first.""" 24 | batch_size = input.size()[0] 25 | embeded = self.embedding(input).view(batch_size, -1, self.hidden_size) 26 | output, hidden = self.gru(embeded, hidden) 27 | return output, hidden 28 | 29 | def init_hidden(self, batch_size, random=False): 30 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 31 | if random: 32 | return Variable( 33 | torch.randn( 34 | self.bi_multiplier * self.n_layers, 35 | batch_size, 36 | self.hidden_size)) 37 | else: 38 | return Variable( 39 | torch.zeros( 40 | self.bi_multiplier * self.n_layers, 41 | batch_size, 42 | self.hidden_size)) 43 | 44 | 45 | class DecoderRNN(nn.Module): 46 | def __init__(self, n_words, embedding_size, n_layers=1, bidirectional=False): 47 | """Need to impedance match input and output. Input is class_index, output is class_index, 48 | but we also need the softmax raw during training, since it contains more information.""" 49 | super(DecoderRNN, self).__init__() 50 | self.n_words = n_words 51 | self.hidden_size = embedding_size 52 | self.n_layers = n_layers 53 | if bidirectional: 54 | self.bi_multiplier = 2 55 | else: 56 | self.bi_multiplier = 1 57 | 58 | self.embedding = nn.Embedding(self.n_words, embedding_size) 59 | # add dropout 60 | self.gru = nn.GRU(embedding_size, embedding_size, batch_first=True) 61 | self.output_embedding = nn.Linear(self.hidden_size, n_words) 62 | self.output_softmax = nn.Softmax() 63 | 64 | def embed(self, input_word_indexes): 65 | """word index to embedding""" 66 | batch_size = input_word_indexes.size()[0] 67 | embeded = self.embedding(input_word_indexes).view(batch_size, -1, self.hidden_size) 68 | return embeded 69 | 70 | def forward(self, embeded, hidden): 71 | """batch index goes first. Input and output are both size <,,n_words>""" 72 | batch_size = embeded.size()[0] 73 | output, hidden = self.gru(embeded, hidden) 74 | output_embeded = self.output_embedding(output.view(-1, self.hidden_size)).view(batch_size, -1, self.n_words) 75 | return output_embeded, hidden 76 | 77 | def extract(self, output_embeded): 78 | """word embedding to class indexes""" 79 | b_size, seq_len, n_words = output_embeded.size() 80 | output_softmax = self.output_softmax(output_embeded.view(-1, n_words)) 81 | # TODO: alternatively: output_words = output_softmax.multinomial(1).view(batch_size, -1) 82 | _, output_word_indexes = output_softmax.topk(1, dim=1) # .view(batch_size, -1) 83 | return output_word_indexes.view(b_size, seq_len) 84 | 85 | 86 | def init_hidden(self, batch_size, random=False): 87 | """remember, hidden layer always has batch_size at index = 1, reguardless of the batch_first flag.""" 88 | if random: 89 | return Variable( 90 | torch.randn( 91 | self.bi_multiplier * self.n_layers, 92 | batch_size, 93 | self.hidden_size)) 94 | else: 95 | return Variable( 96 | torch.zeros( 97 | self.bi_multiplier * self.n_layers, 98 | batch_size, 99 | self.hidden_size)) # TODO: attention 100 | 101 | 102 | # 1. [ ] get training hooked up 103 | # 1. 
[ ] 104 | 105 | import random 106 | 107 | 108 | class VanillaSequenceToSequence(nn.Module): 109 | def __init__(self, input_lang, output_lang, hidden_size, n_layers=1, bidirectional=False): 110 | super(VanillaSequenceToSequence, self).__init__() 111 | self.input_lang = input_lang 112 | self.output_lang = output_lang 113 | self.encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers, bidirectional) 114 | self.decoder = DecoderRNN(output_lang.n_words, hidden_size, n_layers, bidirectional) 115 | 116 | def init_hidden(self, batch_size): 117 | return self.encoder.init_hidden(batch_size) 118 | 119 | def get_SOS_vec(self, batch_size): 120 | return Variable(torch.LongTensor([[self.output_lang.SOS_ind]] * batch_size)) 121 | 122 | def forward(self, input, hidden, target=None, teacher_r=0, max_output_length=None): 123 | # DONE: Should really use Module.train and Module.eval to set the training flag, then handle the different logic inside the forward function. This separate evaluation function is repetitive and will not needed in that case. 124 | # NOTE: hidden always has second index being the batch index. 125 | batch_size = hidden.size()[1] 126 | target_size = 0 if target is None else target.size()[1] 127 | encoder_output, encoded = self.encoder(input, hidden) 128 | slices = [] 129 | embeded_outputs = [] 130 | hidden = encoded 131 | output_word_batch = self.get_SOS_vec(batch_size) 132 | # TODO: make it so end of sentence for all elements in batch trigger end of while loop. 133 | eos_flags = list(range(batch_size)) 134 | while len(slices) < (max_output_length or self.args.MAX_OUTPUT_LEN) \ 135 | and len(eos_flags) != 0: 136 | # word_slice size(b_size, 1), is correct 137 | ## This is where you add teacher forcing 138 | index = len(slices) 139 | i_vec = Variable(torch.LongTensor([index])) 140 | # TODO: use tensor combine/add_with_mask operator here instead. 141 | output_slice_forced = output_word_batch \ 142 | if random.random() > teacher_r or index >= target_size \ 143 | else target.index_select(1, i_vec) 144 | output_embedded, hidden = self.decoder(self.decoder.embed(output_slice_forced), hidden) 145 | # convert embedded to class_index here 146 | output_word_batch = self.decoder.extract(output_embedded) 147 | 148 | # Now add the slices to the output stack. word_slice(b_size, 1) -> size(b_size) 149 | slices.append(output_word_batch.view(batch_size)) 150 | embeded_outputs.append(output_embedded.view(batch_size, -1)) 151 | 152 | for ind, s in enumerate(output_word_batch): 153 | s_index = int(s.data.numpy()[0]) 154 | if ind in eos_flags: 155 | if s_index == self.output_lang.EOS_ind: 156 | eos_flags.remove(ind) 157 | 158 | # TODO: fix mismatch output between evaluate and forward. 159 | return torch.stack(slices, dim=1), hidden, torch.stack(embeded_outputs, dim=1) 160 | 161 | def setup_training(self, learning_rate): 162 | self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 163 | self.optimizer.zero_grad() 164 | self.loss_fn = nn.CrossEntropyLoss() 165 | self.init_hidden_() 166 | 167 | def load(self, fn): 168 | # TODO: load the input and output language as well, to maintain the word list. 169 | checkpoint = torch.load(fn) 170 | self.load_state_dict(checkpoint['state_dict']) 171 | self.input_lang.load_dict(checkpoint['input_lang']) 172 | self.output_lang.load_dict(checkpoint['output_lang']) 173 | return checkpoint 174 | 175 | def save(self, fn="seq-to-seq.cp", meta=None, **kwargs): 176 | # TODO: save input and output language as well, to maintain the word list. 
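# The checkpoint dict built below already covers that TODO: it packs any extra
# keyword metadata passed by the caller, the module's state_dict, and the
# serialized vocabularies of both languages (vars(lang) goes through
# Language.__getattribute__, which returns the word2index / index2word tables).
# load() above restores the weights and both word lists from the same keys.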
177 | d = {k: kwargs[k] for k in kwargs} 178 | d["state_dict"] = self.state_dict() 179 | d["input_lang"] = vars(self.input_lang) 180 | d["output_lang"] = vars(self.output_lang) 181 | if meta is not None: 182 | d['meta'] = meta 183 | torch.save(d, fn) 184 | -------------------------------------------------------------------------------- /word_seq_2_seq/model_eval.spec.py: -------------------------------------------------------------------------------- 1 | from train import Session 2 | from utils import ledger, Struct 3 | 4 | args = Struct(**{'BATCH_SIZE': 10, 5 | 'BI_DIRECTIONAL': False, 6 | 'DASH_ID': 'seq-to-seq-experiment', 7 | 'DEBUG': True, 8 | 'EVAL_INTERVAL': 10, 9 | 'TEACHER_FORCING_R': 0.5, 10 | 'INPUT_LANG': 'eng', 11 | 'LEARNING_RATE': 0.001, 12 | 'MAX_DATA_LEN': 50, 13 | 'MAX_OUTPUT_LEN': 100, 14 | 'N_EPOCH': 5, 15 | 'N_LAYERS': 1, 16 | 'OUTPUT_LANG': 'cmn', 17 | 'SAVE_INTERVAL': 100}) 18 | 19 | with Session(args) as sess: 20 | for i in range(args.N_EPOCH): 21 | sess.train() 22 | # sess.load_pretrain('./trained/test.cp') 23 | sentence = sess.evaluate('This is a job.') 24 | print(sentence) 25 | 26 | # want to figure out why the execution stops at 1 output 27 | -------------------------------------------------------------------------------- /word_seq_2_seq/scraper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from bs4 import BeautifulSoup 3 | import requests 4 | import re 5 | 6 | print('******* {} Module is reloaded *******'.format(__name__)) 7 | 8 | class Page(): 9 | def __init__(self, url, debug=False): 10 | self.url = url 11 | self._debug = debug 12 | 13 | def request(self): 14 | page = requests.get(self.url) 15 | self.soup = BeautifulSoup(page.content, 'html.parser') 16 | return self 17 | 18 | def set_mask(self, regexString): 19 | self.url_mask = re.compile(regexString) 20 | return self 21 | 22 | def test_mask(self, string): 23 | return self.url_mask.match(string) 24 | 25 | def get_anchors(self): 26 | for n in self.soup.findAll('a'): 27 | if 'href' in n.attrs and self.url_mask.match(n['href']) is not None: 28 | yield self.url_mask.match(n['href']), n 29 | 30 | def debug(self, output, *args, **kwargs): 31 | if self._debug: 32 | print(output, *args, **kwargs) 33 | sys.stdout.flush() 34 | 35 | def download(self, path, mode='wb', chunk_size=None): 36 | r = requests.get(self.url, stream=True) 37 | with open(path, mode) as f: 38 | self.debug('Request to {} has been made'.format(self.url), end='') 39 | for chunk in r.iter_content(chunk_size=chunk_size): 40 | if chunk: # filter out keep-alive new chunks 41 | f.write(chunk) 42 | self.debug('.'.format(self.url), end='') 43 | self.debug('. 
'.format(self.url), end='') 44 | self.debug('File {} has been saved.'.format(path)) 45 | -------------------------------------------------------------------------------- /word_seq_2_seq/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | from tqdm import tqdm 10 | 11 | import data 12 | import utils 13 | from language import get_language_pairs 14 | from model import VanillaSequenceToSequence 15 | from visdom_helper import visdom_helper 16 | 17 | print('****************') 18 | 19 | 20 | def LTZeroInt(s): 21 | if int(s) > 0: 22 | return int(s) 23 | else: 24 | raise argparse.ArgumentTypeError('{} need to be larger than 1 and an integer.'.format(s)) 25 | 26 | 27 | _pr = argparse.ArgumentParser(description='Sequence-To-Sequence Model in PyTorch') 28 | 29 | _pr.add_argument('-d', '--debug', dest='DEBUG', default=True, type=bool, help='debug mode prints more info') 30 | _pr.add_argument('-cf', '--checkpoint-folder', dest='CHECKPOINT_FOLDER', type=str, default="./trained-checkpoints", 31 | help='folder where it saves checkpoint files') 32 | _pr.add_argument('-cp', '--checkpoint', dest='CHECKPOINT', type=str, default="", 33 | help='the checkpoint to load') 34 | _pr.add_argument('--checkpoint-batch-stamp', dest='CHECKPOINT_BATCH_STAMP', type=bool, default=True, 35 | help='the checkpoint to load') 36 | _pr.add_argument('-il', '--input-lang', dest='INPUT_LANG', default='eng', help='code name for the input language ') 37 | _pr.add_argument('-ol', '--output-lang', dest='OUTPUT_LANG', default='cmn', help='code name for the output language ') 38 | _pr.add_argument('--max-data-len', dest='MAX_DATA_LEN', default=10, type=int, 39 | help='maximum length for input output pairs (words)') 40 | _pr.add_argument('--dash-id', dest='DASH_ID', type=str, default='seq-to-seq-experiment', 41 | help='maximum length for input output pairs') 42 | _pr.add_argument('--batch-size', dest='BATCH_SIZE', type=int, default=10, help='maximum length for input output pairs') 43 | _pr.add_argument('--learning-rate', dest='LEARNING_RATE', type=float, default=1e-3, 44 | help='maximum length for input output pairs') 45 | _pr.add_argument('--n-epoch', dest='N_EPOCH', type=int, default=5, help='number of epochs to train') 46 | _pr.add_argument('-e', '--eval-interval', dest='EVAL_INTERVAL', type=LTZeroInt, default=10, 47 | help='evaluate model on validation set') 48 | _pr.add_argument('--teacher-forcing-r', dest='TEACHER_FORCING_R', type=float, default=0.5, 49 | help='Float for the teacher-forcing ratio') 50 | _pr.add_argument('-s', '--save-interval', dest='SAVE_INTERVAL', type=LTZeroInt, default=100, 51 | help='evaluate model on validation set') 52 | _pr.add_argument('--n-layers', dest='N_LAYERS', type=int, default=1, 53 | help='maximum length for input output pairs') 54 | _pr.add_argument('--bi-directional', dest='BI_DIRECTIONAL', type=bool, default=False, 55 | help='whether use bi-directional module for the model') 56 | 57 | 58 | class Session(): 59 | def __init__(self, args): 60 | self.name = 'seq-2-seq-translator' 61 | self.args = args 62 | self.meta = lambda: None 63 | self.meta.epoch = 0 64 | 65 | # create logging and dashboard 66 | self.ledger = utils.Ledger(debug=self.args.DEBUG) 67 | self.dash = visdom_helper.Dashboard(args.DASH_ID) 68 | 69 | if self.args.DEBUG: 70 | self.ledger.pp(vars(args)) 71 | 72 | # load data 
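# The steps below mirror data.py: read the tab-separated sentence pairs for
# INPUT_LANG/OUTPUT_LANG, normalize them, drop pairs longer than MAX_DATA_LEN
# tokens, build a Language vocabulary for each side, and convert every
# sentence into a list of word indexes. data.get_batch later frames each
# index list as [SOS_ind] + indexes + [EOS_ind] plus NULL_ind padding, so a
# padded row might look like [1, 5, 9, 2, 0] (the concrete word indexes here
# are only illustrative).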
73 | source_data = list(data.read_language(self.args.INPUT_LANG, self.args.OUTPUT_LANG, data.normalize_strings)) 74 | self.ledger.green('sentence pairs in file: {}'.format(len(source_data))) 75 | self.pairs = list(filter(data.trim_by_length(self.args.MAX_DATA_LEN), source_data)) 76 | self.ledger.green('Number of sentence pairs after filtering: {}'.format(len(self.pairs))) 77 | 78 | self.input_lang, self.output_lang = \ 79 | get_language_pairs(self.args.INPUT_LANG, args.OUTPUT_LANG, self.pairs) 80 | self.input_lang.summarize() 81 | self.output_lang.summarize() 82 | 83 | self.word_index_pairs = [[self.input_lang.sentence_to_indexes(p[0]), 84 | self.output_lang.sentence_to_indexes(p[1])] 85 | for p in self.pairs] 86 | 87 | # Now build the model 88 | self.model = VanillaSequenceToSequence( 89 | self.input_lang, self.output_lang, 200, n_layers=self.args.N_LAYERS, 90 | bidirectional=self.args.BI_DIRECTIONAL) 91 | self.ledger.log('Sequence to sequence model graph:\n', self.model) 92 | 93 | def train(self, debug=False): 94 | # setting up Training 95 | optimizer = optim.Adam(self.model.parameters(), lr=self.args.LEARNING_RATE) 96 | criterion = nn.CrossEntropyLoss() 97 | 98 | # train 99 | self.meta.losses = [] 100 | # print(self.word_index_pairs) 101 | for i, (inputs, target) in enumerate(tqdm(data.get_batch(self.word_index_pairs, self.args.BATCH_SIZE))): 102 | hidden = self.model.init_hidden(len(inputs)) 103 | optimizer.zero_grad() 104 | 105 | input_vec = Variable(torch.LongTensor(inputs), requires_grad=False) 106 | target_vec = Variable(torch.LongTensor(target), requires_grad=False) 107 | _, target_seq_length = target_vec.size() 108 | 109 | # set the training flag 110 | self.model.train() 111 | # NOTE: we don't need to output longer than the target, because there is no way to calculate the entropy. This is however a very crude way of doing it, and in the future a better loss metric would be needed. self.args.MAX_OUTPUT_LEN 112 | output, hidden, output_embeded = self.model(input_vec, hidden, target_vec, self.args.TEACHER_FORCING_R, 113 | target_seq_length - 1) 114 | 115 | b_size, seq_len, n_words = output_embeded.size() 116 | # TODO: instead of clipping output_embedded, should pad target longer with . (different from t_padded!). 117 | # NOTE: need to cut target_vec to same length as seq_len 118 | target_without_SOS = target_vec[:, 1:(seq_len + 1)].contiguous().view(-1) 119 | # self.ledger.debug(output_embeded.size(), target_without_SOS.size()) 120 | loss = criterion(output_embeded.view(-1, n_words), target_without_SOS) 121 | 122 | # target_vec.t()[1:].t().view(-1)) 123 | 124 | # for output_softmax, t in zip(output_softmaxes, target_padded): 125 | # loss += criterion(output_softmax, t) 126 | 127 | self.meta.losses.append(loss.data.numpy()[0]) 128 | if i % self.args.EVAL_INTERVAL == 0: 129 | self.dash.plot('loss', 'line', X=np.arange(0, len(self.meta.losses)), Y=np.array(self.meta.losses)) 130 | if i % self.args.SAVE_INTERVAL == 0: 131 | # TODO: add data file 132 | self.model.save() 133 | 134 | loss.backward() 135 | optimizer.step() 136 | 137 | # execute only once under debug mode. 
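# NOTE: with debug=True the loop trains on just its first batch and returns
# here, so self.meta.epoch below is not incremented for that call.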
138 | if debug: 139 | return 140 | self.meta.epoch += 1 141 | 142 | def checkpoint_filename(self, epoch_stamp=False, time_stamp=False): 143 | return self.name + \ 144 | ('_batch_' + str(self.meta.epoch) if epoch_stamp is True else '') + '.ckp' 145 | 146 | def checkpoint_location(self, filename=None, **kwargs): 147 | return os.path.join(self.args.CHECKPOINT_FOLDER, filename or self.checkpoint_filename(**kwargs)) 148 | 149 | def __enter__(self): 150 | # TODO: load parames from checkpoint 151 | if self.args.CHECKPOINT is not '': 152 | # try: 153 | cp = self.model.load(self.checkpoint_location(self.args.CHECKPOINT)) 154 | self.args = utils.Struct(**cp['args'], **vars(self.args)) 155 | # except Exception as e: 156 | # self.ledger.raise_(e, 'error with enter.') 157 | return self 158 | 159 | def __exit__(self, exc_type, exc_val, exc_tb): 160 | self.model.save(self.checkpoint_location(self.args.CHECKPOINT), { 161 | "args": vars(self.args), 162 | "meta": { 163 | 'losses': self.meta.losses, 164 | 'epoch': self.meta.epoch 165 | } 166 | }) 167 | pass 168 | 169 | def evaluate(self, input_sentence): 170 | 171 | normalized = data.normalize_strings(input_sentence) 172 | input_ind = [self.input_lang.sentence_to_indexes(normalized)] 173 | input_vec = Variable(torch.LongTensor(input_ind), requires_grad=False) 174 | hidden = self.model.init_hidden(1) 175 | self.model.eval() 176 | translated, _, _ = self.model.forward(input_vec, hidden, max_output_length=1000) 177 | first_in_batch = 0 178 | output_sentence = self.output_lang.indexes_to_sentence(translated[first_in_batch].data.numpy()) 179 | 180 | return output_sentence 181 | 182 | 183 | if __name__ == "__main__": 184 | args = _pr.parse_args() 185 | with Session(args) as sess: 186 | for i in range(args.N_EPOCH): 187 | sess.train() 188 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/afr-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/afr-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ara-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ara-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/aze-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/aze-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/bel-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/bel-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ben-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ben-eng.zip 
-------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ber-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ber-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/bul-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/bul-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cat-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/cat-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cat-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cat-eng/cat.txt: -------------------------------------------------------------------------------- 1 | Wow! Carai! 2 | Really? De veritat? 3 | Thanks. Gràcies! 4 | Goodbye! Adéu! 5 | Hurry up. Afanya't. 6 | Too late. Massa tard. 7 | Thank you. Gràcies! 8 | Can I help? Puc ajudar? 9 | I envy him. L'envejo. 10 | Time flies. El temps vola. 11 | I'm 17, too. Jo també tinc 17 anys. 12 | I'm at home. Estic a casa. 13 | Money talks. Qui paga, mana. 14 | We love you. T'estimem. 
15 | We love you. Us estimem. 16 | Who are you? Qui ets tu? 17 | Who are you? Qui és vostè? 18 | Who are you? Qui ets? 19 | Who are you? Qui sou? 20 | He has a dog. Ell té un gos. 21 | She stood up. Ella es va aixecar. 22 | Hi, everybody. Hola a tots. 23 | I'm desperate. Estic desesperat. 24 | Let me try it. Deixa'm intentar-ho. 25 | You look good. Tens bona cara. 26 | You look good. Fas bona cara. 27 | You look good. Fas bon aspecte. 28 | Are you insane? Estàs boig? 29 | Can I help you? Puc ajudar? 30 | I need a stamp. Necessito un segell. 31 | I saw him jump. El vaig veure saltar. 32 | Leave me alone! Deixa'm en pau! 33 | Who painted it? Qui ho ha pintat? 34 | I didn't say it. No ho he dit pas. 35 | I felt the same. Em sentia igual. 36 | I have two cats. Tinc dos gats. 37 | I speak Swedish. Parlo suec. 38 | It's cold today! Avui fa fred! 39 | It's your fault. És culpa teva. 40 | Who are you all? Qui sou tots vosaltres? 41 | Who are you all? Qui sou totes vosaltres? 42 | Here is your bag. Aquí és la teva bossa. 43 | Here is your bag. Ací està la teua bossa. 44 | Here is your bag. Ací tens la teua bossa. 45 | I am now on duty. Ara estic de servei. 46 | I ate the cheese. Em vaig menjar el formatge. 47 | I have a problem. Tinc un problema. 48 | I have a problem. Tinc un maldecap. 49 | I have no family. No tinc família. 50 | I work in a bank. Jo treballo a un banc. 51 | I wrote a letter. Vaig escriure una carta. 52 | I'm already late. Ja faig tard. 53 | I'm not a doctor. Jo no sóc metge. 54 | Let go of my arm. Deixa'm anar el braç. 55 | She lives nearby. Viu aquí prop. 56 | They're my books. Són els meus llibres. 57 | This is your dog. Aquest és el teu gos. 58 | Tom walked alone. En Tom caminava sol. 59 | What is going on? Què hi ha? 60 | What is going on? Què passa? 61 | Who are you with? Amb qui estàs? 62 | Who are you with? Amb qui esteu? 63 | Answer in English. Contesta en anglès! 64 | He went back home. Ell va tornar a casa. 65 | I have an earache. Tinc otitis. 66 | I have black eyes. Tinc els ulls negres. 67 | I think he did it. Crec que ho va fer ell. 68 | I'm a salesperson. Sóc venedor. 69 | Let me have a try. Deixa'm intentar-ho. 70 | Nobody is perfect. Ningú és perfecte. 71 | She has long hair. Ella té el cabell llarg. 72 | Tom doesn't drink. Tom no beu. 73 | Why is snow white? Per què és blanca la neu? 74 | You need to hurry. T'has d'afanyar. 75 | Are you hungry now? Tens fam, ara? 76 | Are you hungry now? Teniu fam, ara? 77 | Are you hungry now? Ara tens fam? 78 | Come along with me. Acompanya'm. 79 | Do it by all means. Fes-ho com sigui. 80 | Do it by all means. Fes-ho peti qui peti. 81 | Everyone loves him. Tots l'estimen. 82 | He speaks fluently. Parla amb soltura. 83 | I don't have a cat. No tinc cap gat. 84 | I don't want sugar. No vull sucre. 85 | I have lost my key. He perdut la meva clau. 86 | It was a nightmare. Va ser un malson. 87 | Shut up and listen! Calla i escolta! 88 | Shut up and listen. Calla i escolta! 89 | Stay out of my way. Fora del meu camí! 90 | This coat fits you. Aquest abric et queda bé. 91 | Tom is a silly man. En tom és un ximplet. 92 | What's the problem? Quin problema hi ha? 93 | What's your secret? Quin és el teu secret? 94 | You are a good boy. Ets un bon noi. 95 | Europe is in crisis. Europa està en crisi. 96 | Everybody loves him. Tots l'estimen. 97 | He'll return at six. Tornarà a les sis. 98 | He's the oldest son. Ell és el fill gran. 99 | I can see the light. Puc veure la llum. 100 | I want to stay here. Vull quedar-me aquí. 
101 | I'll call him later. Li cridaré més tard. 102 | I've got a question. Tinc una pregunta. 103 | She has a white cat. Ella té un gat blanc. 104 | She raised her hand. Ella va aixecar la mà. 105 | She raised her hand. Ella va alçar la mà. 106 | She raised her hand. Ella alçà la mà. 107 | She raised her hand. Va alçar la mà. 108 | The bicycle is mine. La bicicleta és meva. 109 | They were satisfied. Estaven satisfets. 110 | They were satisfied. Estaven cofois. 111 | Where are you going? A on vas? 112 | Your father is tall. Ton pare és alt. 113 | Birds fly in the sky. Els ocells volen pel cel. 114 | Can he speak English? Que parla anglès, ell? 115 | Do you live in Tokyo? Vius a Tokyo? 116 | Fish live in the sea. Els peixos viuen al mar. 117 | He is a good athlete. Ell és un bon atleta. 118 | I dislike big cities. No m'agraden les ciutats grans. 119 | I don't want to work. No vull treballar. 120 | I have two daughters. Tinc dues filles. 121 | I'm glad you're here. M'alegra que estigues ací. 122 | I'm glad you're here. M'alegra que estigueu ací. 123 | It's a piece of cake. Això és bufar i fer ampolles. 124 | It's really annoying. És realment molest. 125 | Sorry for being late. Perdó pel retard. 126 | That's a bright idea. És una idea brillant. 127 | The ground seems wet. El sòl sembla mullat. 128 | Tom always says that. Tom sempre diu això. 129 | Tom doesn't watch TV. En Tom no mira la tele. 130 | Who's coming with me? Qui ve amb mi? 131 | You can come with me. Pots venir amb mi. 132 | Your son is a genius. El vostre fill és un geni. 133 | Do you have two books? Tens dos llibres? 134 | Does he speak English? Que parla anglès, ell? 135 | Does he speak English? Parla anglès? 136 | Does he speak English? Ell parla anglès? 137 | His house was on fire. La seva casa està en flames. 138 | I don't have time now. Ara no tinc temps. 139 | I have to go to sleep. He d'anar a dormir. 140 | I know these students. conec aquests estudiants. 141 | I'd like some aspirin. Voldria una aspirina. 142 | Money opens all doors. Els diners obren totes les portes. 143 | She is a good swimmer. És una bona nedadora. 144 | Thanks for everything. Mercès per tot. 145 | The girl said nothing. La nena no va dir res. 146 | Those are empty words. Són paraules buides. 147 | We have two daughters. Tenim dues filles. 148 | What's wrong with you? Què et passa? 149 | You should eat slower. Has de menjar més a poc a poc. 150 | You're taller than me. Ets més alt que jo. 151 | Come whenever you like. Vingui quan vulgui. 152 | Come whenever you like. Veniu quan vulgueu. 153 | Come whenever you like. Vine quan vulguis. 154 | Does the bus stop here? L'autobús para ací? 155 | I didn't buy this book. No vaig comprar aquest llibre. 156 | I think he has done it. Crec que ell ho ha fet. 157 | Mary married for money. La Mary es va casar per diners. 158 | They will not eat meat. No menjaran carn. 159 | Tom wants to be famous. Tom vol ser famós. 160 | We have to act quickly. Hem d'actuar ràpid. 161 | What is wrong with him? Què li passa? 162 | Ask me something easier. Pregunta'm una cosa més fàcil. 163 | Do you have a cellphone? Tens un mòbil? 164 | He died three years ago. Va morir fa tres anys. 165 | I don't like big cities. No m'agraden les ciutats grans. 166 | I felt like I would die. Sentia que em moriria. 167 | I hope that I can do it. Espere poder-ho fer. 168 | I listened to her story. Vaig escoltar la història d'ella. 169 | I won't see him anymore. No el veuré mai més. 170 | I'll start this evening. Començaré aquest vespre. 
171 | I'll start this evening. Començaré esta vesprada. 172 | I'm sure of his success. Estic segur del seu èxit. 173 | I've forgotten his name. He oblidat el seu nom. 174 | Many sailors can't swim. Molts mariners no saben nedar, 175 | My father quit drinking. El meu pare va aturar de beure. 176 | My father quit drinking. Mon pare va deixar de beure. 177 | She helped me willingly. Ella em va ajudar de bon gust. 178 | Tell me when he returns. Avisa'm quan torni. 179 | Tom is thirty years old. Tom té trenta anys. 180 | What a beautiful sunset! Quina posta de sol més bonica! 181 | What a beautiful sunset. Quina posta més maca! 182 | What a beautiful sunset. Quina posta de sol més bonica. 183 | Where are our umbrellas? On estan els nostres paraigües? 184 | Would you draw me a map? Em faríes un mapa? 185 | You will miss the train. Perdràs el tren. 186 | Can your mom drive a car? La teva mama sap conduir? 187 | Can your mom drive a car? Ta mare sap conduir? 188 | Can your mom drive a car? La teua mare sap conduir un cotxe? 189 | Can your mom drive a car? La vostra mare sap conduir? 190 | Do you have a cell phone? Tens un mòbil? 191 | Don't ever do that again. No ho facis mai més. 192 | Fire is always dangerous. El foc sempre és perillós. 193 | He fell and hurt his leg. Ell es va caure i es va fer mal a la cama. 194 | He has never played golf. Ell no ha jugat mai al golf. 195 | He looks like his father. Ell s'assembla al seu pare. 196 | He speaks five languages. Ell parla cinc idiomes. 197 | I can't stand that noise. No puc aguantar aquest soroll. 198 | I didn't know what to do. No sabia què fer. 199 | I don't know her address. No sé la seva adreça. 200 | I made my son a new suit. He fet un vestit nou per a mon fill. 201 | I only have eyes for you. Aquí no veig ningú més que tu. 202 | I recognized him at once. El vaig reconèixer de seguida. 203 | I'm talking on the phone. Parlo per telèfon. 204 | I'm talking on the phone. Estic parlant per telèfon. 205 | It doesn't sound natural. No sona natural. 206 | It is nice and cool here. Aquí fa fresca i s'hi està bé. 207 | My mother is always busy. La meva mare sempre està ocupada. 208 | She had a strange hat on. Ella portava un barret estrany. 209 | This is his car, I think. Aquest és el seu cotxe, crec. 210 | Tom is a history teacher. En Tom és un professor d'història. 211 | Tom visited Mary's grave. En Tom va visitar la tomba de la Mary. 212 | Turn down the TV, please. Baixa el volum del televisor. 213 | Turn the TV down, please. Baixa el volum del televisor. 214 | You are everything to me. Tu ets tot per mi. 215 | You don't have to eat it. Vostè no ha de menjar-ho. 216 | I like to listen to music. M'agrada escoltar música. 217 | I made a careless mistake. Vaig cometre una negligència. 218 | I'm busy, so I can't help. Estic ocupat, no puc ajudar-te. 219 | I'm busy, so I can't help. Estic ocupat, no puc ajudar-vos. 220 | I'm standing in the shade. M'estic dret a l'ombra. 221 | It's always been that way. Sempre ha sigut així. 222 | That'll put you in danger. Això et posarà en perill. 223 | We may be late for school. Potser farem tard a l'escola. 224 | What a nice sounding word! Què bé sona aquesta paraula! 225 | Where will you be staying? On t'estaràs? 226 | Where will you be staying? On t'allotjaràs? 227 | Where will you be staying? On et quedaràs? 228 | Where will you be staying? On us allotjareu? 229 | You agree with Tom, right? Estàs d'acord amb Tom, oi? 230 | You agree with Tom, right? Esteu d'acord amb Tom, veritat? 231 | You don't have to do this. 
No has de fer-ho. 232 | You don't have to do this. Això no ho has de fer. 233 | Do they have any good news? Tens bones notícies? 234 | Do you come here every day? Véns aquí cada dia? 235 | Do you come here every day? Veniu ací tots els dies? 236 | Do you have a mobile phone? Tens un mòbil? 237 | Do you know his birthplace? Saps on va néixer? 238 | I have to buy one tomorrow. He de comprar-ne un demà. 239 | I just want to be near you. Només vull estar prop de tu. 240 | I know he likes jazz music. Sé que li agrada el jazz. 241 | I'd rather do it by myself. Prefereixo fer-lo pel meu compte. 242 | I'm afraid I caught a cold. Em sembla que he agafat un constipat. 243 | It's all you can really do. És tot el que pots fer. 244 | It's always been like that. Sempre ha sigut així. 245 | She's Tom's younger sister. És la germana petita d'en Tom. 246 | She's Tom's younger sister. És la germana menuda de Tom. 247 | The bird's wing was broken. L'ala de l'ocell estava trencada. 248 | The bird's wing was broken. L'ocell tenia una ala trencada. 249 | The bird's wing was broken. El pardal tenia una ala trencada. 250 | There were ten eggs in all. Hi havia deu ous en total. 251 | There's no reason to worry. No hi ha cap motiu per preocupar-se. 252 | Things are not that simple. Les coses no són tan senzilles. 253 | This store sells old books. Aquesta botiga ven llibres vells. 254 | You're not a child anymore. Ja no ets un nen. 255 | Columbus discovered America. Colón va descobrir Amèrica. 256 | Don't you like Chinese food? No t'agrada el menjar xinès? 257 | France is in western Europe. França és a l'Europa Occidental. 258 | He plays baseball every day. Juga al beisbol tots el dies. 259 | He's the one who touched me. Ell és el que em va tocar. 260 | I don't know if he knows it. No sé si ho sap. 261 | I'll be back in ten minutes. Tornaré en deu minuts. 262 | I'm the one who has the key. Jo sóc qui té la clau. 263 | Take off your socks, please. Sisplau, lleva't els mitjons. 264 | Take off your socks, please. Lleva't els calcetins, per favor. 265 | Take off your socks, please. Lleveu-vos els calcetins, per favor. 266 | The food was great in Italy. El menjar va ser cosa fina a Itàlia. 267 | They work eight hours a day. Treballen vuit hores al dia. 268 | What are you doing tomorrow? Què fas demà? 269 | What is wrong with that guy? Què li passa a aquet paio? 270 | Where will we go afterwards? On anirem després? 271 | Do you know where she's gone? Saps on ha anat ella? 272 | He goes to the office by car. Va al despatx amb cotxe. 273 | He is the manager of a hotel. És el director d'un hotel. 274 | He lost all the money he had. Va perdre tots els diners que tenia. 275 | He plays the piano very well. Ell toca el piano molt bé. 276 | I don't want to go to school. No vull anar a l'escola. 277 | I have something to tell you. T'he de dir una cosa. 278 | I have something to tell you. Us he de dir una cosa. 279 | I have something to tell you. Tinc una cosa a dir-te. 280 | I must have the wrong number. Dec tenir el número equivocat. 281 | I never get tired of talking. No em canso mai de parlar. 282 | I saw him tear up the letter. El vaig veure estripar la carta. 283 | I will get in touch with you. Em posaré en contacte amb tu. 284 | Japan is smaller than Canada. El Japó és més petit que el Canadà. 285 | She sent you her best wishes. Ella t'envia els seus millors desitjos. 286 | That's exactly what happened. Això és exactament el què va passar. 287 | The girl didn't say anything. La nena no va dir res. 
288 | The soldier gave water to me. El soldat m'ha donat aigua. 289 | We killed time playing cards. Matàvem el temps jugant a les cartes. 290 | We must control our passions. Hem de controlar les nostres passions. 291 | What you think is irrelevant. El que penses és irellevant. 292 | Do you have medical insurance? Teniu assegurança mèdica? 293 | He comes here every five days. Ve aquí cada cinc dies. 294 | He left the book on the table. Va deixar el llibre sobre la taula. 295 | How many children do you have? Quants fills tens? 296 | I believe the choice is clear. Crec que l'elecció està clara. 297 | I study for 3 hours every day. Jo estudio 3 hores cada dia. 298 | It was cheaper than I thought. És més barat del que em vaig pensar. 299 | Let me know whenever you come. Quan vinguis, fes-m'ho saber. 300 | Most schools are closed today. La majoria d'escoles avui estan tancades. 301 | My dad died before I was born. Mon pare va morir abans del meu naixement. 302 | Nobody equals him in strength. Ningú no li fa ombra. 303 | Nobody equals him in strength. Ningú no li és rival. 304 | Our summer is short, but warm. El nostre estiu és curt, però calorós. 305 | She didn't tell me her secret. Ella no em va dir el seu secret. 306 | She is giving a party tonight. Ella fa una festa aquesta nit. 307 | Tom wants to change the world. En Tom vol canviar el món. 308 | Tom's arm had to be amputated. Van haver d'amputar el braç al Tom. 309 | Tom's arm had to be amputated. Li van haver d'amputar el braç a Tom. 310 | You agree with Tom, don't you? Estàs d'acord amb Tom, no? 311 | He is a very thoughtful person. És una persona molt considerada. 312 | I don't know when he will come. No sé quan vindrà. 313 | I don't like it when you swear. No m'agrada que digues paraulotes. 314 | I don't like it when you swear. No m'agrada que digueu paraulotes. 315 | I have breakfast every morning. Cada dia esmorzo. 316 | I have not seen him since then. No l'he vist des d'aleshores. 317 | I opened the box. It was empty. Vaig obrir la caixa. Estava buida. 318 | I wish I could buy that guitar. Com voldria poder comprar aquesta guitarra. 319 | I wish I could buy that guitar. M'agradaria poder comprar eixa guitarra. 320 | I wish I could buy that guitar. Tant de bo pogués comprar aquesta guitarra. 321 | I wish I could buy that guitar. Tant de bo poguera comprar eixa guitarra. 322 | I'm very glad to see you again. Estic molt content de tornar-te a veure. 323 | Please circle the right answer. Encercleu la resposta correcta, sisplau. 324 | He had a firm belief in his God. Té una creença ferma en Déu. 325 | He is getting better bit by bit. Ell s'està millorant poc a poc 326 | He is getting better bit by bit. Està millorant poc a poc. 327 | He told me an interesting story. M'ha contat una història interessant. 328 | Helen Keller was deaf and blind. Hellen Keller era sorda i cega. 329 | How much does he earn per month? Quant guanya al mes? 330 | I can repeat it again and again. Puc repetir-ho vint vegades. 331 | I caught the ball with one hand. Vaig agafar la pilota amb una mà. 332 | I heard him sing at the concert. El vaig sentir cantant al concert. 333 | I was not aware of his presence. Jo no era conscient que ell estava al davant. 334 | I wonder if he'll come tomorrow. Em pregunto si vindrà demà. 335 | I'm a professional photographer. Jo sóc fotògraf professional. 336 | Let me know when he will arrive. Ja em diràs quan arriba. 337 | My mother speaks little English. La meva mare parla una mica d'anglès. 338 | She made the same mistake again. 
Ella va cometre una altra vegada la mateixa errada. 339 | She will have a baby next month. Ella vol tenir un fill el mes vinent. 340 | The food didn't taste very good. El menjar no tenia gaire bon gust. 341 | The food didn't taste very good. El menjar no feia gaire bon gust. 342 | The sun appeared on the horizon. El Sol apareix a l'horitzont. 343 | The sun appeared on the horizon. El sol aparegué a l'horitzó. 344 | The sun gives us heat and light. El Sol ens dóna calor i llum. 345 | The sun is larger than the moon. El Sol és més gran que la Lluna. 346 | Tom is too young to drive a car. Tom és massa jove per portar un cotxe. 347 | Tom was fired for a good reason. Tom va ser despedit per una causa justa. 348 | You don't have to kick yourself. No et facis mala sang. 349 | You should've told me yesterday. M'ho hauries d'haver dit ahir. 350 | Your opinion is important to me. La teua opinió és important per a mi. 351 | Asians generally have black hair. Els asiàtics normalment tenen el cabell negre, 352 | Do you know who wrote this novel? Saps qui va escriure aquesta novela? 353 | Do you know who wrote this novel? Sabeu qui va escriure aquesta novel·la? 354 | Don't compare me to a movie star. No em comparis amb una estrella de cinema. 355 | He went skiing during the winter. Se'n va anar a esquiar a l'hivern. 356 | I have lived in Tokyo since 1985. He viscut a Tokyo des de 1985. 357 | I saw the moon above the horizon. Veig la lluna sobre l'horitzont. 358 | My brother-in-law is a policeman. El meu cunyat és policia. 359 | My father died before I was born. Mon pare va morir abans del meu naixement. 360 | My father died before I was born. Mon pare va morir abans que jo nasquera. 361 | My father died before I was born. El meu pare va morir abans de néixer jo. 362 | The bus arrived ten minutes late. El bus va arribar deu minuts tard. 363 | The bus arrived ten minutes late. L'autobús arribà deu minuts tard. 364 | The bus arrived ten minutes late. L'autobús va arribar amb deu minuts de retard. 365 | The flood caused a lot of damage. La riada va fer molt de mal. 366 | The flood caused a lot of damage. La inundació va fer molt de mal. 367 | The rumor is true to some extent. Fins a un cert punt, el rumor és cert. 368 | The teacher told me study harder. El professor em va dir que estudiés molt. 369 | Tom and Mary acted like children. En Tom i la Mary es portaven com nens. 370 | Tom couldn't hold back his tears. Tom no va poder contenir les llàgrimes. 371 | Tom couldn't hold back his tears. Tom no podia contenir les llàgrimes. 372 | Tom doesn't go to school anymore. Tom ja no va a l'escola. 373 | Tom is no longer studying French. En Tom ja no estudia francès. 374 | When can we see each other again? On ens podem tornar a veure? 375 | Are we talking about the same Tom? Estem parlant del mateix Tom? 376 | Are we talking about the same Tom? Parlem del mateix Tom? 377 | Everyone hoped that she would win. Tothom esperava que guanyés. 378 | He was willing to work for others. Ell estava disposat a treballar per altres. 379 | I burned my fingers on a hot iron. Em vaig cremar els dits amb un ferro roent. 380 | I burned my fingers on a hot iron. Em vaig cremar els dits amb una planxa calenta. 381 | I have nothing in common with her. No tinc res en comú amb ella. 382 | I spend money as soon as I get it. Em gasto els diners de seguida que en tinc. 383 | I write letters that I never send. Escric cartes que no envio mai. 384 | Is Flight 123 going to be delayed? El vol 123, té retard? 385 | Last night we worked until 10 p.m. 
Ahir a la nit vàrem treballar fins a les deu. 386 | My mother knows how to make cakes. La meva mare sap com fer pastissos. 387 | Tell me your plans for the future. Explica'm els teus plans per al futur. 388 | Tell me your plans for the future. Conta'm els teus plans de futur. 389 | Tell me your plans for the future. Expliqueu-me els vostres plans per al futur. 390 | Thank you so much for inviting me. Moltes gràcies per la invitació. 391 | The plane took off exactly at six. L'avió s'enlairà a les sis clavades. 392 | Today's meeting has been canceled. La reunió d'avui ha sigut cancelada. 393 | Where's the nearest travel agency? On és l'agència de viatges més propera? 394 | Bangkok is Thailand's capital city. Bangkok és la capital de Tailàndia. 395 | Do you want to play tennis with us? Vols jugar a tennis amb nosaltres? 396 | He helped poor people all his life. Ell va ajudar els pobres tota la seva vida. 397 | Her husband is now living in Tokyo. El seu marit viu a Tòkio ara. 398 | I can't remember where I bought it. No puc recordar on el vaig comprar. 399 | I can't remember where I bought it. No recorde on el vaig comprar. 400 | I can't remember where I bought it. No recorde on ho vaig comprar. 401 | I can't remember where I bought it. No recorde on la vaig comprar. 402 | I can't remember where I bought it. No me'n recorde d'on ho vaig comprar. 403 | I can't remember where I bought it. No recordo on el vaig comprar. 404 | I heard a beautiful song yesterday. Ahir vaig sentir una cançó bonica. 405 | I thanked him for what he had done. Li vaig agrair el que va fer. 406 | I'd like to meet your older sister. Voldria trobar-me amb la teva germana gran. 407 | I'd like to meet your older sister. M'agradaria conèixer la teva germana gran. 408 | I'm the one who pays all the bills. Jo sóc qui paga totes les factures. 409 | I'm very slow at making up my mind. Sóc molt lent a l'hora de prendre decisions. 410 | I, too, didn't understand anything. Jo tampoc entenc res. 411 | Is there a post office around here? Hi ha alguna oficina postal per aquí? 412 | Is there a post office around here? Hi ha alguna oficina de correus prop d'ací? 413 | Is there a post office around here? Hi ha per ací alguna oficina de correus? 414 | The door is locked at nine o'clock. La porta es tanca amb clau a les nou. 415 | The lion is the king of the jungle. El lleó és el rei de la selva. 416 | These questions are easy to answer. Aquestes preguntes són fàcils de respondre. 417 | We are sorry for the inconvenience. Ens sap greu la molèstia causada. 418 | We're not going to change anything. No canviarem res. 419 | What little money I had was stolen. Els pocs diners que tenia me'ls van robar. 420 | A lot of jobs are done by computers. Moltes feines les fan els ordinadors. 421 | Do you wonder why no one trusts him? T'estranya que ningú hi confiï? 422 | Don't go to sleep with the light on. No et durmis amb el llum encès. 423 | I can't remember which is my racket. No recorde quina és la meua raqueta. 424 | I don't think we can take that risk. Crec que no podem córrer aquest risc. 425 | I don't think we can take that risk. Crec que no podem córrer eixe risc. 426 | I have nothing to say to any of you. No tinc res a dir-vos a cap de vosaltres. 427 | I was caught in a shower on the way. M'ha enxampat un xàfec pel camí. 428 | I'd like to reserve a table for two. M'agradaria reservar una taula per a dos. 429 | Look that word up in the dictionary. Cerca aquella paraula al diccionari. 430 | My apartment is on the fourth floor. 
El meu apartament està al quart pis. 431 | Night is when most people go to bed. La nit és quan la majoria de la gent se'n va al llit. 432 | Take this medicine before each meal. Preneu aquest medicament abans de cada àpat. 433 | Tom may talk to Mary if he wants to. En Tom, si vol, pot parlar amb la Mary. 434 | Tom may talk to Mary if he wants to. Tom pot parlar amb Mary, si vol. 435 | When did you come back from Germany? Quan vas tornar d'Alemanya? 436 | Flowers die if they don't have water. Sense aigua les flors es panseixen. 437 | His arrogance is no longer tolerable. La seva arrogància ja no és tolerable. 438 | His courage is worthy of high praise. La seva valentia mereix grans lloances. 439 | I planted an apple tree in my garden. He plantat un pomer al meu jardí. 440 | I really must have my watch repaired. He de dur el rellotge a arreglar. 441 | I'm sick. Will you send for a doctor? Estic malalt. Oi que avisaràs un metge? 442 | I'm sure of winning the championship. Estic segur de guanyar el campionat. 443 | It seems that he was a great athlete. Sembla que va ser un gran atleta. 444 | It's easier to have fun than to work. És més fàcil divertir-se que treballar. 445 | Please write to me from time to time. Escriu-me de tant en tant, sí? 446 | What are you going to eat for dinner? Que soparàs avui? 447 | What do you want to talk to me about? De què vols parlar amb mi? 448 | What's your opinion of Japanese food? Quina és la teva opinió sobre el menjar japonès? 449 | Everyone was listening very carefully. Tots estaven escoltant atentament. 450 | He is three years younger than Father. Ell és tres anys més jove que el pare. 451 | I don't know what has happened to him. No sé què li ha passat. 452 | I was the one who knocked on the door. Vaig ser jo qui va trucar a la porta. 453 | I'll make an exception just this once. Faré una excepció només per aquesta vegada. 454 | I'm the one who takes out the garbage. Jo sóc qui treu les escombraries. 455 | Japan imports a large quantity of oil. El Japó importa una gran quantitat de petroli. 456 | Mary's doctor advised her to exercise. El metge de la Mary li va aconsellar que fes exercici. 457 | Please correct me if I make a mistake. Si us plau, corregeix-me si m'equivoco. 458 | Will the work be finished by tomorrow? Estarà enllestida la feina per a demà? 459 | "Is she reading a book?" "Yes, she is." "Està llegint un llibre?" "Sí." 460 | "Is she reading a book?" "Yes, she is." "Està ella llegint un llibre?" "Sí." 461 | All my friends like playing videogames. A tots els meus amics els agraden els videojocs. 462 | As long as there's life, there is hope. Mentre hi ha vida, hi ha esperança. 463 | Blue lines on the map designate rivers. Les línies blaves al mapa designen rius. 464 | How much time do you spend on Facebook? Quant de temps passes a Facebook? 465 | I don't know whether it is true or not. No sé si és veritat o no. 466 | I don't think Tom was talking about me. No crec que Tom estigués parlant de mi. 467 | I have cookies for breakfast every day. Cada dia menjo galetes per esmorzar. 468 | I would like to visit New York someday. Un dia m'agradaria visitar New York. 469 | I've been waiting for this day to come. He estat esperant que arribi aquest dia. 470 | In Japan there are four seasons a year. Al Japó hi ha quatre estacions cada any. 471 | Mathematics is important in daily life. Les matemàtiques són importants a la vida diària. 472 | The Japanese economy developed rapidly. L'economia japonesa es va desenvolupar depressa. 
473 | The class was divided into four groups. La classe es va dividir en quatre grups. 474 | The earth is much larger than the moon. La Terra és molt més gran que la Lluna. 475 | They arrived late because of the storm. Ells van arribar tard a causa de la tempesta. 476 | They say golf is very popular in Japan. Diuen que el golf és molt popular al Japó. 477 | This is the best book I have ever read. És el millor llibre que he llegit mai. 478 | Tom is interested in French literature. En Tom està interessat en la literatura francesa. 479 | Tom is making great progress in French. En Tom està progressant molt amb el francès. 480 | He fought against racial discrimination. Va lluitar contra la discriminació racial. 481 | I know that there was a big church here. Sé que aquí hi havia una església gran. 482 | I noticed that she sat in the front row. Vaig notar que ella va seure a la fila del davant. 483 | President Clinton denied the accusation. El president Clinton va negar l'acusació. 484 | The men are wearing short sleeve shirts. Els homes porten camises de màniga curta. 485 | What do these dots represent on the map? Què signifiquen aquests punts al mapa? 486 | Will you please stop talking about food? Podries deixar de parlar de menjar? 487 | German is the best language in the world. L'alemany és la millor llengua del món. 488 | How many people are there in your family? Quants són a la seva família? 489 | I asked him many questions about ecology. Li vaig fer moltes preguntes sobre ecologia. 490 | I don't have the strength to keep trying. No tinc la força per continuar triant. 491 | I started learning English six years ago. Fa sis anys que vaig començar a aprendre anglès. 492 | I will ask him where he went last Sunday. Li preguntaré on va anar el diumenge. 493 | I'm surprised that he accepted the offer. Em sorprèn que acceptés l'oferiment. 494 | It is difficult to speak three languages. És difícil parlar tres llengues. 495 | There are many beautiful parks in London. A Londres hi han molts parcs bonics. 496 | Tom does everything he can to save money. En Tom fa tot el que pot per estalviar. 497 | I am sure of his winning the tennis match. Estic segur de la seva victòria al tennis. 498 | I don't know the reason why he went there. No sé el motiu pel qual va anar-hi. 499 | I'd like to know when you can send it out. M'agadaria saber quan ho pot enviar. 500 | Nothing happens unless you make it happen. No passa res si tu no fas que passi. 501 | This is the best book that I've ever read. És el millor llibre que he llegit mai. 502 | As we go up higher, the air becomes cooler. Com més amunt anem, més fresc és l'aire. 503 | Do you support or oppose the death penalty? Estàs a favor o en contra de la pena de mort? 504 | English is not easy, but it is interesting. L'anglès no és fàcil, però és interessant. 505 | I don't have anything to say to any of you. No tinc res a dir-vos a cap de vosaltres. 506 | I don't know for certain when he will come. No sé del cert quan vindrà. 507 | I eat a boiled egg for breakfast every day. Cada dia em menjo un ou dur per esmorzar. 508 | I have been studying French four years now. Fa quatre anys que estudio francès. 509 | I told you to be here on time this morning. Et vaig dir que havies de ser aquí puntual aquest matí. 510 | I'm fed up with him always preaching to me. Estic tip que em sermonegi constantment. 511 | I'm fed up with him always preaching to me. Estic tip dels seus sermons constants. 512 | It seems those two are made for each other. 
Sembla que aquell parell estan fets l'un per l'altre. 513 | This is the place where my father was born. Aquest és el lloc on va nèixer el meu pare. 514 | When will it be convenient for you to come? Quan li convendria venir? 515 | Give him this message the moment he arrives. Dóna-li aquest missatge quan arribi. 516 | I demanded that he pay the bill immediately. Li vaig demanar de pagar la factura immediatament. 517 | I feel like telling him what I think of him. Tinc ganes de dir-li què penso d'ell. 518 | I really need to take care of some business. He de tenir cura d'alguns negocis. 519 | I refused to eat until my parents came home. No vaig voler menjar fins que els meus pares no tornessin a casa. 520 | Japan imports great quantities of crude oil. El Japó importa una gran quantitat de petroli. 521 | She makes him do his homework before dinner. Ella l'obliga a fer els deures abans de sopar. 522 | They fell into the conversation immediately. Van passar al tema a l'instant. 523 | You should pay more attention to what I say. Deuries prestar més atenció a allò que dic. 524 | Both of them are unpredictable and impatient. Tots dos són impredictibles i impacients. 525 | Her explanation of the problem made no sense. La seva explicació del problema no tenia ni cap ni peus. 526 | I am going to do it whether you agree or not. Ho faré, estigueu o no d'acord amb mi. 527 | I didn't know you were that kind of a person. No sabia que eres així. 528 | I will take you to the zoo one of these days. Un dia d'aquests et portaré al zoo. 529 | My son has gone to America to study medicine. El meu fill ha anat a Amèrica a estudiar medicina. 530 | She says she brushes her teeth every morning. Ella diu que es raspatlla les dents tots els dematins. 531 | We need to invest in clean, renewable energy. Hem d'invertir en energia neta i renovable. 532 | He is one of the candidates running for mayor. És un dels candidats que es presenta per alcalde. 533 | I haven't got the nerve to ask you for a loan. No tinc valor per demanar-te un préstec. 534 | It is said that golf is very popular in Japan. Es diu que el golf és molt popular al Japó. 535 | It seems I'm going to be up all night tonight. Sembla que avui estaré despert tota la nit. 536 | Please wash your hands properly before eating. Siusplau renteu-vos les mans com cal abans de menjar. 537 | The urban population of America is increasing. La població urbana a Amèrica està creixent. 538 | I thought she was angry and would just go away. Vaig pensar que s'havia enfadat i que se n'aniria. 539 | It doesn't matter whether he comes late or not. No hi fa res si ve tard o no. 540 | She buys what she wants regardless of the cost. Compra el que vol sense fixar-se en el que val. 541 | She's curious to find out who sent the flowers. Ella té curiositat per saber qui va enviar les flors. 542 | Unfortunately, my birthday is only once a year. Malauradament, el meu aniversario només succeeix una vegada a l'any. 543 | What would it cost to have this chair repaired? Quant costaria arreglar aquesta cadira? 544 | Drink some coffee. It tastes very good, I think. Pren una mica de cafè. Té molt bon gust, crec. 545 | He and his sisters are currently living in Tokyo. En aquest moment, ell i les seves germanes viuen a Tòquio. 546 | He never fails to write to his mother every week. No passa una setmana que no li escrigui a la seva mare. 547 | I'm not interested in going to the baseball game. No tinc cap interès a anar al partit de beisbol. 548 | I'm sorry, but I can't find the book you lent me. 
Em sap greu, però no trobo el llibre que em vas deixar. 549 | If only I knew, I would tell you all that I knew. Si ho sabés, et diria tot el que sé. 550 | She tried to squeeze the juice out of the orange. Va provar d'escórrer la taronja. 551 | This story is far more interesting than that one. Aquesta història és molt més interessant que aquella. 552 | I took it for granted that he would pass the exam. Dono per descomptat que aprovarà l'examen. 553 | They insisted on my making use of the opportunity. Em varen insistir per a que aprofitès aquella oportunitat. 554 | Do you know which deity this temple is dedicated to? Sabeu a quina divinitat està dedicat aquest temple? 555 | Why don't we see if Tom wants to play cards with us? Perquè no mirem si en Tom vol jugar a les cartes amb niosaltres? 556 | I was glad to see that he finally came to his senses. Vaig estar content de veure que al final va posar-hi seny. 557 | It's difficult for me to express myself in Esperanto. Per mi és difícil expressar-me en esperanto. 558 | I want to live in a quiet city where the air is clean. Vull viure a una ciutat tranquila amb l'aire pur. 559 | If you don't want to be alone, I can keep you company. Si no vols estar sol, puc fer-te companyia. 560 | He will take over the business when his father retires. Ell continuarà el negoci quan son pare es jubili. 561 | My mother likes tulips very much and so does my sister. A ma mare li agraden molt les tulipes i a ma germana també. 562 | This cola has lost its fizz and doesn't taste any good. Aquesta cola s'ha esbravat i no té bon gust. 563 | Tom is accustomed to calling up girls on the telephone. En Tom acostuma a trucar noies. 564 | Cuzco is one of the most interesting places in the world. Cuzco és un dels indrets més interessants del món. 565 | I stayed in bed one more day just to be on the safe side. Em vaig quedar un dia més al llit per si de cas. 566 | Tom will likely be discharged from the hospital tomorrow. Demà donaran d'alta de l'hospital en Tom. 567 | "How are you feeling this morning?" "Pretty good, thanks." "Com et sents aquest matí?" "Bastant bé, gràcies." 568 | People of my generation all think the same way about this. Tota la gent de la meva generació pensen igual sobre això. 569 | The only useful answers are those that raise new questions. Les úniques respostes útils són les que creen noves preguntes. 570 | It takes us thirty minutes to walk from here to the station. D'aquí a l'estació triguem mitja hora a peu. 571 | The secret of longevity is to choose your parents carefully. El secret de la longevitat és triar amb compte els pares. 572 | It takes about 10 minutes to get to the train station by foot. Tens uns 10 minuts d'aquí a l'estació a peu. 573 | This medicine must not be placed within the reach of children. Aquest medicament no s'ha de deixar a la ma dels nins. 574 | You told her that you had finished the work three days before. Li vas dir que havies enllestit la feina feia tres dies. 575 | His father died, and to make matters worse, his mother fell ill. Son pare es va morir, i per acabar-ho d'adobar, sa mare es va posar malalta. 576 | Try to understand it in Spanish, without translating to English. Tracta d'entendre-ho amb espanyol, sense traduïr-lo amb anglès. 577 | We lost our way, and what was worse, we were caught in a shower. Ens vam perdre i, encara pitjor, ens va enxampar un xàfec. 578 | She's worried since she hasn't heard from her son for many months. Està amoïnada perquè fa mesos que no té notícia del seu fill. 
579 | I suspected that he was telling a lie, but that didn't surprise me. Sospitava que m'estava dient una mentida, però això no em va sorprendre. 580 | My daughter won't find it easy to get accustomed to the new school. La meva filla no trobarà fàcil per acostumar-se a la nova escola. 581 | The bullet penetrated his chest, leaving him in critical condition. La bala va penetrar al seu pit i el va deixar en estat crític. 582 | I wanted to buy the book, but I found I had no more than 200 yen with me. Volia comprar el llibre, però vaig adonar-me que no duia més de 200 iens. 583 | For the first time in more than 6 years, the unemployment rate is below 6%. Per primera vegada en més de 6 anys, la taxa d'atur està per davall del 6%. 584 | We would have bought the plane tickets if the price had been a little lower. Hauríem comprat els bitllets d'avió si el preu fos un pèl més baix. 585 | My friend has had three jobs in a year; he never sticks to anything for long. El meu amic ha treballat a tres llocs diferents en un any; res no li dura gaire. 586 | You can't park in a handicapped parking space unless you have a special permit. No pots aparcar a una plaça d'aparcament per discapacitats si no tens un permís especial. 587 | Drinking lots of water is good for you, sure, but one can't drink that much water at once. Beure molta aigua és bo per tu, segur, però no es pot beure tanta aigua de cop. 588 | We're going to make sure that no one is taking advantage of the American people for their own short-term gain. Ens assegurarem que ningú s'estiga aprofitant del poble americà per al seu propi interès a curt termini. 589 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cbk-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/cbk-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cbk-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 
44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ces-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ces-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ces-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cmn-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/cmn-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/cmn-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 
9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/dan-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/dan-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/dan-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 
44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/deu-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/deu-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ell-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ell-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/eng-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/eng-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/est-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/est-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/fin-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/fin-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/fra-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/fra-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/heb-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/heb-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/hin-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/hin-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/hrv-eng.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/hrv-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/hun-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/hun-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/hun-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 
61 | -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ind-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ind-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/isl-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/isl-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ita-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ita-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/jpn-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/jpn-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/kha-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/kha-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/khm-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/khm-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/kor-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/kor-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/lit-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/lit-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/lvs-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/lvs-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/mal-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/mal-eng.zip -------------------------------------------------------------------------------- 
/word_seq_2_seq/training-data/mar-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/mar-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/mkd-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/mkd-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/nds-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/nds-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/nld-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/nld-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/nob-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/nob-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/pes-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/pes-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/pol-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/pol-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/por-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/por-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ron-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ron-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/rus-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/rus-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/slk-eng.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/slk-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/spa-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/spa-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/srp-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/srp-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/swe-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/swe-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/tat-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/tat-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/tgl-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/tgl-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/tur-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/tur-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/ukr-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/ukr-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/urd-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/urd-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/vie-eng.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/vie-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/yue-eng.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geyang/char2wav_pytorch/b71ae72609a532eb315bccca0980f90c11b0060b/word_seq_2_seq/training-data/yue-eng.zip -------------------------------------------------------------------------------- /word_seq_2_seq/training-data/yue-eng/_about.txt: -------------------------------------------------------------------------------- 1 | ** Info ** 2 | 3 | Check for newest version here: 4 | http://www.manythings.org/anki/ 5 | Date of this file: 6 | 2017-01-14 7 | 8 | This data is from the sentences_detailed.csv file from tatoeba.org. 9 | http://tatoeba.org/files/downloads/sentences_detailed.csv 10 | 11 | 12 | 13 | ** Terms of Use ** 14 | 15 | See the terms of use. 16 | These files have been released under the same license as the 17 | source. 18 | 19 | http://tatoeba.org/eng/terms_of_use 20 | http://creativecommons.org/licenses/by/2.0 21 | 22 | Attribution: www.manythings.org/anki and tatoeba.org 23 | 24 | 25 | 26 | ** Warnings ** 27 | 28 | The data from the Tatoeba Project contains errors. 29 | 30 | To lower the number of errors you are likely to see, only 31 | sentences by native speakers and proofread sentences have 32 | been included. 33 | 34 | For the non-English language, I made these (possibly wrong) 35 | assumptions. 36 | Assumption 1: Sentences written by native speakers can be 37 | trusted. 38 | Assumption 2: Contributors to the Tatoeba Project are honest 39 | about what their native language is. 40 | 41 | For English, I used the sentences that I have proofread 42 | and thought were OK. 43 | Of course, I may have missed a few errors. 44 | 45 | 46 | 47 | ** Downloading Anki ** 48 | 49 | See http://ankisrs.net/ 50 | 51 | 52 | 53 | ** Importing into Anki ** 54 | 55 | Information is at http://ankisrs.net/docs/manual.html#importing 56 | 57 | Of particular interest may be about "duplicates" at http://ankisrs.net/docs/manual.html#duplicates-and-updating. 58 | You can choose: 59 | 1. not to allow duplicates (alternate translations) as cards. 60 | 2. allow duplicates (alternate translations) as cards. 61 | -------------------------------------------------------------------------------- /word_seq_2_seq/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from termcolor import cprint as _cprint, colored as c 3 | from pprint import pprint 4 | import traceback 5 | 6 | 7 | class Ledger(): 8 | def __init__(self, debug=True): 9 | self.is_debug = debug 10 | pass 11 | 12 | def p(self, *args, **kwargs): 13 | self.print(*args, **kwargs) 14 | 15 | def print(self, *args, **kwargs): 16 | """use stdout.flush to allow streaming to file when used by IPython. IPython doesn't have -u option.""" 17 | print(*args, **kwargs) 18 | sys.stdout.flush() 19 | 20 | def cp(self, *args, **kwargs): 21 | self.cprint(*args, **kwargs) 22 | 23 | def cprint(self, *args, sep=' ', color='white', **kwargs): 24 | """use stdout.flush to allow streaming to file when used by IPython. IPython doesn't have -u option.""" 25 | _cprint(sep.join([str(a) for a in args]), color, **kwargs) 26 | sys.stdout.flush() 27 | 28 | def pp(self, *args, **kwargs): 29 | self.pprint(*args, **kwargs) 30 | 31 | def pprint(self, *args, **kwargs): 32 | pprint(*args, **kwargs) 33 | sys.stdout.flush() 34 | 35 | def log(self, *args, **kwargs): 36 | """use stdout.flush to allow streaming to file when used by IPython. 
IPython doesn't have -u option.""" 37 | self.print(*args, **kwargs) 38 | 39 | # TODO: take a look at https://gist.github.com/FredLoney/5454553 40 | def debug(self, *args, **kwargs): 41 | # DONE: current call stack instead of last traceback instead of. 42 | if self.is_debug: 43 | stacks = traceback.extract_stack() 44 | last_caller = stacks[-2] 45 | path = last_caller.filename.split('/') 46 | self.white(path[-2], end='/') 47 | self.green(path[-1], end=' ') 48 | self.white('L', end='') 49 | self.red('{}:'.format(last_caller.lineno), end=' ') 50 | self.grey(last_caller.line) 51 | self.white('----------------------') 52 | self.print(*args, **kwargs) 53 | 54 | def refresh(self, *args, **kwargs): 55 | """allow keyword override of end='\r', so that only last print refreshes the console.""" 56 | # to prevent from creating new line 57 | # default new end to single space. 58 | if 'end' not in kwargs: 59 | kwargs['end'] = ' ' 60 | self.print('\r', *args, **kwargs) 61 | 62 | def info(self, *args, **kwargs): 63 | self.cprint(*args, color='blue', **kwargs) 64 | 65 | def error(self, *args, sep='', **kwargs): 66 | self.cprint(*args, color='red', **kwargs) 67 | 68 | def warn(self, *args, **kwargs): 69 | self.cprint(*args, color='yellow', **kwargs) 70 | 71 | def highlight(self, *args, **kwargs): 72 | self.cprint(*args, color='green', **kwargs) 73 | 74 | def green(self, *args, **kwargs): 75 | self.cprint(*args, color='green', **kwargs) 76 | 77 | def grey(self, *args, **kwargs): 78 | self.cprint(*args, color='grey', **kwargs) 79 | 80 | def red(self, *args, **kwargs): 81 | self.cprint(*args, color='red', **kwargs) 82 | 83 | def yellow(self, *args, **kwargs): 84 | self.cprint(*args, color='yellow', **kwargs) 85 | 86 | def blue(self, *args, **kwargs): 87 | self.cprint(*args, color='blue', **kwargs) 88 | 89 | def magenta(self, *args, **kwargs): 90 | self.cprint(*args, color='magenta', **kwargs) 91 | 92 | def cyan(self, *args, **kwargs): 93 | self.cprint(*args, color='cyan', **kwargs) 94 | 95 | def white(self, *args, **kwargs): 96 | self.cprint(*args, color='white', **kwargs) 97 | 98 | # def assert(self, statement, warning): 99 | # if not statement: 100 | # self.error(warning) 101 | # 102 | 103 | def raise_(self, exception, *args, **kwargs): 104 | self.error(*args, **kwargs) 105 | raise exception 106 | 107 | 108 | class Struct(): 109 | def __init__(self, **d): 110 | """Features: 111 | 0. Take in a list of keyword arguments in constructor, and assign them as attributes 112 | 1. Correctly handles `dir` command, so shows correct auto-completion in editors. 113 | 2. Correctly handles `vars` command, and returns a dictionary version of self. 114 | 115 | When recursive is set to False, 116 | """ 117 | # double underscore variables are mangled by python, so we use keyword argument dictionary instead. 118 | # Otherwise you will have to use __Struct_recursive = False instead. 119 | if '__recursive' in d: 120 | __recursive = d['__recursive'] 121 | del d['__recursive'] 122 | else: 123 | __recursive = True 124 | self.__is_recursive = __recursive 125 | # keep the input as a reference. Destructuring breaks this reference. 
126 | self.__d = d 127 | 128 | def __dir__(self): 129 | return self.__dict__.keys() 130 | 131 | def __str__(self): 132 | return str(self.__dict__) 133 | 134 | def __getattr__(self, key): 135 | value = self.__d[key] 136 | if type(value) == type({}) and self.__is_recursive: 137 | return Struct(**value) 138 | else: 139 | return value 140 | 141 | def __getattribute__(self, key): 142 | if key == "_Struct__d" or key == "__dict__": 143 | return super().__getattribute__("__d") 144 | elif key in ["_Struct__is_recursive", "__is_recursive"]: 145 | return super().__getattribute__("__is_recursive") 146 | else: 147 | return super().__getattr__(key) 148 | 149 | def __setattr__(self, key, value): 150 | if key == "_Struct__d": 151 | super().__setattr__("__d", value) 152 | elif key == "_Struct__is_recursive": 153 | super().__setattr__("__is_recursive", value) 154 | else: 155 | self.__d[key] = value 156 | 157 | 158 | ledger = Ledger() 159 | 160 | if __name__ == "__main__": 161 | import time 162 | 163 | # print('running test as main script...') 164 | # ledger.log('blah_1', 'blah_2') 165 | # for i in range(10): 166 | # ledger.refresh('{}: hahahaha'.format(i)) 167 | # ledger.green('hahaha', end=" ") 168 | # time.sleep(0.5) 169 | 170 | # test dictionary to object 171 | test_dict = { 172 | 'a': 0, 173 | 'b': 1 174 | } 175 | 176 | test_args = Struct(**test_dict) 177 | assert test_args.a == 0 178 | assert test_args.b == 1 179 | test_args.haha = 0 180 | assert test_args.haha == 0 181 | test_args.haha = {'a': 1} 182 | assert test_args.haha != {'a': 1} 183 | assert vars(test_args.haha) == {'a': 1} 184 | assert test_args.haha.a == 1 185 | assert test_args.__dict__['haha']['a'] == 1 186 | assert vars(test_args)['haha']['a'] == 1 187 | print(test_args) 188 | 189 | test_args = Struct(__recursive=False, **test_dict) 190 | assert test_args.__is_recursive == False 191 | assert test_args.a == 0 192 | assert test_args.b == 1 193 | test_args.haha = {'a': 1} 194 | assert test_args.haha['a'] == 1 195 | assert test_args.haha == {'a': 1} 196 | 197 | ledger.green('*Struct* tests have passed.') 198 | 199 | # Some other usage patterns 200 | test_args = Struct(**test_dict, **{'ha': 'ha', 'no': 'no'}) 201 | print(test_args.ha) 202 | --------------------------------------------------------------------------------
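Note on the training-data format: the pair files shown above (for example word_seq_2_seq/training-data/cat-eng/cat.txt) follow the manythings.org/anki export layout described in the _about.txt files, with one sentence pair per line, English first and the translation second, separated by a tab. The repository's own loader lives in word_seq_2_seq/data.py, which is not included in this dump; the snippet below is only a minimal, hypothetical sketch of reading such a file into (english, translation) tuples, under the stated assumption that the columns are tab-separated. The path comes from the tree above, while the helper name load_pairs is invented purely for illustration.

# Hypothetical sketch -- not the repository's data.py.
# Assumes the manythings.org/anki layout: "English<TAB>Translation" on each line.
from pathlib import Path


def load_pairs(path):
    """Read a Tatoeba/Anki pair file into a list of (english, translation) tuples."""
    pairs = []
    for line in Path(path).read_text(encoding='utf-8').splitlines():
        if not line.strip():
            continue  # skip blank lines
        fields = line.split('\t')
        if len(fields) >= 2:  # some exports append attribution columns; keep only the first two
            pairs.append((fields[0].strip(), fields[1].strip()))
    return pairs


if __name__ == '__main__':
    pairs = load_pairs('word_seq_2_seq/training-data/cat-eng/cat.txt')
    print(len(pairs), 'sentence pairs')
    print(pairs[0])

Under this reading, repeated English sentences with alternative translations (visible throughout cat.txt above) stay as separate pairs, which mirrors the "duplicates (alternate translations)" choice described in the Anki import notes.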