├── .gitignore ├── LICENSE ├── README.md ├── notes └── UsefulLinks.md ├── requirements.txt ├── setup.cfg ├── setup.py ├── translator_train ├── data │ ├── cmn.txt │ └── fra.txt ├── eng_to_cmn_char_translator_train.py ├── eng_to_cmn_word_translator_train.py ├── eng_to_cnm_glove_translator_train.py ├── eng_to_fra_char_translator_train.py ├── eng_to_fra_glove_translator_train.py ├── eng_to_fra_word_translator_train.py ├── models │ ├── eng-to-cmn │ │ ├── eng-to-cmn-char-architecture.json │ │ ├── eng-to-cmn-char-context.npy │ │ ├── eng-to-cmn-char-input-char2idx.npy │ │ ├── eng-to-cmn-char-input-idx2char.npy │ │ ├── eng-to-cmn-char-target-char2idx.npy │ │ ├── eng-to-cmn-char-target-idx2char.npy │ │ ├── eng-to-cmn-char-weights.h5 │ │ ├── eng-to-cmn-glove-architecture.json │ │ ├── eng-to-cmn-glove-context.npy │ │ ├── eng-to-cmn-glove-target-idx2word.npy │ │ ├── eng-to-cmn-glove-target-word2idx.npy │ │ ├── eng-to-cmn-glove-unknown-emb.npy │ │ ├── eng-to-cmn-glove-weights.h5 │ │ ├── eng-to-cmn-word-architecture.json │ │ ├── eng-to-cmn-word-context.npy │ │ ├── eng-to-cmn-word-input-idx2word.npy │ │ ├── eng-to-cmn-word-input-word2idx.npy │ │ ├── eng-to-cmn-word-target-idx2word.npy │ │ ├── eng-to-cmn-word-target-word2idx.npy │ │ └── eng-to-cmn-word-weights.h5 │ └── eng-to-fra │ │ ├── eng-to-fra-char-architecture.json │ │ ├── eng-to-fra-char-context.npy │ │ ├── eng-to-fra-char-input-char2idx.npy │ │ ├── eng-to-fra-char-input-idx2char.npy │ │ ├── eng-to-fra-char-target-char2idx.npy │ │ ├── eng-to-fra-char-target-idx2char.npy │ │ ├── eng-to-fra-char-weights.h5 │ │ ├── eng-to-fra-glove-architecture.json │ │ ├── eng-to-fra-glove-context.npy │ │ ├── eng-to-fra-glove-target-idx2word.npy │ │ ├── eng-to-fra-glove-target-word2idx.npy │ │ ├── eng-to-fra-glove-unknown-emb.npy │ │ ├── eng-to-fra-glove-weights.h5 │ │ ├── eng-to-fra-word-architecture.json │ │ ├── eng-to-fra-word-context.npy │ │ ├── eng-to-fra-word-input-idx2word.npy │ │ ├── eng-to-fra-word-input-word2idx.npy │ │ ├── eng-to-fra-word-target-idx2word.npy │ │ ├── eng-to-fra-word-target-word2idx.npy │ │ └── eng-to-fra-word-weights.h5 └── worksheet.py └── translator_web ├── __init__.py ├── eng_to_cmn_char_translator_predict.py ├── eng_to_cmn_glove_translator_predict.py ├── eng_to_cmn_word_translator_predict.py ├── eng_to_fra_char_translator_predict.py ├── eng_to_fra_glove_translator_predict.py ├── eng_to_fra_word_translator_predict.py ├── flaskr.py ├── static └── style.css └── templates ├── eng_to_cmn_char_translator.html ├── eng_to_cmn_char_translator_result.html ├── eng_to_cmn_word_glove_translator.html ├── eng_to_cmn_word_glove_translator_result.html ├── eng_to_cmn_word_translator.html ├── eng_to_cmn_word_translator_result.html ├── eng_to_fra_char_translator.html ├── eng_to_fra_char_translator_result.html ├── eng_to_fra_word_glove_translator.html ├── eng_to_fra_word_glove_translator_result.html ├── eng_to_fra_word_translator.html ├── eng_to_fra_word_translator_result.html ├── home.html └── layout.html /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | .idea/ 10 | *.iml 11 | 12 | Thumbs.db 13 | 14 | translator_web/uploads 15 | translator_train/very_large_data 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | *.egg-info/ 33 
| .installed.cfg 34 | *.egg 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # dotenv 91 | .env 92 | 93 | # virtualenv 94 | .venv 95 | venv/ 96 | ENV/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xianshun Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # keras-language-translator-web-api 2 | 3 | A simple language translator implemented in Keras, served over the web with Flask 4 | 5 | The translator is built on seq2seq models and can run inference at either the character level or the word level. 6 | 7 | The seq2seq model is implemented as an LSTM encoder-decoder in Keras. 8 |
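Every training script in translator_train wires up essentially the same encoder-decoder graph. Below is a minimal sketch of that wiring (shown with one-hot character-level inputs; the vocabulary sizes are illustrative, since the real scripts compute them from the data). The actual scripts add data preparation, checkpointing, and model persistence around this core:

```python
from keras.models import Model
from keras.layers import Input, LSTM, Dense

HIDDEN_UNITS = 256
num_encoder_tokens = 71  # illustrative; computed from the dataset in the real scripts
num_decoder_tokens = 94  # illustrative

# Encoder: read the source sequence and keep only the final LSTM states.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
_, state_h, state_c = LSTM(HIDDEN_UNITS, return_state=True)(encoder_inputs)

# Decoder: generate the target sequence, seeded with the encoder states.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_outputs, _, _ = LSTM(HIDDEN_UNITS, return_sequences=True, return_state=True)(
    decoder_inputs, initial_state=[state_h, state_c])
decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
```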
9 | # Usage 10 | 11 | Run the following command to install Keras, Flask, and the other dependencies: 12 | 13 | ```bash 14 | sudo pip install -r requirements.txt 15 | ``` 16 | 17 | The translator models are trained on the eng-to-french and eng-to-chinese datasets and are available in the 18 | translator_train/models directory. At runtime, the flask app loads these trained models to perform the 19 | translation. 20 | 21 | Currently, only the eng-to-chinese and eng-to-french translation models are provided as examples. You can 22 | go to [http://www.manythings.org/anki/](http://www.manythings.org/anki/) to download more datasets for translator 23 | training and use the scripts in translator_train to train new seq2seq models for other language pairs. 24 | 25 | ## Training (Optional) 26 | 27 | As the trained models are already included in the "translator_train/models" folder of the project, training is 28 | not required. However, if you would like to tune the parameters of the seq2seq models and retrain them, you can use the 29 | following command to run the training: 30 | 31 | ```bash 32 | cd translator_train 33 | python eng_to_cmn_char_translator_train.py 34 | ``` 35 | 36 | The above command will train a seq2seq model on the eng-to-chinese dataset at the character level and store the trained model 37 | in "translator_train/models/eng-to-cmn/eng-to-cmn-char-**" 38 | 39 | If you would like to train other models, you can run the same command with the other training scripts: 40 | 41 | * eng_to_cmn_word_translator_train.py: trains on eng-to-chinese at the word level (one-hot encoding) 42 | * eng_to_cmn_glove_translator_train.py: trains on eng-to-chinese at the word level (GloVe encoding) 43 | * eng_to_fra_char_translator_train.py: trains on eng-to-french at the character level 44 | * eng_to_fra_word_translator_train.py: trains on eng-to-french at the word level (one-hot encoding) 45 | * eng_to_fra_glove_translator_train.py: trains on eng-to-french at the word level (GloVe encoding) 46 | 47 | ## Running the Web API Server 48 | 49 | Go to the translator_web directory and run the following command: 50 | 51 | ```bash 52 | python flaskr.py 53 | ``` 54 | 55 | Now navigate your browser to http://localhost:5000, where you can try out various predictors built with the following 56 | trained seq2seq models: 57 | 58 | * Character-level seq2seq models 59 | * Word-level seq2seq models (one-hot encoding) 60 | * Word-level seq2seq models (GloVe encoding) 61 | 62 | ## Invoke the Web API 63 | 64 | To translate an English sentence into another language via the web API, start the flask server and then run the following curl POST query 65 | in your terminal: 66 | 67 | ```bash 68 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"level_type", "sentence":"your_sentence_here", "target_lang":"target_language"}' http://localhost:5000/translate_eng 69 | ``` 70 | 71 | The level_type can be "char", "word", or "word-glove"; the target_lang can be "chinese" or "french". 72 | 73 | (Note that the same results can be obtained by running a curl GET query to http://localhost:5000/translate_eng?sentence=your_sentence_here&level=level_type&target_lang=target_language) 74 |
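If you prefer calling the API from Python, here is a minimal sketch using the `requests` library (an assumption on my part: `requests` is not listed in requirements.txt, so install it separately with `pip install requests`):

```python
import requests

# Assumes the flask server (translator_web/flaskr.py) is running locally on port 5000.
response = requests.post(
    'http://localhost:5000/translate_eng',
    json={'level': 'word', 'sentence': 'Be nice.', 'target_lang': 'chinese'},
)
print(response.json()['translated'])
```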
75 | For example, you can translate the sentence "Be nice." by running the following command: 76 | 77 | ```bash 78 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"word", "sentence":"Be nice.", "target_lang":"chinese"}' http://localhost:5000/translate_eng 79 | ``` 80 | 81 | And the following will be the JSON response: 82 | 83 | ```json 84 | { 85 | "level": "word", 86 | "sentence": "Be nice.", 87 | "target_lang": "chinese", 88 | "translated": "和气点。" 89 | } 90 | ``` 91 | 92 | Here are some more examples of English translation using other configuration options: 93 | 94 | ```bash 95 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"char", "sentence":"Be nice.", "target_lang":"chinese"}' http://localhost:5000/translate_eng 96 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"word-glove", "sentence":"Be nice.", "target_lang":"chinese"}' http://localhost:5000/translate_eng 97 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"word", "sentence":"Be nice.", "target_lang":"french"}' http://localhost:5000/translate_eng 98 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"word-glove", "sentence":"Be nice.", "target_lang":"french"}' http://localhost:5000/translate_eng 99 | curl -H 'Content-Type: application/json' -X POST -d '{"level":"char", "sentence":"Be nice.", "target_lang":"french"}' http://localhost:5000/translate_eng 100 | ``` 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /notes/UsefulLinks.md: -------------------------------------------------------------------------------- 1 | # Tutorial on character-based seq2seq Keras models for language translation 2 | 3 | * [https://blog.keras.io/category/tutorials.html](https://blog.keras.io/category/tutorials.html) 4 | 5 | # Language Translation Datasets 6 | 7 | * [http://www.manythings.org/anki/](http://www.manythings.org/anki/) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask == 0.12.2 2 | gevent 3 | keras 4 | numpy 5 | nltk 6 | h5py 7 | pillow 8 | https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.11.0-cp27-none-linux_x86_64.whl -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='translator_web', 5 | packages=['translator_web'], 6 | include_package_data=True, 7 | install_requires=[ 8 | 'flask', 9 | 'keras', 10 | 'sklearn' 11 | ], 12 | setup_requires=[ 13 | 'pytest-runner', 14 | ], 15 | tests_require=[ 16 | 'pytest', 17 | ], 18 | ) -------------------------------------------------------------------------------- /translator_train/eng_to_cmn_char_translator_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from keras.models import Model 3 | from keras.layers import Input, LSTM, Dense 4 | import numpy as np 5 | from keras.callbacks import ModelCheckpoint 6 | 7 | BATCH_SIZE = 64 8 | NUM_EPOCHS = 100 9 | HIDDEN_UNITS = 256 10 | NUM_SAMPLES = 10000 11 | DATA_PATH = 'data/cmn.txt' 12 | WEIGHT_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-char-weights.h5' 13
| ARCHITECTURE_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-char-architecture.json' 14 | 15 | input_texts = [] 16 | target_texts = [] 17 | input_characters = set() 18 | target_characters = set() 19 | 20 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 21 | 22 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 23 | input_text, target_text = line.split('\t') 24 | target_text = '\t' + target_text + '\n' 25 | input_texts.append(input_text) 26 | target_texts.append(target_text) 27 | for char in input_text: 28 | if char not in input_characters: 29 | input_characters.add(char) 30 | for char in target_text: 31 | if char not in target_characters: 32 | target_characters.add(char) 33 | 34 | input_characters = sorted(list(input_characters)) 35 | target_characters = sorted(list(target_characters)) 36 | num_encoder_tokens = len(input_characters) 37 | num_decoder_tokens = len(target_characters) 38 | max_encoder_seq_length = max([len(txt) for txt in input_texts]) 39 | max_decoder_seq_length = max([len(txt) for txt in target_texts]) 40 | 41 | input_char2idx = dict([(char, i) for i, char in enumerate(input_characters)]) 42 | input_idx2char = dict([(i, char) for i, char in enumerate(input_characters)]) 43 | target_char2idx = dict([(char, i) for i, char in enumerate(target_characters)]) 44 | target_idx2char = dict([(i, char) for i, char in enumerate(target_characters)]) 45 | 46 | np.save('models/eng-to-cmn/eng-to-cmn-char-input-char2idx.npy', input_char2idx) 47 | np.save('models/eng-to-cmn/eng-to-cmn-char-target-char2idx.npy', target_char2idx) 48 | np.save('models/eng-to-cmn/eng-to-cmn-char-input-idx2char.npy', input_idx2char) 49 | np.save('models/eng-to-cmn/eng-to-cmn-char-target-idx2char.npy', target_idx2char) 50 | 51 | context = dict() 52 | context['max_encoder_seq_length'] = max_encoder_seq_length 53 | context['max_decoder_seq_length'] = max_decoder_seq_length 54 | context['num_encoder_tokens'] = num_encoder_tokens 55 | context['num_decoder_tokens'] = num_decoder_tokens 56 | 57 | np.save('models/eng-to-cmn/eng-to-cmn-char-context.npy', context) 58 | 59 | encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32') 60 | decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') 61 | decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') 62 | 63 | for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)): 64 | for t, char in enumerate(input_text): 65 | encoder_input_data[i, t, input_char2idx[char]] = 1 66 | for t, char in enumerate(target_text): 67 | decoder_input_data[i, t, target_char2idx[char]] = 1 68 | if t > 0: 69 | decoder_target_data[i, t-1, target_char2idx[char]] = 1 70 | 71 | 72 | encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs') 73 | encoder = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 74 | encoder_outputs, state_h, state_c = encoder(encoder_inputs) 75 | encoder_states = [state_h, state_c] 76 | 77 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 78 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 79 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 80 | decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense') 81 | decoder_outputs = decoder_dense(decoder_outputs) 82 | 83 | model = 
Model([encoder_inputs, decoder_inputs], decoder_outputs) 84 | json = model.to_json() 85 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 86 | 87 | model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 88 | checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 89 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 90 | validation_split=0.2, callbacks=[checkpoint]) 91 | 92 | 93 | model.save_weights(WEIGHT_FILE_PATH) 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /translator_train/eng_to_cmn_word_translator_train.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.callbacks import ModelCheckpoint 3 | from keras.layers.recurrent import LSTM 4 | from keras.layers import Dense, Input, Embedding 5 | from keras.preprocessing.sequence import pad_sequences 6 | from collections import Counter 7 | import nltk 8 | import numpy as np 9 | 10 | BATCH_SIZE = 64 11 | NUM_EPOCHS = 100 12 | HIDDEN_UNITS = 256 13 | NUM_SAMPLES = 10000 14 | MAX_VOCAB_SIZE = 10000 15 | DATA_PATH = 'data/cmn.txt' 16 | WEIGHT_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-word-weights.h5' 17 | ARCHITECTURE_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-word-architecture.json' 18 | 19 | input_counter = Counter() 20 | target_counter = Counter() 21 | 22 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 23 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 24 | input_text, target_text = line.split('\t') 25 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 26 | target_text = '\t' + target_text + '\n' 27 | for w in input_words: 28 | input_counter[w] += 1 29 | for char in target_text: 30 | target_counter[char] += 1 31 | 32 | input_word2idx = dict() 33 | target_word2idx = dict() 34 | for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)): 35 | input_word2idx[word[0]] = idx + 2 36 | for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)): 37 | target_word2idx[word[0]] = idx 38 | 39 | input_word2idx['PAD'] = 0 40 | input_word2idx['UNK'] = 1 41 | 42 | input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()]) 43 | target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) 44 | 45 | num_encoder_tokens = len(input_idx2word) 46 | num_decoder_tokens = len(target_idx2word) 47 | 48 | np.save('models/eng-to-cmn/eng-to-cmn-word-input-word2idx.npy', input_word2idx) 49 | np.save('models/eng-to-cmn/eng-to-cmn-word-input-idx2word.npy', input_idx2word) 50 | np.save('models/eng-to-cmn/eng-to-cmn-word-target-word2idx.npy', target_word2idx) 51 | np.save('models/eng-to-cmn/eng-to-cmn-word-target-idx2word.npy', target_idx2word) 52 | 53 | encoder_input_data = [] 54 | 55 | encoder_max_seq_length = 0 56 | decoder_max_seq_length = 0 57 | 58 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 59 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 60 | input_text, target_text = line.split('\t') 61 | target_text = '\t' + target_text + '\n' 62 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 63 | encoder_input_wids = [] 64 | for w in input_words: 65 | w2idx = 1 # default [UNK] 66 | if w in input_word2idx: 67 | w2idx = input_word2idx[w] 68 | encoder_input_wids.append(w2idx) 69 | 70 | encoder_input_data.append(encoder_input_wids) 71 | encoder_max_seq_length = 
max(len(encoder_input_wids), encoder_max_seq_length) 72 | decoder_max_seq_length = max(len(target_text), decoder_max_seq_length) 73 | 74 | encoder_input_data = pad_sequences(encoder_input_data, encoder_max_seq_length) 75 | 76 | decoder_target_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 77 | decoder_input_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 78 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 79 | for lineIdx, line in enumerate(lines[: min(NUM_SAMPLES, len(lines)-1)]): 80 | _, target_text = line.split('\t') 81 | target_text = '\t' + target_text + '\n' 82 | for idx, char in enumerate(target_text): 83 | if char in target_word2idx: 84 | w2idx = target_word2idx[char] 85 | decoder_input_data[lineIdx, idx, w2idx] = 1 86 | if idx > 0: 87 | decoder_target_data[lineIdx, idx-1, w2idx] = 1 88 | 89 | context = dict() 90 | context['num_encoder_tokens'] = num_encoder_tokens 91 | context['num_decoder_tokens'] = num_decoder_tokens 92 | context['encoder_max_seq_length'] = encoder_max_seq_length 93 | context['decoder_max_seq_length'] = decoder_max_seq_length 94 | 95 | np.save('models/eng-to-cmn/eng-to-cmn-word-context.npy', context) 96 | 97 | encoder_inputs = Input(shape=(None, ), name='encoder_inputs') 98 | encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS, 99 | input_length=encoder_max_seq_length, name='encoder_embedding') 100 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') 101 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) 102 | encoder_states = [encoder_state_h, encoder_state_c] 103 | 104 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 105 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') 106 | decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, 107 | initial_state=encoder_states) 108 | decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense') 109 | decoder_outputs = decoder_dense(decoder_outputs) 110 | 111 | model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 112 | 113 | model.compile(loss='categorical_crossentropy', optimizer='rmsprop') 114 | 115 | 116 | json = model.to_json() 117 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 118 | 119 | checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 120 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 121 | verbose=1, validation_split=0.2, callbacks=[checkpoint]) 122 | 123 | model.save_weights(WEIGHT_FILE_PATH) 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /translator_train/eng_to_cnm_glove_translator_train.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers.recurrent import LSTM 3 | from keras.layers import Dense, Input, Embedding 4 | from keras.preprocessing.sequence import pad_sequences 5 | from collections import Counter 6 | from keras.callbacks import ModelCheckpoint 7 | import nltk 8 | import numpy as np 9 | import os 10 | import zipfile 11 | import sys 12 | import urllib.request 13 | 14 | BATCH_SIZE = 64 15 | NUM_EPOCHS = 100 16 | HIDDEN_UNITS = 256 17 | NUM_SAMPLES = 10000 18 | MAX_VOCAB_SIZE = 10000 19 | 
GLOVE_EMBEDDING_SIZE = 100 20 | DATA_PATH = 'data/cmn.txt' 21 | 22 | target_counter = Counter() 23 | 24 | GLOVE_MODEL = "very_large_data/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt" 25 | WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,' 26 | WEIGHT_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-glove-weights.h5' 27 | ARCHITECTURE_FILE_PATH = 'models/eng-to-cmn/eng-to-cmn-glove-architecture.json' 28 | 29 | def in_white_list(_word): 30 | for char in _word: 31 | if char in WHITELIST: 32 | return True 33 | 34 | return False 35 | 36 | 37 | def reporthook(block_num, block_size, total_size): 38 | read_so_far = block_num * block_size 39 | if total_size > 0: 40 | percent = read_so_far * 1e2 / total_size 41 | s = "\r%5.1f%% %*d / %d" % ( 42 | percent, len(str(total_size)), read_so_far, total_size) 43 | sys.stderr.write(s) 44 | if read_so_far >= total_size: # near the end 45 | sys.stderr.write("\n") 46 | else: # total size is unknown 47 | sys.stderr.write("read %d\n" % (read_so_far,)) 48 | 49 | 50 | def download_glove(): 51 | if not os.path.exists(GLOVE_MODEL): 52 | 53 | glove_zip = 'very_large_data/glove.6B.zip' 54 | 55 | if not os.path.exists('very_large_data'): 56 | os.makedirs('very_large_data') 57 | 58 | if not os.path.exists(glove_zip): 59 | print('glove file does not exist, downloading from internet') 60 | urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip, 61 | reporthook=reporthook) 62 | 63 | print('unzipping glove file') 64 | zip_ref = zipfile.ZipFile(glove_zip, 'r') 65 | zip_ref.extractall('very_large_data') 66 | zip_ref.close() 67 | 68 | 69 | def load_glove(): 70 | download_glove() 71 | _word2em = {} 72 | file = open(GLOVE_MODEL, mode='rt', encoding='utf8') 73 | for line in file: 74 | words = line.strip().split() 75 | word = words[0] 76 | embeds = np.array(words[1:], dtype=np.float32) 77 | _word2em[word] = embeds 78 | file.close() 79 | return _word2em 80 | 81 | word2em = load_glove() 82 | 83 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 84 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 85 | input_text, target_text = line.split('\t') 86 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 87 | target_text = '\t' + target_text + '\n' 88 | for char in target_text: 89 | target_counter[char] += 1 90 | 91 | target_word2idx = dict() 92 | for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)): 93 | target_word2idx[word[0]] = idx 94 | 95 | target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) 96 | 97 | num_decoder_tokens = len(target_idx2word) 98 | 99 | np.save('models/eng-to-cmn/eng-to-cmn-glove-target-word2idx.npy', target_word2idx) 100 | np.save('models/eng-to-cmn/eng-to-cmn-glove-target-idx2word.npy', target_idx2word) 101 | 102 | unknown_emb = np.random.randn(GLOVE_EMBEDDING_SIZE) 103 | 104 | np.save('models/eng-to-cmn/eng-to-cmn-glove-unknown-emb.npy', unknown_emb) 105 | 106 | encoder_max_seq_length = 0 107 | decoder_max_seq_length = 0 108 | 109 | input_texts_word2em = [] 110 | 111 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 112 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 113 | input_text, target_text = line.split('\t') 114 | target_text = '\t' + target_text + '\n' 115 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 116 | encoder_input_wids = [] 117 | for w in input_words: 118 | em = unknown_emb 119 | if w in word2em: 120 | em = word2em[w] 121 | encoder_input_wids.append(em) 122 | 123 | 
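# Note: despite its name, encoder_input_wids holds GloVe embedding vectors here
# (one 100-d vector per input word), not word indices; unknown words fall back
# to the random unknown_emb vector saved above.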
input_texts_word2em.append(encoder_input_wids) 124 | encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length) 125 | decoder_max_seq_length = max(len(target_text), decoder_max_seq_length) 126 | 127 | encoder_input_data = pad_sequences(input_texts_word2em, encoder_max_seq_length) 128 | 129 | decoder_target_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 130 | decoder_input_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 131 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 132 | for lineIdx, line in enumerate(lines[: min(NUM_SAMPLES, len(lines)-1)]): 133 | _, target_text = line.split('\t') 134 | target_text = '\t' + target_text + '\n' 135 | for idx, char in enumerate(target_text): 136 | if char in target_word2idx: 137 | w2idx = target_word2idx[char] 138 | decoder_input_data[lineIdx, idx, w2idx] = 1 139 | if idx > 0: 140 | decoder_target_data[lineIdx, idx-1, w2idx] = 1 141 | 142 | context = dict() 143 | context['num_decoder_tokens'] = num_decoder_tokens 144 | context['encoder_max_seq_length'] = encoder_max_seq_length 145 | context['decoder_max_seq_length'] = decoder_max_seq_length 146 | 147 | np.save('models/eng-to-cmn/eng-to-cmn-glove-context.npy', context) 148 | 149 | encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') 150 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') 151 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) 152 | encoder_states = [encoder_state_h, encoder_state_c] 153 | 154 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 155 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') 156 | decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, 157 | initial_state=encoder_states) 158 | decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense') 159 | decoder_outputs = decoder_dense(decoder_outputs) 160 | 161 | model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 162 | 163 | model.compile(loss='categorical_crossentropy', optimizer='rmsprop') 164 | 165 | json = model.to_json() 166 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 167 | 168 | checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 169 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 170 | verbose=1, validation_split=0.2, callbacks=[checkpoint]) 171 | 172 | model.save_weights(WEIGHT_FILE_PATH) 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /translator_train/eng_to_fra_char_translator_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from keras.models import Model 3 | from keras.layers import Input, LSTM, Dense 4 | import numpy as np 5 | from keras.callbacks import ModelCheckpoint 6 | 7 | BATCH_SIZE = 64 8 | NUM_EPOCHS = 100 9 | HIDDEN_UNITS = 256 10 | NUM_SAMPLES = 10000 11 | DATA_PATH = 'data/fra.txt' 12 | WEIGHT_FILE_PATH = 'models/eng-to-fra/eng-to-fra-char-weights.h5' 13 | ARCHITECTURE_FILE_PATH = 'models/eng-to-fra/eng-to-fra-char-architecture.json' 14 | 15 | input_texts = [] 16 | target_texts = [] 17 | input_characters = set() 18 | target_characters = set() 19 | 20 | lines = open(DATA_PATH, 'rt', 
encoding='utf8').read().split('\n') 21 | 22 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 23 | input_text, target_text = line.split('\t') 24 | target_text = '\t' + target_text + '\n' 25 | input_texts.append(input_text) 26 | target_texts.append(target_text) 27 | for char in input_text: 28 | if char not in input_characters: 29 | input_characters.add(char) 30 | for char in target_text: 31 | if char not in target_characters: 32 | target_characters.add(char) 33 | 34 | input_characters = sorted(list(input_characters)) 35 | target_characters = sorted(list(target_characters)) 36 | num_encoder_tokens = len(input_characters) 37 | num_decoder_tokens = len(target_characters) 38 | max_encoder_seq_length = max([len(txt) for txt in input_texts]) 39 | max_decoder_seq_length = max([len(txt) for txt in target_texts]) 40 | 41 | input_char2idx = dict([(char, i) for i, char in enumerate(input_characters)]) 42 | input_idx2char = dict([(i, char) for i, char in enumerate(input_characters)]) 43 | target_char2idx = dict([(char, i) for i, char in enumerate(target_characters)]) 44 | target_idx2char = dict([(i, char) for i, char in enumerate(target_characters)]) 45 | 46 | np.save('models/eng-to-fra/eng-to-fra-char-input-char2idx.npy', input_char2idx) 47 | np.save('models/eng-to-fra/eng-to-fra-char-target-char2idx.npy', target_char2idx) 48 | np.save('models/eng-to-fra/eng-to-fra-char-input-idx2char.npy', input_idx2char) 49 | np.save('models/eng-to-fra/eng-to-fra-char-target-idx2char.npy', target_idx2char) 50 | 51 | context = dict() 52 | context['max_encoder_seq_length'] = max_encoder_seq_length 53 | context['max_decoder_seq_length'] = max_decoder_seq_length 54 | context['num_encoder_tokens'] = num_encoder_tokens 55 | context['num_decoder_tokens'] = num_decoder_tokens 56 | 57 | np.save('models/eng-to-fra/eng-to-fra-char-context.npy', context) 58 | 59 | encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32') 60 | decoder_input_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') 61 | decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32') 62 | 63 | for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)): 64 | for t, char in enumerate(input_text): 65 | encoder_input_data[i, t, input_char2idx[char]] = 1 66 | for t, char in enumerate(target_text): 67 | decoder_input_data[i, t, target_char2idx[char]] = 1 68 | if t > 0: 69 | decoder_target_data[i, t-1, target_char2idx[char]] = 1 70 | 71 | 72 | encoder_inputs = Input(shape=(None, num_encoder_tokens), name='encoder_inputs') 73 | encoder = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 74 | encoder_outputs, state_h, state_c = encoder(encoder_inputs) 75 | encoder_states = [state_h, state_c] 76 | 77 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 78 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 79 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 80 | decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense') 81 | decoder_outputs = decoder_dense(decoder_outputs) 82 | 83 | model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 84 | 85 | model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 86 | 87 | json = model.to_json() 88 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 89 | 90 | checkpoint = 
ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 91 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 92 | validation_split=0.2, callbacks=[checkpoint]) 93 | 94 | model.save_weights(WEIGHT_FILE_PATH) 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /translator_train/eng_to_fra_glove_translator_train.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.callbacks import ModelCheckpoint 3 | from keras.layers.recurrent import LSTM 4 | from keras.layers import Dense, Input, Embedding 5 | from keras.preprocessing.sequence import pad_sequences 6 | from collections import Counter 7 | import nltk 8 | import numpy as np 9 | import os 10 | import sys 11 | import zipfile 12 | import urllib.request 13 | 14 | BATCH_SIZE = 64 15 | NUM_EPOCHS = 100 16 | HIDDEN_UNITS = 256 17 | NUM_SAMPLES = 10000 18 | MAX_VOCAB_SIZE = 10000 19 | GLOVE_EMBEDDING_SIZE = 100 20 | DATA_PATH = 'data/fra.txt' 21 | WEIGHT_FILE_PATH = 'models/eng-to-fra/eng-to-fra-glove-weights.h5' 22 | ARCHITECTURE_FILE_PATH = 'models/eng-to-fra/eng-to-fra-glove-architecture.json' 23 | 24 | target_counter = Counter() 25 | 26 | GLOVE_MODEL = "very_large_data/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt" 27 | WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,' 28 | 29 | 30 | def in_white_list(_word): 31 | for char in _word: 32 | if char in WHITELIST: 33 | return True 34 | 35 | return False 36 | 37 | 38 | def reporthook(block_num, block_size, total_size): 39 | read_so_far = block_num * block_size 40 | if total_size > 0: 41 | percent = read_so_far * 1e2 / total_size 42 | s = "\r%5.1f%% %*d / %d" % ( 43 | percent, len(str(total_size)), read_so_far, total_size) 44 | sys.stderr.write(s) 45 | if read_so_far >= total_size: # near the end 46 | sys.stderr.write("\n") 47 | else: # total size is unknown 48 | sys.stderr.write("read %d\n" % (read_so_far,)) 49 | 50 | 51 | def download_glove(): 52 | if not os.path.exists(GLOVE_MODEL): 53 | 54 | glove_zip = 'very_large_data/glove.6B.zip' 55 | 56 | if not os.path.exists('very_large_data'): 57 | os.makedirs('very_large_data') 58 | 59 | if not os.path.exists(glove_zip): 60 | print('glove file does not exist, downloading from internet') 61 | urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip, 62 | reporthook=reporthook) 63 | 64 | print('unzipping glove file') 65 | zip_ref = zipfile.ZipFile(glove_zip, 'r') 66 | zip_ref.extractall('very_large_data') 67 | zip_ref.close() 68 | 69 | 70 | def load_glove(): 71 | download_glove() 72 | _word2em = {} 73 | file = open(GLOVE_MODEL, mode='rt', encoding='utf8') 74 | for line in file: 75 | words = line.strip().split() 76 | word = words[0] 77 | embeds = np.array(words[1:], dtype=np.float32) 78 | _word2em[word] = embeds 79 | file.close() 80 | return _word2em 81 | 82 | word2em = load_glove() 83 | 84 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 85 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 86 | input_text, target_text = line.split('\t') 87 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 88 | target_text = 'START ' + target_text.lower() + ' END' 89 | target_words = [w for w in nltk.word_tokenize(target_text)] 90 | for w in target_words: 91 | target_counter[w] += 1 92 | 93 | target_word2idx = dict() 94 | for idx, word in 
enumerate(target_counter.most_common(MAX_VOCAB_SIZE)): 95 | target_word2idx[word[0]] = idx + 1 96 | 97 | target_word2idx['UNK'] = 0 98 | 99 | target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) 100 | 101 | num_decoder_tokens = len(target_idx2word) 102 | 103 | np.save('models/eng-to-fra/eng-to-fra-glove-target-word2idx.npy', target_word2idx) 104 | np.save('models/eng-to-fra/eng-to-fra-glove-target-idx2word.npy', target_idx2word) 105 | 106 | unknown_emb = np.random.randn(GLOVE_EMBEDDING_SIZE) 107 | 108 | np.save('models/eng-to-fra/eng-to-fra-glove-unknown-emb', unknown_emb) 109 | 110 | encoder_input_data = [] 111 | 112 | encoder_max_seq_length = 0 113 | decoder_max_seq_length = 0 114 | 115 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 116 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 117 | input_text, target_text = line.split('\t') 118 | target_text = 'START ' + target_text.lower() + ' END' 119 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 120 | target_words = [w for w in nltk.word_tokenize(target_text)] 121 | encoder_input_emb = [] 122 | for w in input_words: 123 | emb = unknown_emb 124 | if w in word2em: 125 | emb = word2em[w] 126 | encoder_input_emb.append(emb) 127 | 128 | encoder_input_data.append(encoder_input_emb) 129 | encoder_max_seq_length = max(len(encoder_input_emb), encoder_max_seq_length) 130 | decoder_max_seq_length = max(len(target_words), decoder_max_seq_length) 131 | 132 | encoder_input_data = pad_sequences(encoder_input_data, encoder_max_seq_length) 133 | 134 | decoder_target_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 135 | decoder_input_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 136 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 137 | for lineIdx, line in enumerate(lines[: min(NUM_SAMPLES, len(lines)-1)]): 138 | _, target_text = line.split('\t') 139 | target_text = 'START ' + target_text.lower() + ' END' 140 | target_words = [w for w in nltk.word_tokenize(target_text)] 141 | for idx, w in enumerate(target_words): 142 | w2idx = 0 # default [UNK] 143 | if w in target_word2idx: 144 | w2idx = target_word2idx[w] 145 | decoder_input_data[lineIdx, idx, w2idx] = 1 146 | if idx > 0: 147 | decoder_target_data[lineIdx, idx-1, w2idx] = 1 148 | 149 | context = dict() 150 | context['num_decoder_tokens'] = num_decoder_tokens 151 | context['encoder_max_seq_length'] = encoder_max_seq_length 152 | context['decoder_max_seq_length'] = decoder_max_seq_length 153 | 154 | np.save('models/eng-to-fra/eng-to-fra-glove-context.npy', context) 155 | 156 | encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') 157 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') 158 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) 159 | encoder_states = [encoder_state_h, encoder_state_c] 160 | 161 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 162 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') 163 | decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, 164 | initial_state=encoder_states) 165 | decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense') 166 | decoder_outputs = decoder_dense(decoder_outputs) 167 | 168 | model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 169 | 170 | 
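# Training uses teacher forcing: decoder_input_data feeds the target sequence
# into the decoder, while decoder_target_data is the same sequence shifted one
# timestep earlier (built with the idx > 0 offset above), so the decoder
# learns to predict the next word at every step.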
model.compile(loss='categorical_crossentropy', optimizer='rmsprop') 171 | 172 | json = model.to_json() 173 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 174 | 175 | checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 176 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 177 | verbose=1, validation_split=0.2, callbacks=[checkpoint]) 178 | 179 | model.save_weights(WEIGHT_FILE_PATH) 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /translator_train/eng_to_fra_word_translator_train.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.callbacks import ModelCheckpoint 3 | from keras.layers.recurrent import LSTM 4 | from keras.layers import Dense, Input, Embedding 5 | from keras.preprocessing.sequence import pad_sequences 6 | from collections import Counter 7 | import nltk 8 | import numpy as np 9 | 10 | BATCH_SIZE = 64 11 | NUM_EPOCHS = 100 12 | HIDDEN_UNITS = 256 13 | NUM_SAMPLES = 10000 14 | MAX_VOCAB_SIZE = 10000 15 | DATA_PATH = 'data/fra.txt' 16 | WEIGHT_FILE_PATH = 'models/eng-to-fra/eng-to-fra-word-weights.h5' 17 | ARCHITECTURE_FILE_PATH = 'models/eng-to-fra/eng-to-fra-word-architecture.json' 18 | 19 | input_counter = Counter() 20 | target_counter = Counter() 21 | 22 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 23 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 24 | input_text, target_text = line.split('\t') 25 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 26 | target_text = 'START ' + target_text.lower() + ' END' 27 | target_words = [w for w in nltk.word_tokenize(target_text)] 28 | for w in input_words: 29 | input_counter[w] += 1 30 | for w in target_words: 31 | target_counter[w] += 1 32 | 33 | input_word2idx = dict() 34 | target_word2idx = dict() 35 | for idx, word in enumerate(input_counter.most_common(MAX_VOCAB_SIZE)): 36 | input_word2idx[word[0]] = idx + 2 37 | for idx, word in enumerate(target_counter.most_common(MAX_VOCAB_SIZE)): 38 | target_word2idx[word[0]] = idx + 1 39 | 40 | input_word2idx['PAD'] = 0 41 | input_word2idx['UNK'] = 1 42 | target_word2idx['UNK'] = 0 43 | 44 | input_idx2word = dict([(idx, word) for word, idx in input_word2idx.items()]) 45 | target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()]) 46 | 47 | num_encoder_tokens = len(input_idx2word) 48 | num_decoder_tokens = len(target_idx2word) 49 | 50 | np.save('models/eng-to-fra/eng-to-fra-word-input-word2idx.npy', input_word2idx) 51 | np.save('models/eng-to-fra/eng-to-fra-word-input-idx2word.npy', input_idx2word) 52 | np.save('models/eng-to-fra/eng-to-fra-word-target-word2idx.npy', target_word2idx) 53 | np.save('models/eng-to-fra/eng-to-fra-word-target-idx2word.npy', target_idx2word) 54 | 55 | encoder_input_data = [] 56 | 57 | encoder_max_seq_length = 0 58 | decoder_max_seq_length = 0 59 | 60 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 61 | for line in lines[: min(NUM_SAMPLES, len(lines)-1)]: 62 | input_text, target_text = line.split('\t') 63 | target_text = 'START ' + target_text.lower() + ' END' 64 | input_words = [w for w in nltk.word_tokenize(input_text.lower())] 65 | target_words = [w for w in nltk.word_tokenize(target_text)] 66 | encoder_input_wids = [] 67 | for w in input_words: 68 | w2idx = 1 # default [UNK] 69 | if w in input_word2idx: 70 | w2idx = 
input_word2idx[w] 71 | encoder_input_wids.append(w2idx) 72 | 73 | encoder_input_data.append(encoder_input_wids) 74 | encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length) 75 | decoder_max_seq_length = max(len(target_words), decoder_max_seq_length) 76 | 77 | encoder_input_data = pad_sequences(encoder_input_data, encoder_max_seq_length) 78 | 79 | decoder_target_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 80 | decoder_input_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens)) 81 | lines = open(DATA_PATH, 'rt', encoding='utf8').read().split('\n') 82 | for lineIdx, line in enumerate(lines[: min(NUM_SAMPLES, len(lines)-1)]): 83 | _, target_text = line.split('\t') 84 | target_text = 'START ' + target_text.lower() + ' END' 85 | target_words = [w for w in nltk.word_tokenize(target_text)] 86 | for idx, w in enumerate(target_words): 87 | w2idx = 0 # default [UNK] 88 | if w in target_word2idx: 89 | w2idx = target_word2idx[w] 90 | decoder_input_data[lineIdx, idx, w2idx] = 1 91 | if idx > 0: 92 | decoder_target_data[lineIdx, idx-1, w2idx] = 1 93 | 94 | context = dict() 95 | context['num_encoder_tokens'] = num_encoder_tokens 96 | context['num_decoder_tokens'] = num_decoder_tokens 97 | context['encoder_max_seq_length'] = encoder_max_seq_length 98 | context['decoder_max_seq_length'] = decoder_max_seq_length 99 | 100 | np.save('models/eng-to-fra/eng-to-fra-word-context.npy', context) 101 | 102 | encoder_inputs = Input(shape=(None, ), name='encoder_inputs') 103 | encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=HIDDEN_UNITS, 104 | input_length=encoder_max_seq_length, name='encoder_embedding') 105 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm') 106 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) 107 | encoder_states = [encoder_state_h, encoder_state_c] 108 | 109 | decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs') 110 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm') 111 | decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, 112 | initial_state=encoder_states) 113 | decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense') 114 | decoder_outputs = decoder_dense(decoder_outputs) 115 | 116 | model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 117 | 118 | model.compile(loss='categorical_crossentropy', optimizer='rmsprop') 119 | 120 | json = model.to_json() 121 | open(ARCHITECTURE_FILE_PATH, 'w').write(json) 122 | 123 | checkpoint = ModelCheckpoint(filepath=WEIGHT_FILE_PATH, save_best_only=True) 124 | model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, 125 | verbose=1, validation_split=0.2, callbacks=[checkpoint]) 126 | 127 | model.save_weights(WEIGHT_FILE_PATH) 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 73], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, {"name": 
"decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 2640], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": {"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 2640, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-input-char2idx.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-input-char2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-input-idx2char.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-input-idx2char.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-target-char2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-target-char2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-target-idx2char.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-target-idx2char.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-char-weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-char-weights.h5 -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 100], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, {"name": "decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 2640], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": 
{"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 2640, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-glove-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-target-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-glove-target-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-target-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-glove-target-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-unknown-emb.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-glove-unknown-emb.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-glove-weights.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-glove-weights.h5 -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_embedding", "class_name": "Embedding", "config": {"name": "encoder_embedding", "trainable": true, "batch_input_shape": [null, 12], "dtype": "int32", "input_dim": 3557, "output_dim": 256, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 12}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 2640], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_embedding", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": {"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 2640, "activation": "softmax", "use_bias": true, 
"kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-input-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-input-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-input-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-input-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-target-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-target-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-target-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-target-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-cmn/eng-to-cmn-word-weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-cmn/eng-to-cmn-word-weights.h5 -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 71], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, 
{"name": "decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 93], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": {"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 93, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-input-char2idx.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-input-char2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-input-idx2char.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-input-idx2char.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-target-char2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-target-char2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-target-idx2char.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-target-idx2char.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-char-weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-char-weights.h5 -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 100], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, {"name": "decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 4655], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": 
{"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 4655, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-glove-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-target-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-glove-target-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-target-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-glove-target-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-unknown-emb.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-glove-unknown-emb.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-glove-weights.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-glove-weights.h5 -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-architecture.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "encoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null], "dtype": "float32", "sparse": false, "name": "encoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_embedding", "class_name": "Embedding", "config": {"name": "encoder_embedding", "trainable": true, "batch_input_shape": [null, 6], "dtype": "int32", "input_dim": 2183, "output_dim": 256, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 6}, "inbound_nodes": [[["encoder_inputs", 0, 0, {}]]]}, {"name": "decoder_inputs", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null, 4655], "dtype": "float32", "sparse": false, "name": "decoder_inputs"}, "inbound_nodes": []}, {"name": "encoder_lstm", "class_name": "LSTM", "config": {"name": "encoder_lstm", "trainable": true, "return_sequences": false, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["encoder_embedding", 0, 0, {}]]]}, {"name": "decoder_lstm", "class_name": "LSTM", "config": {"name": "decoder_lstm", "trainable": true, "return_sequences": true, "return_state": true, "go_backwards": false, "stateful": false, "unroll": false, "implementation": 0, "units": 256, "activation": "tanh", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0}, "inbound_nodes": [[["decoder_inputs", 0, 0, {}], ["encoder_lstm", 0, 1, {}], ["encoder_lstm", 0, 2, {}]]]}, {"name": "decoder_dense", "class_name": "Dense", "config": {"name": "decoder_dense", "trainable": true, "units": 4655, "activation": "softmax", "use_bias": true, 
"kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["decoder_lstm", 0, 0, {}]]]}], "input_layers": [["encoder_inputs", 0, 0], ["decoder_inputs", 0, 0]], "output_layers": [["decoder_dense", 0, 0]]}, "keras_version": "2.0.5", "backend": "tensorflow"} -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-context.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-context.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-input-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-input-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-input-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-input-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-target-idx2word.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-target-idx2word.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-target-word2idx.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-target-word2idx.npy -------------------------------------------------------------------------------- /translator_train/models/eng-to-fra/eng-to-fra-word-weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chen0040/keras-language-translator-web-api/06dc1d106e2293abaadd506992988a4a66b5eb78/translator_train/models/eng-to-fra/eng-to-fra-word-weights.h5 -------------------------------------------------------------------------------- /translator_train/worksheet.py: -------------------------------------------------------------------------------- 1 | import keras 2 | 3 | print(keras.__version__) -------------------------------------------------------------------------------- /translator_web/__init__.py: -------------------------------------------------------------------------------- 1 | from .flaskr import app 
-------------------------------------------------------------------------------- /translator_web/eng_to_cmn_char_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense 3 | import numpy as np 4 | 5 | HIDDEN_UNITS = 256 6 | 7 | 8 | class EngToCmnCharTranslator(object): 9 | model = None 10 | encoder_model = None 11 | decoder_model = None 12 | input_char2idx = None 13 | input_idx2char = None 14 | target_char2idx = None 15 | target_idx2char = None 16 | max_encoder_seq_length = None 17 | max_decoder_seq_length = None 18 | num_encoder_tokens = None 19 | num_decoder_tokens = None 20 | 21 | def __init__(self): 22 | self.input_char2idx = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-char-input-char2idx.npy').item() 23 | self.input_idx2char = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-char-input-idx2char.npy').item() 24 | self.target_char2idx = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-char-target-char2idx.npy').item() 25 | self.target_idx2char = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-char-target-idx2char.npy').item() 26 | context = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-char-context.npy').item() 27 | self.max_encoder_seq_length = context['max_encoder_seq_length'] 28 | self.max_decoder_seq_length = context['max_decoder_seq_length'] 29 | self.num_encoder_tokens = context['num_encoder_tokens'] 30 | self.num_decoder_tokens = context['num_decoder_tokens'] 31 | 32 | encoder_inputs = Input(shape=(None, self.num_encoder_tokens), name='encoder_inputs') 33 | encoder = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 34 | encoder_outputs, state_h, state_c = encoder(encoder_inputs) 35 | encoder_states = [state_h, state_c] 36 | 37 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 38 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 39 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 40 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 41 | decoder_outputs = decoder_dense(decoder_outputs) 42 | 43 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 44 | 45 | # model_json = open('../translator_train/models/eng-to-cmn/eng-to-cmn-char-architecture.json', 'r').read() 46 | # self.model = model_from_json(model_json) 47 | self.model.load_weights('../translator_train/models/eng-to-cmn/eng-to-cmn-char-weights.h5') 48 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 49 | 50 | self.encoder_model = Model(encoder_inputs, encoder_states) 51 | 52 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 53 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) 54 | decoder_states = [state_h, state_c] 55 | decoder_outputs = decoder_dense(decoder_outputs) 56 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 57 | 58 | def translate_lang(self, input_text): 59 | input_seq = np.zeros((1, self.max_encoder_seq_length, self.num_encoder_tokens)) 60 | for idx, char in enumerate(input_text): 61 | if char in self.input_char2idx: 62 | idx2 = self.input_char2idx[char] 63 | input_seq[0, idx, idx2] = 1 64 | states_value = self.encoder_model.predict(input_seq) 65 
| target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 66 | target_seq[0, 0, self.target_char2idx['\t']] = 1 67 | target_text = '' 68 | terminated = False 69 | while not terminated: 70 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 71 | 72 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 73 | sample_character = self.target_idx2char[sample_token_idx] 74 | target_text += sample_character 75 | 76 | if sample_character == '\n' or len(target_text) >= self.max_decoder_seq_length: 77 | terminated = True 78 | 79 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 80 | target_seq[0, 0, sample_token_idx] = 1 81 | states_value = [h, c] 82 | return target_text.strip() 83 | 84 | def test_run(self): 85 | print(self.translate_lang('Be nice.')) 86 | print(self.translate_lang('Drop it!')) 87 | print(self.translate_lang('No way!')) 88 | 89 | 90 | def main(): 91 | model = EngToCmnCharTranslator() 92 | model.test_run() 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /translator_web/eng_to_cmn_glove_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense, Embedding 3 | from keras.preprocessing.sequence import pad_sequences 4 | import nltk 5 | import numpy as np 6 | import os 7 | import sys 8 | import urllib.request 9 | import zipfile 10 | 11 | HIDDEN_UNITS = 256 12 | GLOVE_EMBEDDING_SIZE = 100 13 | 14 | VERY_LARGE_DATA_DIR_PATH = '../translator_train/very_large_data' 15 | MODEL_DIR_PATH = '../translator_train/models/eng-to-cmn' 16 | GLOVE_MODEL = VERY_LARGE_DATA_DIR_PATH + "/glove.6B." + str(GLOVE_EMBEDDING_SIZE) + "d.txt" 17 | WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,' 18 | 19 | 20 | def in_white_list(_word): 21 | for char in _word: 22 | if char in WHITELIST: 23 | return True 24 | 25 | return False 26 | 27 | 28 | def reporthook(block_num, block_size, total_size): 29 | read_so_far = block_num * block_size 30 | if total_size > 0: 31 | percent = read_so_far * 1e2 / total_size 32 | s = "\r%5.1f%% %*d / %d" % ( 33 | percent, len(str(total_size)), read_so_far, total_size) 34 | sys.stderr.write(s) 35 | if read_so_far >= total_size: # near the end 36 | sys.stderr.write("\n") 37 | else: # total size is unknown 38 | sys.stderr.write("read %d\n" % (read_so_far,)) 39 | 40 | 41 | def download_glove(): 42 | if not os.path.exists(GLOVE_MODEL): 43 | 44 | glove_zip = VERY_LARGE_DATA_DIR_PATH + '/glove.6B.zip' 45 | 46 | if not os.path.exists(VERY_LARGE_DATA_DIR_PATH): 47 | os.makedirs(VERY_LARGE_DATA_DIR_PATH) 48 | 49 | if not os.path.exists(glove_zip): 50 | print('glove file does not exist, downloading from internet') 51 | urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip, 52 | reporthook=reporthook) 53 | 54 | print('unzipping glove file') 55 | zip_ref = zipfile.ZipFile(glove_zip, 'r') 56 | zip_ref.extractall(VERY_LARGE_DATA_DIR_PATH) 57 | zip_ref.close() 58 | 59 | 60 | def load_glove(): 61 | download_glove() 62 | _word2em = {} 63 | file = open(GLOVE_MODEL, mode='rt', encoding='utf8') 64 | for line in file: 65 | words = line.strip().split() 66 | word = words[0] 67 | embeds = np.array(words[1:], dtype=np.float32) 68 | _word2em[word] = embeds 69 | file.close() 70 | return _word2em 71 | 72 | 73 | class EngToCmnGloveTranslator(object): 74 | model = None 75 | encoder_model = None 76 | decoder_model 
= None 77 | target_word2idx = None 78 | target_idx2word = None 79 | max_encoder_seq_length = None 80 | max_decoder_seq_length = None 81 | num_decoder_tokens = None 82 | word2em = None 83 | unknown_emb = None 84 | 85 | def __init__(self): 86 | self.word2em = load_glove() 87 | self.unknown_emb = np.load(MODEL_DIR_PATH + '/eng-to-cmn-glove-unknown-emb.npy') 88 | self.target_word2idx = np.load( 89 | MODEL_DIR_PATH + '/eng-to-cmn-glove-target-word2idx.npy').item() 90 | self.target_idx2word = np.load( 91 | MODEL_DIR_PATH + '/eng-to-cmn-glove-target-idx2word.npy').item() 92 | context = np.load(MODEL_DIR_PATH + '/eng-to-cmn-glove-context.npy').item() 93 | self.max_decoder_seq_length = context['decoder_max_seq_length'] 94 | self.max_encoder_seq_length = context['encoder_max_seq_length'] 95 | self.num_decoder_tokens = context['num_decoder_tokens'] 96 | 97 | encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') 98 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 99 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) 100 | encoder_states = [encoder_state_h, encoder_state_c] 101 | 102 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 103 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 104 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 105 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 106 | decoder_outputs = decoder_dense(decoder_outputs) 107 | 108 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 109 | 110 | self.model.load_weights(MODEL_DIR_PATH + '/eng-to-cmn-glove-weights.h5') 111 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 112 | 113 | self.encoder_model = Model(encoder_inputs, encoder_states) 114 | 115 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 116 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) 117 | decoder_states = [state_h, state_c] 118 | decoder_outputs = decoder_dense(decoder_outputs) 119 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 120 | 121 | def translate_lang(self, input_text): 122 | input_seq = [] 123 | input_wids = [] 124 | for word in nltk.word_tokenize(input_text.lower()): 125 | emb = self.unknown_emb 126 | if word in self.word2em: 127 | emb = self.word2em[word] 128 | input_wids.append(emb) 129 | input_seq.append(input_wids) 130 | input_seq = pad_sequences(input_seq, self.max_encoder_seq_length) 131 | states_value = self.encoder_model.predict(input_seq) 132 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 133 | target_seq[0, 0, self.target_word2idx['\t']] = 1 134 | target_text = '' 135 | terminated = False 136 | while not terminated: 137 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 138 | 139 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 140 | sample_word = self.target_idx2word[sample_token_idx] 141 | target_text += sample_word 142 | 143 | if sample_word == '\n' or len(target_text) >= self.max_decoder_seq_length: 144 | terminated = True 145 | 146 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 147 | target_seq[0, 0, sample_token_idx] = 1 148 | 149 | states_value = [h, c] 150 | return target_text.strip() 151 | 152 | def test_run(self): 153 | 
print(self.translate_lang('Be nice.')) 154 | print(self.translate_lang('Drop it!')) 155 | print(self.translate_lang('Get out!')) 156 | 157 | 158 | def main(): 159 | model = EngToCmnGloveTranslator() 160 | model.test_run() 161 | 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /translator_web/eng_to_cmn_word_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense, Embedding 3 | from keras.preprocessing.sequence import pad_sequences 4 | import nltk 5 | import numpy as np 6 | 7 | HIDDEN_UNITS = 256 8 | 9 | 10 | class EngToCmnWordTranslator(object): 11 | model = None 12 | encoder_model = None 13 | decoder_model = None 14 | input_word2idx = None 15 | input_idx2word = None 16 | target_word2idx = None 17 | target_idx2word = None 18 | max_encoder_seq_length = None 19 | max_decoder_seq_length = None 20 | num_encoder_tokens = None 21 | num_decoder_tokens = None 22 | 23 | def __init__(self): 24 | self.input_word2idx = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-word-input-word2idx.npy').item() 25 | self.input_idx2word = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-word-input-idx2word.npy').item() 26 | self.target_word2idx = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-word-target-word2idx.npy').item() 27 | self.target_idx2word = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-word-target-idx2word.npy').item() 28 | context = np.load('../translator_train/models/eng-to-cmn/eng-to-cmn-word-context.npy').item() 29 | self.max_encoder_seq_length = context['encoder_max_seq_length'] 30 | self.max_decoder_seq_length = context['decoder_max_seq_length'] 31 | self.num_encoder_tokens = context['num_encoder_tokens'] 32 | self.num_decoder_tokens = context['num_decoder_tokens'] 33 | 34 | encoder_inputs = Input(shape=(None, ), name='encoder_inputs') 35 | encoder_embedding = Embedding(input_dim=self.num_encoder_tokens, output_dim=HIDDEN_UNITS, 36 | input_length=self.max_encoder_seq_length, name='encoder_embedding') 37 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 38 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) 39 | encoder_states = [encoder_state_h, encoder_state_c] 40 | 41 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 42 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 43 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 44 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 45 | decoder_outputs = decoder_dense(decoder_outputs) 46 | 47 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 48 | 49 | # model_json = open('../translator_train/models/eng-to-cmn/eng-to-cmn-word-architecture.json', 'r').read() 50 | # self.model = model_from_json(model_json) 51 | self.model.load_weights('../translator_train/models/eng-to-cmn/eng-to-cmn-word-weights.h5') 52 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 53 | 54 | self.encoder_model = Model(encoder_inputs, encoder_states) 55 | 56 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 57 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, 
initial_state=decoder_state_inputs) 58 | decoder_states = [state_h, state_c] 59 | decoder_outputs = decoder_dense(decoder_outputs) 60 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 61 | 62 | def translate_lang(self, input_text): 63 | input_seq = [] 64 | input_wids = [] 65 | for word in nltk.word_tokenize(input_text.lower()): 66 | idx = 1 # default [UNK] 67 | if word in self.input_word2idx: 68 | idx = self.input_word2idx[word] 69 | input_wids.append(idx) 70 | input_seq.append(input_wids) 71 | input_seq = pad_sequences(input_seq, self.max_encoder_seq_length) 72 | states_value = self.encoder_model.predict(input_seq) 73 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 74 | target_seq[0, 0, self.target_word2idx['\t']] = 1 75 | target_text = '' 76 | terminated = False 77 | while not terminated: 78 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 79 | 80 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 81 | sample_word = self.target_idx2word[sample_token_idx] 82 | target_text += sample_word 83 | 84 | if sample_word == '\n' or len(target_text) >= self.max_decoder_seq_length: 85 | terminated = True 86 | 87 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 88 | target_seq[0, 0, sample_token_idx] = 1 89 | 90 | states_value = [h, c] 91 | return target_text.strip() 92 | 93 | def test_run(self): 94 | print(self.translate_lang('Be nice.')) 95 | print(self.translate_lang('Drop it!')) 96 | print(self.translate_lang('Get out!')) 97 | 98 | 99 | def main(): 100 | model = EngToCmnWordTranslator() 101 | model.test_run() 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /translator_web/eng_to_fra_char_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense 3 | import numpy as np 4 | 5 | HIDDEN_UNITS = 256 6 | 7 | 8 | class EngToFraCharTranslator(object): 9 | model = None 10 | encoder_model = None 11 | decoder_model = None 12 | input_char2idx = None 13 | input_idx2char = None 14 | target_char2idx = None 15 | target_idx2char = None 16 | max_encoder_seq_length = None 17 | max_decoder_seq_length = None 18 | num_encoder_tokens = None 19 | num_decoder_tokens = None 20 | 21 | def __init__(self): 22 | self.input_char2idx = np.load('../translator_train/models/eng-to-fra/eng-to-fra-char-input-char2idx.npy').item() 23 | self.input_idx2char = np.load('../translator_train/models/eng-to-fra/eng-to-fra-char-input-idx2char.npy').item() 24 | self.target_char2idx = np.load('../translator_train/models/eng-to-fra/eng-to-fra-char-target-char2idx.npy').item() 25 | self.target_idx2char = np.load('../translator_train/models/eng-to-fra/eng-to-fra-char-target-idx2char.npy').item() 26 | context = np.load('../translator_train/models/eng-to-fra/eng-to-fra-char-context.npy').item() 27 | self.max_encoder_seq_length = context['max_encoder_seq_length'] 28 | self.max_decoder_seq_length = context['max_decoder_seq_length'] 29 | self.num_encoder_tokens = context['num_encoder_tokens'] 30 | self.num_decoder_tokens = context['num_decoder_tokens'] 31 | 32 | encoder_inputs = Input(shape=(None, self.num_encoder_tokens), name='encoder_inputs') 33 | encoder = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 34 | encoder_outputs, state_h, state_c = encoder(encoder_inputs) 35 | encoder_states = 
[state_h, state_c] 36 | 37 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 38 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 39 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 40 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 41 | decoder_outputs = decoder_dense(decoder_outputs) 42 | 43 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 44 | 45 | # model_json = open('../translator_train/models/eng-to-fra/eng-to-fra-char-architecture.json', 'r').read() 46 | # self.model = model_from_json(model_json) 47 | self.model.load_weights('../translator_train/models/eng-to-fra/eng-to-fra-char-weights.h5') 48 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 49 | 50 | self.encoder_model = Model(encoder_inputs, encoder_states) 51 | 52 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 53 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) 54 | decoder_states = [state_h, state_c] 55 | decoder_outputs = decoder_dense(decoder_outputs) 56 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 57 | 58 | def translate_lang(self, input_text): 59 | input_seq = np.zeros((1, self.max_encoder_seq_length, self.num_encoder_tokens)) 60 | for idx, char in enumerate(input_text): 61 | if char in self.input_char2idx: 62 | idx2 = self.input_char2idx[char] 63 | input_seq[0, idx, idx2] = 1 64 | states_value = self.encoder_model.predict(input_seq) 65 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 66 | target_seq[0, 0, self.target_char2idx['\t']] = 1 67 | target_text = '' 68 | terminated = False 69 | while not terminated: 70 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 71 | 72 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 73 | sample_character = self.target_idx2char[sample_token_idx] 74 | target_text += sample_character 75 | 76 | if sample_character == '\n' or len(target_text) >= self.max_decoder_seq_length: 77 | terminated = True 78 | 79 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 80 | target_seq[0, 0, sample_token_idx] = 1 81 | states_value = [h, c] 82 | return target_text.strip() 83 | 84 | def test_run(self): 85 | print(self.translate_lang('Be nice.')) 86 | print(self.translate_lang('Drop it!')) 87 | print(self.translate_lang('Get out!')) 88 | 89 | 90 | def main(): 91 | model = EngToFraCharTranslator() 92 | model.test_run() 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /translator_web/eng_to_fra_glove_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense, Embedding 3 | from keras.preprocessing.sequence import pad_sequences 4 | import numpy as np 5 | import nltk 6 | import os 7 | import sys 8 | import urllib.request 9 | import zipfile 10 | 11 | HIDDEN_UNITS = 256 12 | GLOVE_EMBEDDING_SIZE = 100 13 | 14 | MODEL_DIR_PATH = '../translator_train/models/eng-to-fra' 15 | VERY_LARGE_DATA_DIR_PATH = '../translator_train/very_large_data' 16 | GLOVE_MODEL = VERY_LARGE_DATA_DIR_PATH + "/glove.6B." 
+ str(GLOVE_EMBEDDING_SIZE) + "d.txt" 17 | WHITELIST = 'abcdefghijklmnopqrstuvwxyz1234567890?.,' 18 | 19 | 20 | def in_white_list(_word): 21 | for char in _word: 22 | if char in WHITELIST: 23 | return True 24 | 25 | return False 26 | 27 | 28 | def reporthook(block_num, block_size, total_size): 29 | read_so_far = block_num * block_size 30 | if total_size > 0: 31 | percent = read_so_far * 1e2 / total_size 32 | s = "\r%5.1f%% %*d / %d" % ( 33 | percent, len(str(total_size)), read_so_far, total_size) 34 | sys.stderr.write(s) 35 | if read_so_far >= total_size: # near the end 36 | sys.stderr.write("\n") 37 | else: # total size is unknown 38 | sys.stderr.write("read %d\n" % (read_so_far,)) 39 | 40 | 41 | def download_glove(): 42 | if not os.path.exists(GLOVE_MODEL): 43 | 44 | glove_zip = VERY_LARGE_DATA_DIR_PATH + '//glove.6B.zip' 45 | 46 | if not os.path.exists(VERY_LARGE_DATA_DIR_PATH): 47 | os.makedirs(VERY_LARGE_DATA_DIR_PATH) 48 | 49 | if not os.path.exists(glove_zip): 50 | print('glove file does not exist, downloading from internet') 51 | urllib.request.urlretrieve(url='http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip, 52 | reporthook=reporthook) 53 | 54 | print('unzipping glove file') 55 | zip_ref = zipfile.ZipFile(glove_zip, 'r') 56 | zip_ref.extractall(VERY_LARGE_DATA_DIR_PATH) 57 | zip_ref.close() 58 | 59 | 60 | def load_glove(): 61 | download_glove() 62 | _word2em = {} 63 | file = open(GLOVE_MODEL, mode='rt', encoding='utf8') 64 | for line in file: 65 | words = line.strip().split() 66 | word = words[0] 67 | embeds = np.array(words[1:], dtype=np.float32) 68 | _word2em[word] = embeds 69 | file.close() 70 | return _word2em 71 | 72 | 73 | class EngToFraGloveTranslator(object): 74 | model = None 75 | encoder_model = None 76 | decoder_model = None 77 | target_word2idx = None 78 | target_idx2word = None 79 | max_encoder_seq_length = None 80 | max_decoder_seq_length = None 81 | num_decoder_tokens = None 82 | word2em = None 83 | unknown_emb = None 84 | 85 | def __init__(self): 86 | self.word2em = load_glove() 87 | self.unknown_emb = np.load(MODEL_DIR_PATH + '/eng-to-fra-glove-unknown-emb.npy') 88 | self.target_word2idx = np.load( 89 | MODEL_DIR_PATH + '/eng-to-fra-glove-target-word2idx.npy').item() 90 | self.target_idx2word = np.load( 91 | MODEL_DIR_PATH + '/eng-to-fra-glove-target-idx2word.npy').item() 92 | context = np.load(MODEL_DIR_PATH + '/eng-to-fra-glove-context.npy').item() 93 | print(context) 94 | self.max_encoder_seq_length = context['encoder_max_seq_length'] 95 | self.max_decoder_seq_length = context['decoder_max_seq_length'] 96 | self.num_decoder_tokens = context['num_decoder_tokens'] 97 | 98 | encoder_inputs = Input(shape=(None, GLOVE_EMBEDDING_SIZE), name='encoder_inputs') 99 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 100 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs) 101 | encoder_states = [encoder_state_h, encoder_state_c] 102 | 103 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 104 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 105 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 106 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 107 | decoder_outputs = decoder_dense(decoder_outputs) 108 | 109 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 110 | 111 | self.model.load_weights(MODEL_DIR_PATH 
+ '/eng-to-fra-glove-weights.h5') 112 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 113 | 114 | self.encoder_model = Model(encoder_inputs, encoder_states) 115 | 116 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 117 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) 118 | decoder_states = [state_h, state_c] 119 | decoder_outputs = decoder_dense(decoder_outputs) 120 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 121 | 122 | def translate_lang(self, input_text): 123 | input_seq = [] 124 | input_wids = [] 125 | for word in nltk.word_tokenize(input_text.lower()): 126 | emb = self.unknown_emb 127 | if word in self.word2em: 128 | emb = self.word2em[word] 129 | input_wids.append(emb) 130 | input_seq.append(input_wids) 131 | input_seq = pad_sequences(input_seq, self.max_encoder_seq_length) 132 | states_value = self.encoder_model.predict(input_seq) 133 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 134 | target_seq[0, 0, self.target_word2idx['START']] = 1 135 | target_text = '' 136 | terminated = False 137 | target_text_len = 0 138 | while not terminated: 139 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 140 | 141 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 142 | sample_word = self.target_idx2word[sample_token_idx] 143 | target_text_len += 1 144 | 145 | if sample_word != 'START' and sample_word != 'END': 146 | target_text += ' ' + sample_word 147 | 148 | if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length: 149 | terminated = True 150 | 151 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 152 | target_seq[0, 0, sample_token_idx] = 1 153 | states_value = [h, c] 154 | 155 | return target_text.strip() 156 | 157 | def test_run(self): 158 | print(self.translate_lang('Be nice.')) 159 | print(self.translate_lang('Drop it!')) 160 | print(self.translate_lang('Get out!')) 161 | 162 | 163 | def main(): 164 | model = EngToFraGloveTranslator() 165 | model.test_run() 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | -------------------------------------------------------------------------------- /translator_web/eng_to_fra_word_translator_predict.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model, model_from_json 2 | from keras.layers import Input, LSTM, Dense, Embedding 3 | from keras.preprocessing.sequence import pad_sequences 4 | import numpy as np 5 | import nltk 6 | 7 | HIDDEN_UNITS = 256 8 | 9 | 10 | class EngToFraWordTranslator(object): 11 | model = None 12 | encoder_model = None 13 | decoder_model = None 14 | input_word2idx = None 15 | input_idx2word = None 16 | target_word2idx = None 17 | target_idx2word = None 18 | max_encoder_seq_length = None 19 | max_decoder_seq_length = None 20 | num_encoder_tokens = None 21 | num_decoder_tokens = None 22 | 23 | def __init__(self): 24 | self.input_word2idx = np.load('../translator_train/models/eng-to-fra/eng-to-fra-word-input-word2idx.npy').item() 25 | self.input_idx2word = np.load('../translator_train/models/eng-to-fra/eng-to-fra-word-input-idx2word.npy').item() 26 | self.target_word2idx = np.load('../translator_train/models/eng-to-fra/eng-to-fra-word-target-word2idx.npy').item() 27 | self.target_idx2word = np.load('../translator_train/models/eng-to-fra/eng-to-fra-word-target-idx2word.npy').item() 28 | context =
np.load('../translator_train/models/eng-to-fra/eng-to-fra-word-context.npy').item() 29 | print(context) 30 | self.max_encoder_seq_length = context['encoder_max_seq_length'] 31 | self.max_decoder_seq_length = context['decoder_max_seq_length'] 32 | self.num_encoder_tokens = context['num_encoder_tokens'] 33 | self.num_decoder_tokens = context['num_decoder_tokens'] 34 | 35 | encoder_inputs = Input(shape=(None, ), name='encoder_inputs') 36 | encoder_embedding = Embedding(input_dim=self.num_encoder_tokens, output_dim=HIDDEN_UNITS, 37 | input_length=self.max_encoder_seq_length, name='encoder_embedding') 38 | encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name="encoder_lstm") 39 | encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding(encoder_inputs)) 40 | encoder_states = [encoder_state_h, encoder_state_c] 41 | 42 | decoder_inputs = Input(shape=(None, self.num_decoder_tokens), name='decoder_inputs') 43 | decoder_lstm = LSTM(units=HIDDEN_UNITS, return_sequences=True, return_state=True, name='decoder_lstm') 44 | decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) 45 | decoder_dense = Dense(self.num_decoder_tokens, activation='softmax', name='decoder_dense') 46 | decoder_outputs = decoder_dense(decoder_outputs) 47 | 48 | self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 49 | 50 | # model_json = open('../translator_train/models/eng-to-fra/eng-to-fra-word-architecture.json', 'r').read() 51 | # self.model = model_from_json(model_json) 52 | self.model.load_weights('../translator_train/models/eng-to-fra/eng-to-fra-word-weights.h5') 53 | self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 54 | 55 | self.encoder_model = Model(encoder_inputs, encoder_states) 56 | 57 | decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))] 58 | decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs) 59 | decoder_states = [state_h, state_c] 60 | decoder_outputs = decoder_dense(decoder_outputs) 61 | self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states) 62 | 63 | def translate_lang(self, input_text): 64 | input_seq = [] 65 | input_wids = [] 66 | for word in nltk.word_tokenize(input_text.lower()): 67 | idx = 1 68 | if word in self.input_word2idx: 69 | idx = self.input_word2idx[word] 70 | input_wids.append(idx) 71 | input_seq.append(input_wids) 72 | input_seq = pad_sequences(input_seq, self.max_encoder_seq_length) 73 | states_value = self.encoder_model.predict(input_seq) 74 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 75 | target_seq[0, 0, self.target_word2idx['START']] = 1 76 | target_text = '' 77 | terminated = False 78 | target_text_len = 0 79 | while not terminated: 80 | output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value) 81 | 82 | sample_token_idx = np.argmax(output_tokens[0, -1, :]) 83 | sample_word = self.target_idx2word[sample_token_idx] 84 | target_text_len += 1 85 | 86 | if sample_word != 'START' and sample_word != 'END': 87 | target_text += ' ' + sample_word 88 | 89 | if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length: 90 | terminated = True 91 | 92 | target_seq = np.zeros((1, 1, self.num_decoder_tokens)) 93 | target_seq[0, 0, sample_token_idx] = 1 94 | states_value = [h, c] 95 | 96 | return target_text.strip() 97 | 98 | def test_run(self): 99 | print(self.translate_lang('Be nice.')) 100 | print(self.translate_lang('Drop it!')) 
101 | print(self.translate_lang('Get out!')) 102 | 103 | 104 | def main(): 105 | model = EngToFraWordTranslator() 106 | model.test_run() 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /translator_web/flaskr.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, send_from_directory, redirect, render_template, flash, url_for, jsonify, \ 2 | make_response, abort 3 | from translator_web.eng_to_fra_char_translator_predict import EngToFraCharTranslator 4 | from translator_web.eng_to_cmn_char_translator_predict import EngToCmnCharTranslator 5 | from translator_web.eng_to_fra_word_translator_predict import EngToFraWordTranslator 6 | from translator_web.eng_to_cmn_word_translator_predict import EngToCmnWordTranslator 7 | from translator_web.eng_to_fra_glove_translator_predict import EngToFraGloveTranslator 8 | from translator_web.eng_to_cmn_glove_translator_predict import EngToCmnGloveTranslator 9 | 10 | app = Flask(__name__) 11 | app.config.from_object(__name__)  # load config from this file, flaskr.py 12 | app.config['SECRET_KEY'] = 'development key'  # flash() needs a session secret; FLASKR_SETTINGS below may override it 13 | # Load default config and override config from an environment variable 14 | app.config.from_envvar('FLASKR_SETTINGS', silent=True) 15 | app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 16 | 17 | eng_to_fra_translator_c = EngToFraCharTranslator() 18 | eng_to_cmn_translator_c = EngToCmnCharTranslator() 19 | eng_to_fra_translator_w = EngToFraWordTranslator() 20 | eng_to_cmn_translator_w = EngToCmnWordTranslator() 21 | eng_to_fra_translator_g = EngToFraGloveTranslator() 22 | eng_to_cmn_translator_g = EngToCmnGloveTranslator() 23 | 24 | 25 | @app.route('/') 26 | def home(): 27 | return render_template('home.html') 28 | 29 | 30 | @app.route('/about') 31 | def about(): 32 | return 'About Us' 33 | 34 | 35 | @app.route('/eng_to_fra_char_translator', methods=['POST', 'GET']) 36 | def eng_to_fra_char_translator(): 37 | if request.method == 'POST': 38 | if 'sentence' not in request.form: 39 | flash('No sentence posted') 40 | return redirect(request.url) 41 | elif request.form['sentence'] == '': 42 | flash('No sentence') 43 | return redirect(request.url) 44 | else: 45 | sent = request.form['sentence'] 46 | translated = eng_to_fra_translator_c.translate_lang(sent) 47 | return render_template('eng_to_fra_char_translator_result.html', sentence=sent, translated=translated) 48 | return render_template('eng_to_fra_char_translator.html') 49 | 50 | 51 | @app.route('/eng_to_cmn_char_translator', methods=['POST', 'GET']) 52 | def eng_to_cmn_char_translator(): 53 | if request.method == 'POST': 54 | if 'sentence' not in request.form: 55 | flash('No sentence posted') 56 | return redirect(request.url) 57 | elif request.form['sentence'] == '': 58 | flash('No sentence') 59 | return redirect(request.url) 60 | else: 61 | sent = request.form['sentence'] 62 | translated = eng_to_cmn_translator_c.translate_lang(sent) 63 | return render_template('eng_to_cmn_char_translator_result.html', sentence=sent, 64 | translated=translated) 65 | return render_template('eng_to_cmn_char_translator.html') 66 | 67 | 68 | @app.route('/eng_to_fra_word_translator', methods=['POST', 'GET']) 69 | def eng_to_fra_word_translator(): 70 | if request.method == 'POST': 71 | if 'sentence' not in request.form: 72 | flash('No sentence posted') 73 | return redirect(request.url) 74 | elif request.form['sentence'] == '': 75 | flash('No sentence') 76 | return redirect(request.url) 77 | else: 78 | sent = request.form['sentence'] 79 | translated
= eng_to_fra_translator_w.translate_lang(sent) 80 | return render_template('eng_to_fra_word_translator_result.html', sentence=sent, 81 | translated=translated) 82 | return render_template('eng_to_fra_word_translator.html') 83 | 84 | 85 | @app.route('/eng_to_cmn_word_translator', methods=['POST', 'GET']) 86 | def eng_to_cmn_word_translator(): 87 | if request.method == 'POST': 88 | if 'sentence' not in request.form: 89 | flash('No sentence posted') 90 | return redirect(request.url) 91 | elif request.form['sentence'] == '': 92 | flash('No sentence') 93 | return redirect(request.url) 94 | else: 95 | sent = request.form['sentence'] 96 | translated = eng_to_cmn_translator_w.translate_lang(sent) 97 | return render_template('eng_to_cmn_word_translator_result.html', sentence=sent, 98 | sentiments=translated)  # the result is a translation; the template still reads the 'sentiments' key 99 | return render_template('eng_to_cmn_word_translator.html') 100 | 101 | 102 | @app.route('/eng_to_fra_word_glove_translator', methods=['POST', 'GET']) 103 | def eng_to_fra_word_glove_translator(): 104 | if request.method == 'POST': 105 | if 'sentence' not in request.form: 106 | flash('No sentence posted') 107 | return redirect(request.url) 108 | elif request.form['sentence'] == '': 109 | flash('No sentence') 110 | return redirect(request.url) 111 | else: 112 | sent = request.form['sentence'] 113 | translated = eng_to_fra_translator_g.translate_lang(sent) 114 | return render_template('eng_to_fra_word_glove_translator_result.html', sentence=sent, 115 | translated=translated) 116 | return render_template('eng_to_fra_word_glove_translator.html') 117 | 118 | 119 | @app.route('/eng_to_cmn_word_glove_translator', methods=['POST', 'GET']) 120 | def eng_to_cmn_word_glove_translator(): 121 | if request.method == 'POST': 122 | if 'sentence' not in request.form: 123 | flash('No sentence posted') 124 | return redirect(request.url) 125 | elif request.form['sentence'] == '': 126 | flash('No sentence') 127 | return redirect(request.url) 128 | else: 129 | sent = request.form['sentence'] 130 | translated = eng_to_cmn_translator_g.translate_lang(sent) 131 | return render_template('eng_to_cmn_word_glove_translator_result.html', sentence=sent, 132 | sentiments=translated)  # the result is a translation; the template still reads the 'sentiments' key 133 | return render_template('eng_to_cmn_word_glove_translator.html') 134 | 135 | 136 | @app.route('/translate_eng', methods=['POST', 'GET']) 137 | def translate_eng(): 138 | if request.method == 'POST': 139 | if not request.json or 'sentence' not in request.json or 'level' not in request.json or 'target_lang' not in request.json: 140 | abort(400) 141 | sentence = request.json['sentence'] 142 | level = request.json['level'] 143 | target_lang = request.json['target_lang'] 144 | else: 145 | sentence = request.args.get('sentence') 146 | level = request.args.get('level') 147 | target_lang = request.args.get('target_lang') 148 | 149 | target_text = sentence 150 | if level == 'char' and target_lang == 'french': 151 | target_text = eng_to_fra_translator_c.translate_lang(sentence) 152 | elif level == 'char' and target_lang == 'chinese': 153 | target_text = eng_to_cmn_translator_c.translate_lang(sentence) 154 | elif level == 'word' and target_lang == 'french': 155 | target_text = eng_to_fra_translator_w.translate_lang(sentence) 156 | elif level == 'word' and target_lang == 'chinese': 157 | target_text = eng_to_cmn_translator_w.translate_lang(sentence) 158 | elif level == 'word-glove' and target_lang == 'french': 159 | target_text = eng_to_fra_translator_g.translate_lang(sentence) 160 | elif level == 'word-glove' and target_lang == 'chinese': 161 | target_text =
eng_to_cmn_translator_g.translate_lang(sentence) 162 | return jsonify({ 163 | 'sentence': sentence, 164 | 'translated': target_text, 165 | 'target_lang': target_lang, 166 | 'level': level 167 | }) 168 | 169 | 170 | @app.errorhandler(404) 171 | def not_found(error): 172 | return make_response(jsonify({'error': 'Not found'}), 404) 173 | 174 | 175 | def main(): 176 | eng_to_fra_translator_c.test_run() 177 | eng_to_cmn_translator_c.test_run() 178 | eng_to_fra_translator_w.test_run() 179 | eng_to_cmn_translator_w.test_run() 180 | eng_to_fra_translator_g.test_run() 181 | eng_to_cmn_translator_g.test_run() 182 | app.run(debug=True) 183 | 184 | 185 | if __name__ == '__main__': 186 | main() 187 | -------------------------------------------------------------------------------- /translator_web/static/style.css: -------------------------------------------------------------------------------- 1 | body { font-family: sans-serif; background: #eee; } 2 | a, h1, h2 { color: #377ba8; } 3 | h1, h2 { font-family: 'Georgia', serif; margin: 0; } 4 | h1 { border-bottom: 2px solid #eee; } 5 | h2 { font-size: 1.2em; } 6 | 7 | .page { margin: 2em auto; width: 35em; border: 5px solid #ccc; 8 | padding: 0.8em; background: white; } 9 | .entries { list-style: none; margin: 0; padding: 0; } 10 | .entries li { margin: 0.8em 1.2em; } 11 | .entries li h2 { margin-left: -1em; } 12 | .add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; } 13 | .add-entry dl { font-weight: bold; } 14 | .metanav { text-align: right; font-size: 0.8em; padding: 0.3em; 15 | margin-bottom: 1em; background: #fafafa; } 16 | .flash { background: #cee5F5; padding: 0.5em; 17 | border: 1px solid #aacbe2; } 18 | .error { background: #f0d6d6; padding: 0.5em; } -------------------------------------------------------------------------------- /translator_web/templates/eng_to_cmn_char_translator.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% block body %} 3 |
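The /translate_eng route above doubles as a JSON API: it accepts either a JSON POST body or GET query parameters carrying 'sentence', 'level' ('char', 'word', or 'word-glove'), and 'target_lang' ('french' or 'chinese'). Below is a minimal client sketch, assuming the dev server is running at Flask's default http://127.0.0.1:5000 and that the requests package is installed; the file name translate_client_example.py is hypothetical, not part of the repo.

# translate_client_example.py -- hypothetical client sketch for the
# /translate_eng JSON API; assumes the Flask dev server is running locally.
import requests

BASE_URL = 'http://127.0.0.1:5000'  # assumed local dev server address

# POST with a JSON body; the keys match what translate_eng() checks for.
resp = requests.post(BASE_URL + '/translate_eng', json={
    'sentence': 'Get out!',
    'level': 'char',
    'target_lang': 'french',
})
print(resp.json())  # {'sentence': ..., 'translated': ..., 'target_lang': ..., 'level': ...}

# The same route also answers GET requests with query parameters.
resp = requests.get(BASE_URL + '/translate_eng', params={
    'sentence': 'Get out!',
    'level': 'word-glove',
    'target_lang': 'chinese',
})
print(resp.json())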
--------------------------------------------------------------------------------
/translator_web/static/style.css:
--------------------------------------------------------------------------------
body { font-family: sans-serif; background: #eee; }
a, h1, h2 { color: #377ba8; }
h1, h2 { font-family: 'Georgia', serif; margin: 0; }
h1 { border-bottom: 2px solid #eee; }
h2 { font-size: 1.2em; }

.page { margin: 2em auto; width: 35em; border: 5px solid #ccc;
        padding: 0.8em; background: white; }
.entries { list-style: none; margin: 0; padding: 0; }
.entries li { margin: 0.8em 1.2em; }
.entries li h2 { margin-left: -1em; }
.add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; }
.add-entry dl { font-weight: bold; }
.metanav { text-align: right; font-size: 0.8em; padding: 0.3em;
           margin-bottom: 1em; background: #fafafa; }
.flash { background: #cee5F5; padding: 0.5em;
         border: 1px solid #aacbe2; }
.error { background: #f0d6d6; padding: 0.5em; }
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_char_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to Chinese (Char-Level)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_char_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_cmn_char_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_word_glove_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to Chinese (Word-Level GloVe Encoding)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_word_glove_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_cmn_word_glove_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_word_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to Chinese (Word-Level)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_cmn_word_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_cmn_word_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_char_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to French (Char-Level)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_char_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_fra_char_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_word_glove_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to French (Word-Level GloVe Encoding)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_word_glove_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_fra_word_glove_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_word_translator.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>Write something here in English to translate to French (Word-Level)</h3>
<form method="POST">
    Write something here: <input type="text" name="sentence"/>
    <input type="submit" value="Translate"/>
</form>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/eng_to_fra_word_translator_result.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<h3>You said: {{ sentence }}</h3>
<h3>Translated: {{ translated }}</h3>
<br/>
<a href="{{ url_for('eng_to_fra_word_translator') }}">Try another one</a>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/home.html:
--------------------------------------------------------------------------------
{% extends "layout.html" %}
{% block body %}
<ul class="entries">
    <li><a href="{{ url_for('eng_to_fra_char_translator') }}">English-to-French Translator (Char-Level)</a></li>
    <li><a href="{{ url_for('eng_to_cmn_char_translator') }}">English-to-Chinese Translator (Char-Level)</a></li>
    <li><a href="{{ url_for('eng_to_fra_word_translator') }}">English-to-French Translator (Word-Level)</a></li>
    <li><a href="{{ url_for('eng_to_cmn_word_translator') }}">English-to-Chinese Translator (Word-Level)</a></li>
    <li><a href="{{ url_for('eng_to_fra_word_glove_translator') }}">English-to-French Translator (Word-Level GloVe Encoding)</a></li>
    <li><a href="{{ url_for('eng_to_cmn_word_glove_translator') }}">English-to-Chinese Translator (Word-Level GloVe Encoding)</a></li>
</ul>
{% endblock %}
--------------------------------------------------------------------------------
/translator_web/templates/layout.html:
--------------------------------------------------------------------------------
<!doctype html>
<title>Flask Slingshot</title>
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
<div class="page">
    <h1>Language Translation</h1>
    <div class="metanav">
        <a href="/about">About</a>
    </div>
    {% for message in get_flashed_messages() %}
    <div class="flash">{{ message }}</div>
    {% endfor %}
    {% block body %}{% endblock %}
</div>
--------------------------------------------------------------------------------
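As a quick end-to-end check that the form routes, templates, and JSON API are wired together, the app can be exercised with Flask's built-in test client. Below is a sketch under stated assumptions: the file name smoke_test_example.py is hypothetical, and importing translator_web.flaskr loads the trained Keras models, so the .h5/.npy artifacts under translator_web/models must be present.

# smoke_test_example.py -- hypothetical smoke test using Flask's test client.
# Importing translator_web.flaskr instantiates the six translator objects,
# which load the trained model files from translator_web/models.
from translator_web.flaskr import app


def smoke_test():
    client = app.test_client()

    # GET renders the empty form page.
    assert client.get('/eng_to_fra_char_translator').status_code == 200

    # POST with the 'sentence' form field renders the result page.
    result = client.post('/eng_to_fra_char_translator', data={'sentence': 'Get out!'})
    assert result.status_code == 200

    # The JSON API also answers GET requests with query parameters.
    resp = client.get('/translate_eng', query_string={
        'sentence': 'Get out!', 'level': 'char', 'target_lang': 'french'})
    assert resp.status_code == 200


if __name__ == '__main__':
    smoke_test()
    print('All routes responded with 200.')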