├── .gitignore ├── README.md ├── bayesian-methods-for-ml-master ├── Coursera-BMML-Final-project │ ├── (OPTIONAL) Final project. Training VAE.ipynb │ ├── Coursera BMML, Final project.ipynb │ └── utils.py ├── Week 2 │ └── Programming assignment │ │ ├── Coursera-BMML_-week-2.ipynb │ │ └── grader.py ├── Week 4 │ └── Programming assignment │ │ ├── Week4. Practical Assignment. MCMC.ipynb │ │ └── grader.py ├── assignment 5 │ ├── CVAE.png │ ├── VAE.png │ ├── assignment.ipynb │ └── grader.py └── assignment 6 │ ├── Coursera+BMML%2C+week+6.ipynb │ └── grader.py ├── competitive-data-science ├── Programming assignment, week 1: Pandas basics │ ├── PandasBasics.ipynb │ └── grader.py ├── Programming assignment, week 2: Data leakages │ └── Data leakages.ipynb ├── Programming assignment, week 3: Mean encodings │ ├── Programming_assignment_week_3.ipynb │ └── grader.py ├── Programming assignment, week 4: Ensembles │ ├── Programming_assignment_week_4.ipynb │ └── grader.py ├── Programming assignment, week 4: KNN features │ ├── Untitled.ipynb │ ├── compute_KNN_features-Copy1.ipynb │ ├── compute_KNN_features.ipynb │ ├── grader.py │ └── test multiprocessing.ipynb ├── README.md ├── Reading materials │ ├── EDA_Springleaf_screencast.ipynb │ ├── EDA_video2.ipynb │ ├── EDA_video3_screencast.ipynb │ ├── GBM_drop_tree.ipynb │ ├── Hyperparameters_tuning_video2_RF_n_estimators.ipynb │ ├── Macros.ipynb │ ├── Metrics_video2_constants_for_MSE_and_MAE.ipynb │ ├── Metrics_video3_weighted_median.ipynb │ └── Metrics_video8_soft_kappa_xgboost.ipynb └── kaggle_project │ ├── Documentation.pdf │ └── README.md ├── intro-to-dle ├── README.md ├── download_resources.ipynb ├── download_utils.py ├── grading.py ├── keras_utils.py ├── misc │ └── np_convolution.py ├── week1 │ ├── kernel.png │ ├── sgd.png │ ├── target.npy │ ├── train.npy │ └── week01_pa.ipynb ├── week2 │ ├── Keras-task.ipynb │ ├── NumpyNN (honor).ipynb │ ├── Tensorflow-task.ipynb │ ├── datasets │ ├── matplotlib_utils.py │ ├── models │ ├── my1stNN_logreg.ipynb │ ├── my1stNN_mlp.ipynb │ ├── preprocessed_mnist.py │ ├── submit.py │ └── util.py ├── week3 │ ├── grading_utils.py │ ├── imagelabels.mat │ ├── images │ │ └── inceptionv3.png │ ├── week3_task1_first_cnn_cifar10_clean.ipynb │ ├── week3_task2_fine_tuning_clean.ipynb │ └── weights.p ├── week4 │ ├── Adversarial-task.ipynb │ ├── Autoencoders-task.ipynb │ ├── MiniGAN.ipynb │ ├── lfw_dataset.py │ └── submit.py ├── week5 │ ├── POS-task.ipynb │ ├── RNN-task.ipynb │ ├── data_copyright │ ├── names │ ├── rnn.png │ └── submit.py └── week6 │ ├── grading_utils.py │ ├── images │ ├── encoder_decoder.png │ ├── encoder_decoder_explained.png │ └── inceptionv3.png │ ├── utils.py │ └── week6_final_project_image_captioning_clean.ipynb ├── natural-language-processing ├── .gitignore ├── AWS-tutorial.md ├── Docker-tutorial.md ├── common │ ├── README.md │ └── download_utils.py ├── docker │ └── Dockerfile ├── honor │ ├── LSTM chatbot (character level).ipynb │ ├── LSTM chatbot (character level,tf).ipynb │ ├── LSTM chatbot (word level).ipynb │ ├── LSTM reply.ipynb │ ├── README.md │ ├── Untitled.ipynb │ ├── datasets.py │ ├── dialogue_manager.py │ ├── download_cornell.sh │ ├── download_opensubs.sh │ ├── example.py │ ├── main_bot.py │ ├── tfmodel.py │ └── utils.py ├── project │ ├── dialogue_manager.py │ ├── main_bot.py │ ├── utils.py │ └── week5-project.ipynb ├── week1 │ ├── grader.py │ ├── metrics.py │ └── week1-MultilabelClassification.ipynb ├── week2 │ ├── evaluation.py │ └── week2-NER.ipynb ├── week3 │ ├── grader.py │ ├── util.py │ └── week3-Embeddings.ipynb └── week4 │ ├── encoder-decoder-pic.png │ └── week4-seq2seq.ipynb └── reinforcement-learning ├── .gitignore ├── 01a-gym_interface.ipynb ├── 01b-crossentropy_method.ipynb ├── 01c-hons.ipynb ├── 02_qlearning.ipynb ├── 02a-practice_vi.ipynb ├── atari_util.py ├── dqn_atari.ipynb ├── experience_replay.ipynb ├── framebuffer.py ├── mdp.py ├── practice_approx_qlearning.ipynb ├── qlearning.py ├── replay_buffer.py ├── sarsa.ipynb ├── week5 ├── REINFORCE.ipynb ├── atari_util.py ├── practice_a3c.ipynb └── practice_reinforce.py └── week6 ├── bandits.ipynb ├── practice_mcts.ipynb └── seq2seq ├── basic_model_tf.py ├── practice_tf.ipynb └── voc.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | readonly 3 | .ipynb_checkpoints 4 | .DS_Store 5 | .idea 6 | *.h5 7 | *.tgz 8 | *.pickle 9 | *.zip 10 | *.index 11 | *.meta 12 | leakage 13 | submission.csv 14 | *.csv 15 | *.csv.gz 16 | *.dat 17 | *.txt 18 | *.npy 19 | *.npz 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | *.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | MANIFEST 46 | 47 | # PyInstaller 48 | # Usually these files are written by a python script from a template 49 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 50 | *.manifest 51 | *.spec 52 | 53 | # Installer logs 54 | pip-log.txt 55 | pip-delete-this-directory.txt 56 | 57 | # Unit test / coverage reports 58 | htmlcov/ 59 | .tox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | .hypothesis/ 67 | 68 | # Translations 69 | *.mo 70 | *.pot 71 | 72 | # Django stuff: 73 | *.log 74 | .static_storage/ 75 | .media/ 76 | local_settings.py 77 | 78 | # Flask stuff: 79 | instance/ 80 | .webassets-cache 81 | 82 | # Scrapy stuff: 83 | .scrapy 84 | 85 | # Sphinx documentation 86 | docs/_build/ 87 | 88 | # PyBuilder 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # pyenv 95 | .python-version 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | 125 | *.tsv 126 | *.png 127 | starSpaceModel 128 | natural-language-processing/honor/model/* 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced Machine Learning course 2 | 3 | This is my github repo for the AML specialisation offered by Yandex and HSE on coursera. 4 | 5 | ## Modules 6 | 1. [Intro to Deep Learning](https://www.coursera.org/learn/intro-to-deep-learning/) . The course covers the fundamentals of Deep Learning, from the basic ideas of overfitting and underfitting to state of the art CNN and RNN. 7 | 8 | - During the course, I coded a neural network in numpy which helped me understand how backprop really works. 9 | 10 | - The assignments are open ended, encouraging experimentation and trial and error, as it would be the case in a real world application. 11 | 12 | - The assignments have an interesting blend of numpy, keras, and tensorflow. This helps to think of these modules as tools in the same toolbox instead of isolated tools. 13 | 14 | - The final project is designing a **captioning neural network**, featuring both a CNN for feature extraction (Pretrained InceptionV3) and an RNN. It is trained on a set of (images, captions) and the network learns to caption any image (that resembles the training set, that is) 15 | 16 | 2. [Competitive Data Science](https://www.coursera.org/learn/competitive-data-science). The course covers exploratory data analysis, feature generation, and feature tuning and model validation, all taught by expert kaggle competition winners. 17 | - The course involved **participating in an actual competition**, and I ranked in the top 10% (Out of ~ 300 participants) 18 | - My final model was designed, trained, and ran in an Amazon AWS instance, and it included lagged features, mean encoded features, and features derived from item descriptions using PCA 19 | - The assignments involved a range of tasks, but were mostly to build understanding of the actual goals involved in the competition 20 | 3. [Bayesian Methods for Machine Learning](https://www.coursera.org/learn/bayesian-methods-in-machine-learning). The course builds upon preexisting understanding of ML methods, and places them in the context of Bayesian statistics. 21 | - Many key concepts are covered: conjugate priors, latent variable modes, gaussian mixtures, expectation maximisation, variational inference, latent dirichlet allocation, MCMC with Gibbs and Metropolis-Hastings sampling, variational autoencoders, and bayesian optimization 22 | - The final project involved designing an **algorithm to help a user generate faces with certain properties from a variational autoencoder**: Initially I show the user different faces, then the user is progressively shown faces and asking to rate them. Using these values and GPyOpt, the code adjusts the latent variables of the VAE to approach the face the user wants. 23 | 24 | 4. [Ǹatural Language Processing](https://www.coursera.org/learn/language-processing). The course covers a variety of NLP approaches and concepts, including the basics such as lemmatising or bag of words, to word embeddings, and then in terms of modelling it covers Hidden Markov Models and finally neural network based models. 25 | - The final project was to design a simple chatbot that could either answer technical questions (by replying with a relevant answer from stack overflow, or could just chit chat. The network itself was implemented in Tensorflow following a character level seq2seq approach. 26 | -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/Coursera-BMML-Final-project/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | import GPy 5 | import GPyOpt 6 | import tensorflow as tf 7 | import keras 8 | from keras.layers import Input, Dense, Lambda, InputLayer, concatenate, Activation, Flatten, Reshape 9 | from keras.layers.normalization import BatchNormalization 10 | from keras.layers.convolutional import Conv2D, Deconv2D 11 | from keras.losses import MSE 12 | from keras.models import Model, Sequential 13 | from keras import backend as K 14 | from keras import metrics 15 | from keras.datasets import mnist 16 | from keras.utils import np_utils 17 | from tensorflow.python.framework import ops 18 | from tensorflow.python.framework import dtypes 19 | import os 20 | 21 | 22 | 23 | 24 | class CelebA: 25 | def __init__(self, path, sess, train=True, batch_size=32, height=218, width=178, channels=3, threads=1, file_type='.jpg'): 26 | image_filenames = [os.path.join(path, img) for img in os.listdir(path) if img.endswith(file_type)] 27 | if train: 28 | image_filenames = image_filenames[:-5000] 29 | else: 30 | image_filenames = image_filenames[-5000:] 31 | all_images = ops.convert_to_tensor(image_filenames, dtype=dtypes.string) 32 | input_queue = tf.train.slice_input_producer([image_filenames], shuffle=False) 33 | file_content = tf.read_file(input_queue[0]) 34 | image = tf.image.decode_jpeg(file_content, channels=3) 35 | image.set_shape([height, width, channels]) 36 | image_cropped = image[45:-45, 25:-25] 37 | image_cropped = tf.image.resize_images(image_cropped, (64, 64)) 38 | batch = tf.train.batch([image_cropped], batch_size=batch_size, num_threads=threads) 39 | self.batch = tf.cast(batch, tf.float32)/256 40 | self.n_batches = len(image_filenames) // batch_size 41 | self.sess = sess 42 | 43 | def __iter__(self): 44 | return self 45 | 46 | def __next__(self): 47 | x = self.sess.run(self.batch) 48 | return x, x, None 49 | 50 | def next(self): 51 | return self.__next__() 52 | 53 | def create_encoder(input_dims, base_filters=64, layers=4, latent=512): 54 | w = input_dims[0]//2**layers 55 | h = input_dims[1]//2**layers 56 | c = base_filters*2**(layers-1) 57 | encoder = Sequential() 58 | encoder.add(InputLayer(input_dims)) 59 | for i in range(layers): 60 | encoder.add(Conv2D(filters=base_filters*2**i, kernel_size=(5, 5), strides=(2, 2), padding='same', bias=False)) 61 | encoder.add(BatchNormalization(axis=3)) 62 | encoder.add(Activation(K.relu)) 63 | encoder.add(Reshape([w*h*c])) 64 | encoder.add(Dense(latent*2)) 65 | return encoder 66 | 67 | def create_decoder(output_dims, base_filters=64, layers=4, latent=512): 68 | w = output_dims[0]//2**layers 69 | h = output_dims[1]//2**layers 70 | c = base_filters*2**(layers-1) 71 | decoder = Sequential() 72 | decoder.add(InputLayer([latent])) 73 | decoder.add(Dense(w*h*c)) 74 | decoder.add(Reshape([w, h, c])) 75 | for i in range(layers-1, 0, -1): 76 | decoder.add(Deconv2D(filters=base_filters*2**i, kernel_size=(5, 5), strides=(2, 2), padding='same', bias=False)) 77 | decoder.add(BatchNormalization(axis=3)) 78 | decoder.add(Activation(K.relu)) 79 | decoder.add(Deconv2D(filters=3, kernel_size=(5, 5), strides=(2, 2), padding='same')) 80 | return decoder 81 | 82 | def sample(mean_log_var): 83 | mean, log_var = mean_log_var 84 | eps_shape = mean.get_shape() 85 | epsilon = K.random_normal(shape=eps_shape) 86 | z = epsilon*K.exp(log_var/2)+mean 87 | return z 88 | 89 | def create_vae(batch_size, base_filters=64, latent=8, 90 | image_size=64, learning_rate=0.001, 91 | reconstruction_weight=1000, layers=4): 92 | ''' 93 | Constructs VAE model with given parameters. 94 | :param batch_size: size of a batch (used for placeholder) 95 | :param base_filters: number of filters after first layer. Other layers will double this number 96 | :param latent: latent space dimension 97 | :param image_size: size of input image 98 | Returns compiled Keras model along with encoder and decoder 99 | ''' 100 | if isinstance(image_size, int): 101 | image_size = (image_size, image_size) 102 | x = Input(batch_shape=(batch_size, image_size[0], image_size[1], 3)) 103 | encoder = create_encoder([image_size[0], image_size[1], 3], base_filters=base_filters, latent=latent, layers=layers) 104 | decoder = create_decoder([image_size[0], image_size[1], 3], base_filters=base_filters, latent=latent, layers=layers) 105 | mean_log_var = encoder(x) 106 | mean_size = mean_log_var.shape[1]//2 107 | mean = Lambda(lambda h: h[:, :mean_size])(mean_log_var) 108 | log_var = Lambda(lambda h: h[:, mean_size:])(mean_log_var) 109 | z = Lambda(sample)([mean, log_var]) 110 | reconstruction = decoder(z) 111 | loss_reconstruction = K.mean(metrics.mean_squared_error(x, reconstruction)) 112 | loss_KL = - K.mean(0.5 * K.sum(1 + log_var - K.square(mean) - K.exp(log_var), axis=1)) 113 | loss = reconstruction_weight*loss_reconstruction + loss_KL 114 | 115 | vae = Model(x, reconstruction) 116 | vae.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss=lambda x, y: loss) 117 | return vae, encoder, decoder -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/Week 2/Programming assignment/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '3ivnq3n_EeexdQ4iFFMrvA' 10 | self.parts = OrderedDict([ 11 | ('H3evn', 'Task 1 (E-step)'), 12 | ('uD8jo', 'Task 2 (M-step: mu)'), 13 | ('zFWgm', 'Task 2 (M-step: sigma)'), 14 | ('gTUuu', 'Task 2 (M-step: pi)'), 15 | ('0ZlqN', 'Task 3 (VLB)'), 16 | ('Olbrx', 'Task 4 (EM)')]) 17 | self.answers = {key: None for key in self.parts} 18 | 19 | @staticmethod 20 | def ravel_output(output): 21 | ''' 22 | If student accedentally submitted np.array with one 23 | element instead of number, this function will submit 24 | this number instead 25 | ''' 26 | if isinstance(output, np.ndarray) and output.size == 1: 27 | output = output.item(0) 28 | return output 29 | 30 | def submit(self, email, token): 31 | submission = { 32 | "assignmentKey": self.assignment_key, 33 | "submitterEmail": email, 34 | "secret": token, 35 | "parts": {} 36 | } 37 | for part, output in self.answers.items(): 38 | if output is not None: 39 | submission["parts"][part] = {"output": output} 40 | else: 41 | submission["parts"][part] = dict() 42 | request = requests.post(self.submission_page, data=json.dumps(submission)) 43 | response = request.json() 44 | if request.status_code == 201: 45 | print('Submitted to Coursera platform. See results on assignment page!') 46 | elif u'details' in response and u'learnerMessage' in response[u'details']: 47 | print(response[u'details'][u'learnerMessage']) 48 | else: 49 | print("Unknown response from Coursera: {}".format(request.status_code)) 50 | print(response) 51 | 52 | def status(self): 53 | print("You want to submit these numbers:") 54 | for part_id, part_name in self.parts.items(): 55 | answer = self.answers[part_id] 56 | if answer is None: 57 | answer = '-'*10 58 | print("Task {}: {}".format(part_name, answer)) 59 | 60 | def submit_part(self, part, output): 61 | self.answers[part] = output 62 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 63 | 64 | def submit_e_step(self, output): 65 | self.submit_part('H3evn', str(self.ravel_output(output[9, 1]))) 66 | 67 | def submit_m_step(self, pi, mu, sigma): 68 | self.submit_part('uD8jo', str(self.ravel_output(mu[1, 1]))) 69 | self.submit_part('zFWgm', str(self.ravel_output(sigma[1, 1, 1]))) 70 | self.submit_part('gTUuu', str(self.ravel_output(pi[1]))) 71 | 72 | def submit_VLB(self, loss): 73 | self.submit_part('0ZlqN', str(self.ravel_output(loss))) 74 | 75 | def submit_EM(self, best_loss): 76 | self.submit_part('Olbrx', str(self.ravel_output(best_loss))) -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/Week 4/Programming assignment/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'u85FqY8sEee5cg635EOBeA' 10 | self.parts = OrderedDict([ 11 | ('pn017', '1.1 (Alice trajectory)'), 12 | ('UUbsF', '1.1 (Bob trajectory)'), 13 | ('FFmXD', '1.2 (Alice mean)'), 14 | ('uWPFR', '1.2 (Bob mean)'), 15 | ('nkkem', '1.3 (Bob and Alice prices correlation)'), 16 | ('dyuVW', '1.4 (depends on the random data or not)'), 17 | ('r1VVR', '2.1 (MAP for age coef)'), 18 | ('5wFjO', '2.1 (MAP for aducation coef)'), 19 | ('sn9Lu', '2.2 (credible interval lower bound)'), 20 | ('JHRF9', '2.2 (credible interval upper bound)'), 21 | ('0StUi', '2.3 (does the data suggest gender discrimination?)'), 22 | ]) 23 | self.answers = {key: None for key in self.parts} 24 | 25 | @staticmethod 26 | def ravel_output(output): 27 | ''' 28 | If student accedentally submitted np.array with one 29 | element instead of number, this function will submit 30 | this number instead 31 | ''' 32 | if isinstance(output, np.ndarray) and output.size == 1: 33 | output = output.item(0) 34 | return output 35 | 36 | def submit(self, email, token): 37 | submission = { 38 | "assignmentKey": self.assignment_key, 39 | "submitterEmail": email, 40 | "secret": token, 41 | "parts": {} 42 | } 43 | for part, output in self.answers.items(): 44 | if output is not None: 45 | submission["parts"][part] = {"output": output} 46 | else: 47 | submission["parts"][part] = dict() 48 | request = requests.post(self.submission_page, data=json.dumps(submission)) 49 | response = request.json() 50 | if request.status_code == 201: 51 | print('Submitted to Coursera platform. See results on assignment page!') 52 | elif u'details' in response and u'learnerMessage' in response[u'details']: 53 | print(response[u'details'][u'learnerMessage']) 54 | else: 55 | print("Unknown response from Coursera: {}".format(request.status_code)) 56 | print(response) 57 | 58 | def status(self): 59 | print("You want to submit these numbers:") 60 | for part_id, part_name in self.parts.items(): 61 | answer = self.answers[part_id] 62 | if answer is None: 63 | answer = '-'*10 64 | print("Task {}: {}".format(part_name, answer)) 65 | 66 | def submit_part(self, part, output): 67 | self.answers[part] = output 68 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 69 | 70 | def submit_simulation_trajectory(self, alice_trajectory, bob_trajectory): 71 | self.submit_part('pn017', '{} {}'.format( 72 | self.ravel_output(alice_trajectory[0]), self.ravel_output(alice_trajectory[1]) 73 | )) 74 | self.submit_part('UUbsF', '{} {}'.format( 75 | self.ravel_output(bob_trajectory[0]), self.ravel_output(bob_trajectory[1]) 76 | )) 77 | 78 | def submit_simulation_mean(self, alice_price, bob_price): 79 | self.submit_part('FFmXD', str(self.ravel_output(alice_price))) 80 | self.submit_part('uWPFR', str(self.ravel_output(bob_price))) 81 | 82 | def submit_simulation_correlation(self, alice_bob_correlation): 83 | self.submit_part('nkkem', str(self.ravel_output(alice_bob_correlation))) 84 | 85 | def submit_simulation_depends(self, answer): 86 | self.submit_part('dyuVW', answer) 87 | 88 | def submit_pymc_map_estimates(self, beta_age_coefficient, beta_education_coefficient): 89 | self.submit_part('r1VVR', str(self.ravel_output(beta_age_coefficient))) 90 | self.submit_part('5wFjO', str(self.ravel_output(beta_education_coefficient))) 91 | 92 | def submit_pymc_odds_ratio_interval(self, odds_ratio_lower_bound, odds_ratio_upper_bound): 93 | self.submit_part('sn9Lu', str(self.ravel_output(odds_ratio_lower_bound))) 94 | self.submit_part('JHRF9', str(self.ravel_output(odds_ratio_upper_bound))) 95 | 96 | def submit_is_there_discrimination(self, answer): 97 | self.submit_part('0StUi', answer) 98 | 99 | -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/assignment 5/CVAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/bayesian-methods-for-ml-master/assignment 5/CVAE.png -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/assignment 5/VAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/bayesian-methods-for-ml-master/assignment 5/VAE.png -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/assignment 5/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | from keras.layers import Input 6 | import tensorflow as tf 7 | from keras.datasets import mnist 8 | 9 | class Grader(object): 10 | def __init__(self): 11 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 12 | self.assignment_key = 'Pf_j7noDEeexdQ4iFFMrvA' 13 | self.parts = OrderedDict([('S66Mi', '1 (vlb)'), 14 | ('dXfpy', '2.1 (samples mean)'), 15 | ('U1gJG', '2.2 (samples var)'), 16 | ('NRPCA', '3 (best val loss)'), 17 | ('JEmpp', '4.1 (hallucinating mean)'), 18 | ('3K3IB', '4.2 (hallucinating var)'), 19 | ('tYD01', '5.1 (conditional hallucinating mean)'), 20 | ('CaofU', '5.2 (conditional hallucinating var)'),]) 21 | self.answers = {key: None for key in self.parts} 22 | 23 | @staticmethod 24 | def ravel_output(output): 25 | ''' 26 | If student accedentally submitted np.array with one 27 | element instead of number, this function will submit 28 | this number instead 29 | ''' 30 | if isinstance(output, np.ndarray) and output.size == 1: 31 | output = output.item(0) 32 | return output 33 | 34 | def submit(self, email, token): 35 | submission = { 36 | "assignmentKey": self.assignment_key, 37 | "submitterEmail": email, 38 | "secret": token, 39 | "parts": {} 40 | } 41 | for part, output in self.answers.items(): 42 | if output is not None: 43 | submission["parts"][part] = {"output": output} 44 | else: 45 | submission["parts"][part] = dict() 46 | request = requests.post(self.submission_page, data=json.dumps(submission)) 47 | response = request.json() 48 | if request.status_code == 201: 49 | print('Submitted to Coursera platform. See results on assignment page!') 50 | elif u'details' in response and u'learnerMessage' in response[u'details']: 51 | print(response[u'details'][u'learnerMessage']) 52 | else: 53 | print("Unknown response from Coursera: {}".format(request.status_code)) 54 | print(response) 55 | 56 | def status(self): 57 | print("You want to submit these numbers:") 58 | for part_id, part_name in self.parts.items(): 59 | answer = self.answers[part_id] 60 | if answer is None: 61 | answer = '-'*10 62 | print("Task {}: {}".format(part_name, answer)) 63 | 64 | def submit_part(self, part, output): 65 | self.answers[part] = output 66 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 67 | 68 | def submit_vlb(self, sess, vlb_binomial): 69 | test_data = np.load('test_data.npz') 70 | my_x = Input(batch_shape=(100, 784)) 71 | my_x_decoded = Input(batch_shape=(100, 784)) 72 | my_t_mean = Input(batch_shape=(100, 2)) 73 | my_t_log_var = Input(batch_shape=(100, 2)) 74 | loss = vlb_binomial(my_x, my_x_decoded, my_t_mean, my_t_log_var) 75 | try: 76 | output = sess.run(loss, feed_dict={my_x: test_data['x'], my_x_decoded: test_data['x_decoded_mean'], 77 | my_t_mean: test_data['t_mean'], my_t_log_var: test_data['t_log_var']}) 78 | except Exception as e: 79 | print('Sorry, we were not able to run the provided code in `sess`.') 80 | raise e 81 | self.submit_part('S66Mi', str(self.ravel_output(output))) 82 | 83 | def submit_samples(self, sess, sampling): 84 | test_data = np.load('test_data.npz') 85 | my_t_mean = tf.tile(test_data['t_mean'][:1, :], [10000, 1]) 86 | my_t_log_var = tf.tile(test_data['t_log_var'][:1, :], [10000, 1]) 87 | samples = sampling([my_t_mean, my_t_log_var]) 88 | try: 89 | samples = sess.run(samples) 90 | except Exception as e: 91 | print('Sorry, we were not able to run the provided code in `sess`.') 92 | raise e 93 | mean = np.mean(samples, axis=0)[1] 94 | var = np.var(samples, axis=0)[1] 95 | self.submit_part('dXfpy', str(self.ravel_output(mean))) 96 | self.submit_part('U1gJG', str(self.ravel_output(var))) 97 | 98 | def submit_best_val_loss(self, hist): 99 | self.submit_part('NRPCA', str(self.ravel_output(hist.history['val_loss'][-1]))) 100 | 101 | def submit_hallucinating(self, sess, sampled_im_mean): 102 | try: 103 | imgs = sess.run(sampled_im_mean) 104 | except Exception as e: 105 | print('Sorry, we were not able to run the provided code in `sess`.') 106 | raise e 107 | self.submit_part('JEmpp', str(self.ravel_output(np.mean(imgs)))) 108 | var_per_channel = np.var(imgs, axis=0) 109 | self.submit_part('3K3IB', str(self.ravel_output(np.max(var_per_channel)))) 110 | 111 | def submit_conditional_hallucinating(self, sess, conditional_sampled_im_mean): 112 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 113 | x_train = x_train.astype('float32') / 255. 114 | x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:]))) 115 | 116 | baseline = np.zeros((10, 784)) 117 | for i in range(10): 118 | idx = y_train == i 119 | baseline[i, :] = np.mean(x_train[idx, :], axis=0) 120 | baseline_repeated = np.repeat(baseline, 5, axis=0) 121 | 122 | try: 123 | imgs = sess.run(conditional_sampled_im_mean) 124 | except Exception as e: 125 | print('Sorry, we were not able to run the provided code in `sess`.') 126 | raise e 127 | 128 | diff = np.abs(imgs - baseline_repeated) 129 | self.submit_part('tYD01', str(self.ravel_output(np.mean(diff)))) 130 | var_per_channel = np.var(diff, axis=0) 131 | self.submit_part('CaofU', str(self.ravel_output(np.max(var_per_channel)))) 132 | -------------------------------------------------------------------------------- /bayesian-methods-for-ml-master/assignment 6/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'ZJzC93UJEeesww5LLQnVZg' 10 | self.parts = OrderedDict([('P8Xj7', '1.1'), 11 | ('sYdjs', '1.2 (mean)'), 12 | ('Mjy6R', '1.2 (variance)'), 13 | ('Wif7t', '1.3'), 14 | ('V9yZN', '1.4 (noise)'), 15 | ('s4es0', '1.4 (just signal)'), 16 | ('ckZSh', '1.5'), 17 | ('1Jngf', '2.1'), 18 | ('CBiGW', '2.2')]) 19 | self.answers = {key: None for key in self.parts} 20 | 21 | @staticmethod 22 | def ravel_output(output): 23 | ''' 24 | If student accedentally submitted np.array with one 25 | element instead of number, this function will submit 26 | this number instead 27 | ''' 28 | if isinstance(output, np.ndarray) and output.size == 1: 29 | output = output.item(0) 30 | return output 31 | 32 | def submit(self, email, token): 33 | submission = { 34 | "assignmentKey": self.assignment_key, 35 | "submitterEmail": email, 36 | "secret": token, 37 | "parts": {} 38 | } 39 | for part, output in self.answers.items(): 40 | if output is not None: 41 | submission["parts"][part] = {"output": output} 42 | else: 43 | submission["parts"][part] = dict() 44 | request = requests.post(self.submission_page, data=json.dumps(submission)) 45 | response = request.json() 46 | if request.status_code == 201: 47 | print('Submitted to Coursera platform. See results on assignment page!') 48 | elif u'details' in response and u'learnerMessage' in response[u'details']: 49 | print(response[u'details'][u'learnerMessage']) 50 | else: 51 | print("Unknown response from Coursera: {}".format(request.status_code)) 52 | print(response) 53 | 54 | def status(self): 55 | print("You want to submit these numbers:") 56 | for part_id, part_name in self.parts.items(): 57 | answer = self.answers[part_id] 58 | if answer is None: 59 | answer = '-'*10 60 | print("Task {}: {}".format(part_name, answer)) 61 | 62 | def submit_part(self, part, output): 63 | self.answers[part] = output 64 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 65 | 66 | def submit_GPy_1(self, output): 67 | self.submit_part('P8Xj7', str(self.ravel_output(output))) 68 | 69 | def submit_GPy_2(self, mean, var): 70 | self.submit_part('sYdjs', str(self.ravel_output(mean))) 71 | self.submit_part('Mjy6R', str(self.ravel_output(var))) 72 | 73 | def submit_GPy_3(self, output): 74 | self.submit_part('Wif7t', str(self.ravel_output(output))) 75 | 76 | def submit_GPy_4(self, noise, just_signal): 77 | self.submit_part('V9yZN', str(self.ravel_output(noise))) 78 | self.submit_part('s4es0', str(self.ravel_output(just_signal))) 79 | 80 | def submit_GPy_5(self, output): 81 | self.submit_part('ckZSh', str(self.ravel_output(output))) 82 | 83 | def submit_GPyOpt_1(self, output): 84 | self.submit_part('1Jngf', str(self.ravel_output(output))) 85 | 86 | def submit_GPyOpt_2(self, output): 87 | self.submit_part('CBiGW', str(self.ravel_output(output))) -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 1: Pandas basics/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-3 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'S1UqVXp-EeelpgpYPAO2Og' 26 | self.parts = OrderedDict([ 27 | ('edAEq', 'max_revenue'), 28 | ('Xn0Ec', 'category_id_with_max_revenue'), 29 | ('CZDVZ', 'num_items_constant_price'), 30 | ('HlAjc', 'total_num_items_sold_var')]) 31 | self.answers = {key: None for key in self.parts} 32 | 33 | @staticmethod 34 | def ravel_output(output): 35 | ''' 36 | If student accedentally submitted np.array with one 37 | element instead of number, this function will submit 38 | this number instead 39 | ''' 40 | if isinstance(output, np.ndarray) and output.size == 1: 41 | output = output.item(0) 42 | return output 43 | 44 | def submit(self, email, token): 45 | submission = { 46 | "assignmentKey": self.assignment_key, 47 | "submitterEmail": email, 48 | "secret": token, 49 | "parts": {} 50 | } 51 | for part, output in self.answers.items(): 52 | if output is not None: 53 | submission["parts"][part] = {"output": output} 54 | else: 55 | submission["parts"][part] = dict() 56 | request = requests.post(self.submission_page, data=json.dumps(submission)) 57 | response = request.json() 58 | if request.status_code == 201: 59 | print('Submitted to Coursera platform. See results on assignment page!') 60 | elif u'details' in response and u'learnerMessage' in response[u'details']: 61 | print(response[u'details'][u'learnerMessage']) 62 | else: 63 | print("Unknown response from Coursera: {}".format(request.status_code)) 64 | print(response) 65 | 66 | def status(self): 67 | print("You want to submit these numbers:") 68 | for part_id, part_name in self.parts.items(): 69 | answer = self.answers[part_id] 70 | if answer is None: 71 | answer = '-'*10 72 | print("Task {}: {}".format(part_name, answer)) 73 | 74 | def submit_part(self, part, output): 75 | self.answers[part] = output 76 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 77 | 78 | def submit_tag(self, tag, output): 79 | part_id = [k for k, v in self.parts.items() if v == tag] 80 | if len(part_id)!=1: 81 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 82 | part_id = part_id[0] 83 | self.submit_part(part_id, str(self.ravel_output(output))) -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 3: Mean encodings/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-5 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'JVyZjZIaEeeXtQpjLCk-0A' 26 | self.parts = OrderedDict([ 27 | ('9zPRY', 'KFold_scheme'), 28 | ('xEf0Q', 'Leave-one-out_scheme'), 29 | ('zuMqo', 'Smoothing_scheme'), 30 | ('RNfnI', 'Expanding_mean_scheme')]) 31 | self.answers = {key: None for key in self.parts} 32 | 33 | @staticmethod 34 | def ravel_output(output): 35 | ''' 36 | If student accedentally submitted np.array with one 37 | element instead of number, this function will submit 38 | this number instead 39 | ''' 40 | if isinstance(output, np.ndarray) and output.size == 1: 41 | output = output.item(0) 42 | return output 43 | 44 | def submit(self, email, token): 45 | submission = { 46 | "assignmentKey": self.assignment_key, 47 | "submitterEmail": email, 48 | "secret": token, 49 | "parts": {} 50 | } 51 | for part, output in self.answers.items(): 52 | if output is not None: 53 | submission["parts"][part] = {"output": output} 54 | else: 55 | submission["parts"][part] = dict() 56 | request = requests.post(self.submission_page, data=json.dumps(submission)) 57 | response = request.json() 58 | if request.status_code == 201: 59 | print('Submitted to Coursera platform. See results on assignment page!') 60 | elif u'details' in response and u'learnerMessage' in response[u'details']: 61 | print(response[u'details'][u'learnerMessage']) 62 | else: 63 | print("Unknown response from Coursera: {}".format(request.status_code)) 64 | print(response) 65 | 66 | def status(self): 67 | print("You want to submit these numbers:") 68 | for part_id, part_name in self.parts.items(): 69 | answer = self.answers[part_id] 70 | if answer is None: 71 | answer = '-'*10 72 | print("Task {}: {}".format(part_name, answer)) 73 | 74 | def submit_part(self, part, output): 75 | self.answers[part] = output 76 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 77 | 78 | def submit_tag(self, tag, output): 79 | part_id = [k for k, v in self.parts.items() if v == tag] 80 | if len(part_id)!=1: 81 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 82 | part_id = part_id[0] 83 | self.submit_part(part_id, str(self.ravel_output(output))) -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 4: Ensembles/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-5 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'Lhay-55JEeet3xIBvGMumA' 26 | self.parts = OrderedDict([ 27 | ('EyiFH', 'best_alpha'), 28 | ('XH82R', 'r2_train_simple_mix'), 29 | ('BHeRs', 'r2_test_simple_mix'), 30 | ('MkwCS', 'r2_train_stacking'), 31 | ('j4Adb', 'r2_test_stacking'), 32 | ]) 33 | self.answers = {key: None for key in self.parts} 34 | 35 | @staticmethod 36 | def ravel_output(output): 37 | ''' 38 | If student accedentally submitted np.array with one 39 | element instead of number, this function will submit 40 | this number instead 41 | ''' 42 | if isinstance(output, np.ndarray) and output.size == 1: 43 | output = output.item(0) 44 | return output 45 | 46 | def submit(self, email, token): 47 | submission = { 48 | "assignmentKey": self.assignment_key, 49 | "submitterEmail": email, 50 | "secret": token, 51 | "parts": {} 52 | } 53 | for part, output in self.answers.items(): 54 | if output is not None: 55 | submission["parts"][part] = {"output": output} 56 | else: 57 | submission["parts"][part] = dict() 58 | request = requests.post(self.submission_page, data=json.dumps(submission)) 59 | response = request.json() 60 | if request.status_code == 201: 61 | print('Submitted to Coursera platform. See results on assignment page!') 62 | elif u'details' in response and u'learnerMessage' in response[u'details']: 63 | print(response[u'details'][u'learnerMessage']) 64 | else: 65 | print("Unknown response from Coursera: {}".format(request.status_code)) 66 | print(response) 67 | 68 | def status(self): 69 | print("You want to submit these numbers:") 70 | for part_id, part_name in self.parts.items(): 71 | answer = self.answers[part_id] 72 | if answer is None: 73 | answer = '-'*10 74 | print("Task {}: {}".format(part_name, answer)) 75 | 76 | def submit_part(self, part, output): 77 | self.answers[part] = output 78 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 79 | 80 | def submit_tag(self, tag, output): 81 | part_id = [k for k, v in self.parts.items() if v == tag] 82 | if len(part_id)!=1: 83 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 84 | part_id = part_id[0] 85 | self.submit_part(part_id, str(self.ravel_output(output))) -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 4: KNN features/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 18, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "1" 21 | ] 22 | }, 23 | "execution_count": 18, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "neighs_y=np.array([0,1,2])\n", 30 | "diffs = np.insert(np.diff(neighs_y), 0, 999)\n", 31 | "feats = np.unique(neighs_y[diffs == 0], return_counts=True)\n", 32 | "if len(feats[0])==0:\n", 33 | " feats=1\n", 34 | "else:\n", 35 | " feats=feats[1].max() + 1\n", 36 | "feats" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 51, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([2])" 48 | ] 49 | }, 50 | "execution_count": 51, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "x=np.array([0,0,3,4,0])\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 89, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "def func(x):\n", 66 | " x=np.array(x)\n", 67 | " if len(x)==1:\n", 68 | " return 1\n", 69 | " suma=np.where(x==x[0],0,1)\n", 70 | " return suma.cumsum()[np.argwhere(suma)[0]]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 94, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "ename": "AssertionError", 80 | "evalue": "", 81 | "output_type": "error", 82 | "traceback": [ 83 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 84 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 85 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 86 | "\u001b[0;31mAssertionError\u001b[0m: " 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "assert func([0])==1\n", 92 | "assert func([0,1,1])==1\n", 93 | "assert func([0,1,0,1])==1\n", 94 | "assert func([1,1,2,2,1])==2" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 136, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "2" 106 | ] 107 | }, 108 | "execution_count": 136, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "x=np.array([0,0,1,0,1])\n", 115 | "suma=np.where(x==x[0],1,0)\n", 116 | "np.equal(suma.cumsum(),np.arange(1,6)).sum()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 127, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "array([1, 2, 3, 3, 3])" 128 | ] 129 | }, 130 | "execution_count": 127, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "cs" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 119, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "array([2])" 148 | ] 149 | }, 150 | "execution_count": 119, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "np.argwhere(np.diff(cs)).ravel()" 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.6.3" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 2 181 | } 182 | -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 4: KNN features/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-3 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'r2N4iqFlEeeRFQqEddeEzg' 26 | self.parts = OrderedDict([ 27 | ('1O8kU', 'statistic')]) 28 | self.answers = {key: None for key in self.parts} 29 | 30 | @staticmethod 31 | def ravel_output(output): 32 | ''' 33 | If student accedentally submitted np.array with one 34 | element instead of number, this function will submit 35 | this number instead 36 | ''' 37 | if isinstance(output, np.ndarray) and output.size == 1: 38 | output = output.item(0) 39 | return output 40 | 41 | def submit(self, email, token): 42 | submission = { 43 | "assignmentKey": self.assignment_key, 44 | "submitterEmail": email, 45 | "secret": token, 46 | "parts": {} 47 | } 48 | for part, output in self.answers.items(): 49 | if output is not None: 50 | submission["parts"][part] = {"output": output} 51 | else: 52 | submission["parts"][part] = dict() 53 | request = requests.post(self.submission_page, data=json.dumps(submission)) 54 | response = request.json() 55 | if request.status_code == 201: 56 | print('Submitted to Coursera platform. See results on assignment page!') 57 | elif u'details' in response and u'learnerMessage' in response[u'details']: 58 | print(response[u'details'][u'learnerMessage']) 59 | else: 60 | print("Unknown response from Coursera: {}".format(request.status_code)) 61 | print(response) 62 | 63 | def status(self): 64 | print("You want to submit these numbers:") 65 | for part_id, part_name in self.parts.items(): 66 | answer = self.answers[part_id] 67 | if answer is None: 68 | answer = '-'*10 69 | print("Task {}: {}".format(part_name, answer)) 70 | 71 | def submit_part(self, part, output): 72 | self.answers[part] = output 73 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 74 | 75 | def submit_tag(self, tag, output): 76 | part_id = [k for k, v in self.parts.items() if v == tag] 77 | if len(part_id)!=1: 78 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 79 | part_id = part_id[0] 80 | self.submit_part(part_id, str(self.ravel_output(output))) -------------------------------------------------------------------------------- /competitive-data-science/Programming assignment, week 4: KNN features/test multiprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from multiprocessing import Pool" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 12, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "def f2(x):\n", 19 | " for i in range(100000000):\n", 20 | " x+=i\n", 21 | " return x" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "[4999999950000001, 4999999950000002, 4999999950000003, 4999999950000005, 4999999950000006, 4999999950000007]\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "if __name__ == '__main__':\n", 39 | " p = Pool(processes=7)\n", 40 | " print(p.map(f2, [1, 2, 3,5,6,7]))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 20, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import numpy as np" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 26, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "[ 0. 0.33333333 0.33333333 0. 0.33333333 0. 0.\n", 62 | " 0. 0. 0. ]\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "neighs_y=[1,2,4,4,1,2,3]\n", 68 | "classes=10\n", 69 | "feats = np.bincount(neighs_y[:3],minlength=classes)\n", 70 | " \n", 71 | "feats = feats / feats.sum()\n", 72 | "print(feats)\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 52, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "array([999, 1, 2, 0, -3, 1, 1, 0, 0])" 84 | ] 85 | }, 86 | "execution_count": 52, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 67, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "3" 102 | ] 103 | }, 104 | "execution_count": 67, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "x=np.array([1,2,4,4,1,2,3,3,3])\n", 111 | "diffs=np.insert(np.diff(x),0,999)\n", 112 | "\n", 113 | "np.unique(x[diffs==0],return_counts=True)[1].max()+1\n" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.3" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /competitive-data-science/README.md: -------------------------------------------------------------------------------- 1 | ## Materials for "How to Win a Data Science Competition: Learn from Top Kagglers" course 2 | 3 | This repository contains programming assignments notebooks for the [course](https://www.coursera.org/learn/competitive-data-science/home/welcome) about competitive data science. 4 | -------------------------------------------------------------------------------- /competitive-data-science/Reading materials/Macros.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Macros" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook shows how to use *macros* commands in Jupyter.\n", 15 | "\n", 16 | "What is *macro*? It is just a named code snippet. Similarly to functions, we can use macros to wrap frequently used code. For example, we can define a macro, that will load all the libraries for us.\n", 17 | "\n", 18 | "### Step 1: Define macro \n", 19 | "\n", 20 | "To save some code as a macro we need to put that code in a cell and run it. " 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "The libraries have been loaded!\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import pandas as pd \n", 39 | "from tqdm import tqdm_notebook\n", 40 | "import os\n", 41 | "import sys\n", 42 | "import os.path\n", 43 | "\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "import matplotlib as mpl\n", 46 | "from matplotlib import rc\n", 47 | "from cycler import cycler\n", 48 | "%matplotlib inline\n", 49 | "\n", 50 | " \n", 51 | "mpl.rcParams['axes.prop_cycle'] = cycler('color', ['#ff0000', '#0000ff', '#00ffff','#ffA300', '#00ff00', \n", 52 | " '#ff00ff', '#990000', '#009999', '#999900', '#009900', '#009999'])\n", 53 | "\n", 54 | "rc('font', size=16)\n", 55 | "rc('font',**{'family':'serif','serif':['Computer Modern']})\n", 56 | "rc('text', usetex=False)\n", 57 | "rc('figure', figsize=(12, 10))\n", 58 | "rc('axes', linewidth=.5)\n", 59 | "rc('lines', linewidth=1.75)\n", 60 | "\n", 61 | "print('The libraries have been loaded!')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "Now you need to remember the number inside squre brackets of `In []`. Now, to save the code, in that cell you need to use macro magic:\n", 69 | "\n", 70 | "```\n", 71 | "%macro __imp \n", 72 | "```" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 2, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "%macro -q __imp 1" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Now try it!" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "The libraries have been loaded!\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "__imp" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Step 2: save macro\n", 115 | "\n", 116 | "To this end we've only created a macro, but it will be lost, when the kernel is restarted. We need to somehow store it, so than we can load it easily later. In can be done with `%store` macro." 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Stored '__imp' (Macro)\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "%store __imp" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Now `__imp` is saved in a kind of Jupyter's global memory. You can list all the stored variables like that:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 5, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Stored variables and their in-db values:\n", 153 | "__imp -> IPython.macro.Macro(\"import numpy as np\\nimport pa\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "%store" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Now **restart the kernel** and get back to this cell without running the previous ones. To run the stored macro you need to retrieve the macro first with the following line: " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 1, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "%store -r __imp" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "And only then call the macro:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 2, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "The libraries have been loaded!\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "__imp" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Step 3: auto restore macro" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "So you need to use as many as 2 cells! But, fortunately, Jupyer can load the stored variables (and macros) automatically. To enable it you need to update you `.ipython_profile` [config](http://ipython.readthedocs.io/en/stable/development/config.html). If you've never heared of it, then it is not yet created, otherwise you should know where it lives. \n", 215 | "\n", 216 | "On Coursera's notebooks we will create it here: `~/.ipython/profile_default/ipython_profile.py` and notify the ipython, that we want it to automatically restore stored variables.\n", 217 | "\n", 218 | "```\n", 219 | "c.StoreMagics.autorestore = True\n", 220 | "```" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 4, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "c = get_config()\r\n", 233 | "c.StoreMagics.autorestore = True\r\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "!echo \"c = get_config()\\nc.StoreMagics.autorestore = True\" > ~/.ipython/profile_default/ipython_config.py\n", 239 | "!cat ~/.ipython/profile_default/ipython_config.py" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "That's it! Now **restart your notebook (kernel)** and **define and store macro** again (step 1 and first code cell from step 2). And finally, to test it, **restart the kernel** again. Now you can immediately access `__imp` macro, so that all the libraries are loaded with a 5 char line of code." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 1, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "The libraries have been loaded!\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "__imp" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "kernelspec": { 269 | "display_name": "Python 3", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.6.0" 284 | } 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 1 288 | } 289 | -------------------------------------------------------------------------------- /competitive-data-science/Reading materials/Metrics_video8_soft_kappa_xgboost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Soft Kappa objective" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook you can find an implementation for \"soft kappa\" loss and objective from [this paper](https://arxiv.org/abs/1509.07107). " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "def soft_kappa_grad_hess(y, p):\n", 26 | " '''\n", 27 | " Returns first and second derivatives of the objective with respect to predictions `p`. \n", 28 | " `y` is a vector of corresponding target labels. \n", 29 | " '''\n", 30 | " norm = p.dot(p) + y.dot(y)\n", 31 | " \n", 32 | " grad = -2 * y / norm + 4 * p * np.dot(y, p) / (norm ** 2)\n", 33 | " hess = 8 * p * y / (norm ** 2) + 4 * np.dot(y, p) / (norm ** 2) - (16 * p ** 2 * np.dot(y, p)) / (norm ** 3)\n", 34 | " return grad, hess" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "def soft_kappa(preds, dtrain):\n", 46 | " '''\n", 47 | " Having predictions `preds` and targets `dtrain.get_label()` this function coumputes soft kappa loss.\n", 48 | " NOTE, that it assumes `mean(target) = 0`.\n", 49 | " \n", 50 | " '''\n", 51 | " target = dtrain.get_label()\n", 52 | " return 'kappa' , -2 * target.dot(preds) / (target.dot(target) + preds.dot(preds))" 53 | ] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.6.0" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 1 77 | } 78 | -------------------------------------------------------------------------------- /competitive-data-science/kaggle_project/Documentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/competitive-data-science/kaggle_project/Documentation.pdf -------------------------------------------------------------------------------- /competitive-data-science/kaggle_project/README.md: -------------------------------------------------------------------------------- 1 | # The solution is password protected to avoid spoiling the competition for others :-) 2 | 3 | # How to generate the solution 4 | 5 | Work through the FEAT_ notebooks to generate the required features. Note that the cells that generate the .csv.gz are commented out. Uncomment them if you do want the files. 6 | 7 | Then work through MODEL_final to train the models. **Do not blindly run the notebooks!** 8 | Throughout the MODEL notebook there are a series of checkpoints that will save the progress so far (To h5 or pickle). This is intended to be used if you don’t have a lot of RAM. You can just work up to that point, restart the notebook, run Cell 1 to import packages, then scroll down, and reload what you just saved, to wipeout unwanted memory. 9 | 10 | In the model notebook, you will first have to train the models once on the training set (This generates the ALT_*_TRAIN) files, and then the ALT_MODEL files, which are the final models. These final models are included. The stacked model tuning is done using the TRAIN models, but at the end you will train the meta-model on the full models. 11 | 12 | Finally, to predict run the Predict notebook **(again, not blindly).** Optionally you can try to zero out some predictions uncommenting one of the final lines, but my best score was achieved with the results “as they are”. 13 | 14 | The final solution should score slightly below 0.95, achieving 10/10 in the grader. 15 | -------------------------------------------------------------------------------- /intro-to-dle/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Deep Learning course resources 2 | https://www.coursera.org/learn/intro-to-deep-learning 3 | 4 | ## Offline instructions 5 | Coursera Jupyter Environment can be slow if many learners use it heavily. 6 | Our tasks are compute-heavy and we recommend to run them on your hardware for optimal performance. 7 | 8 | You will need a computer with at least 4GB of RAM. 9 | 10 | There're two options to setup the Jupyter Notebooks locally: Docker container and Anaconda. 11 | 12 | ### Docker container option (best for Mac/Linux) 13 | 14 | Follow the instructions on https://hub.docker.com/r/zimovnov/coursera-aml-docker/ to install Docker container with all necessary software installed. 15 | 16 | After that you should see a Jupyter page in your browser. 17 | 18 | ### Anaconda option (best for Windows) 19 | We highly recommend to install docker environment, but if it's not an option, 20 | you can try to install the necessary python modules with Anaconda. 21 | 22 | First, install Anaconda with **Python 3.5+** from [here](https://www.anaconda.com/download). 23 | 24 | Download `conda_requirements.txt` from [here](https://github.com/ZEMUSHKA/coursera-aml-docker/blob/master/conda_requirements.txt). 25 | 26 | Open terminal on Mac/Linux or "Anaconda Prompt" in Start Menu on Windows and run: 27 | ``` 28 | conda config --append channels conda-forge 29 | conda config --append channels menpo 30 | conda install --yes --file conda_requirements.txt 31 | ``` 32 | 33 | To start Jupyter Notebooks run `jupyter notebook` on Mac/Linux or "Jupyter Notebook" in Start Menu on Windows. 34 | 35 | After that you should see a Jupyter page in your browser. 36 | 37 | ### Prepare resources inside Jupyter Notebooks (for local setups only) 38 | 39 | Click **New -> Terminal** and execute: `git clone https://github.com/hse-aml/intro-to-dl.git` 40 | On Windows you might want to install [Git](https://git-scm.com/download/win). 41 | You can also download all the resources as zip archive from GitHub page. 42 | 43 | Close the terminal and refresh Jupyter page, you will see **intro-to-dl** folder, go there, 44 | all the necessary notebooks are waiting for you. 45 | 46 | First you need to download necessary resources, to do that open `download_resources.ipynb` 47 | and run cells for Keras and your week. 48 | 49 | Now you can open a notebook for the corresponding week and work there just like in Coursera Jupyter Environment. -------------------------------------------------------------------------------- /intro-to-dle/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | import tqdm 6 | import requests 7 | 8 | 9 | def download_file(url, file_path): 10 | r = requests.get(url, stream=True) 11 | total_size = int(r.headers.get('content-length')) 12 | try: 13 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 14 | bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True) 15 | bar.set_description(os.path.split(file_path)[-1]) 16 | for chunk in r.iter_content(32 * 1024): 17 | f.write(chunk) 18 | bar.update(len(chunk)) 19 | except Exception: 20 | print("Download failed") 21 | finally: 22 | if os.path.getsize(file_path) != total_size: 23 | os.remove(file_path) 24 | print("Removed incomplete download") 25 | 26 | 27 | def download_from_github(version, fn, target_dir): 28 | url = "https://github.com/hse-aml/intro-to-dl/releases/download/{0}/{1}".format(version, fn) 29 | file_path = os.path.join(target_dir, fn) 30 | download_file(url, file_path) 31 | 32 | 33 | def sequential_downloader(version, fns, target_dir): 34 | os.makedirs(target_dir, exist_ok=True) 35 | for fn in fns: 36 | download_from_github(version, fn, target_dir) 37 | 38 | 39 | def link_all_files_from_dir(src_dir, dst_dir): 40 | os.makedirs(dst_dir, exist_ok=True) 41 | for fn in os.listdir(src_dir): 42 | src_file = os.path.join(src_dir, fn) 43 | dst_file = os.path.join(dst_dir, fn) 44 | if os.name == "nt": 45 | shutil.copyfile(src_file, dst_file) 46 | else: 47 | if not os.path.exists(dst_file): 48 | os.symlink(os.path.abspath(src_file), dst_file) 49 | 50 | 51 | def link_all_keras_resources(): 52 | link_all_files_from_dir("../readonly/keras/datasets/", os.path.expanduser("~/.keras/datasets")) 53 | link_all_files_from_dir("../readonly/keras/models/", os.path.expanduser("~/.keras/models")) 54 | 55 | 56 | def link_week_3_resources(): 57 | link_all_files_from_dir("../readonly/week3/", ".") 58 | 59 | 60 | def link_week_4_resources(): 61 | link_all_files_from_dir("../readonly/week4/", ".") 62 | 63 | 64 | def link_week_6_resources(): 65 | link_all_files_from_dir("../readonly/week6/", ".") 66 | -------------------------------------------------------------------------------- /intro-to-dle/grading.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | import requests 5 | import json 6 | 7 | 8 | class Grader(object): 9 | def __init__(self, assignment_key, all_parts=()): 10 | """ 11 | Assignment key is the way to tell Coursera which problem is being submitted. 12 | """ 13 | self.submission_page = \ 14 | 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 15 | self.assignment_key = assignment_key 16 | self.answers = {part: None for part in all_parts} 17 | 18 | def submit(self, email, token): 19 | submission = { 20 | "assignmentKey": self.assignment_key, 21 | "submitterEmail": email, 22 | "secret": token, 23 | "parts": {} 24 | } 25 | for part, output in self.answers.items(): 26 | if output is not None: 27 | submission["parts"][part] = {"output": output} 28 | else: 29 | submission["parts"][part] = dict() 30 | request = requests.post(self.submission_page, data=json.dumps(submission)) 31 | response = request.json() 32 | if request.status_code == 201: 33 | print('Submitted to Coursera platform. See results on assignment page!') 34 | elif u'details' in response and u'learnerMessage' in response[u'details']: 35 | print(response[u'details'][u'learnerMessage']) 36 | else: 37 | print("Unknown response from Coursera: {}".format(request.status_code)) 38 | print(response) 39 | 40 | def set_answer(self, part, answer): 41 | """Adds an answer for submission. Answer is expected either as string, number, or 42 | an iterable of numbers. 43 | Args: 44 | part - str, assignment part id 45 | answer - answer to submit. If non iterable, appends repr(answer). If string, 46 | is appended as provided. If an iterable and not string, converted to 47 | space-delimited repr() of members. 48 | """ 49 | if isinstance(answer, str): 50 | self.answers[part] = answer 51 | else: 52 | try: 53 | self.answers[part] = " ".join(map(repr, answer)) 54 | except TypeError: 55 | self.answers[part] = repr(answer) 56 | 57 | 58 | def array_to_grader(array, epsilon=1e-4): 59 | """Utility function to help preparing Coursera grading conditions descriptions. 60 | Args: 61 | array: iterable of numbers, the correct answers 62 | epslion: the generated expression will accept the answers with this absolute difference with 63 | provided values 64 | Returns: 65 | String. A Coursera grader expression that checks whether the user submission is in 66 | (array - epsilon, array + epsilon)""" 67 | res = [] 68 | for element in array: 69 | if isinstance(element, int): 70 | res.append("[{0}, {0}]".format(element)) 71 | else: 72 | res.append("({0}, {1})".format(element - epsilon, element + epsilon)) 73 | return " ".join(res) 74 | -------------------------------------------------------------------------------- /intro-to-dle/keras_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import keras 4 | import tqdm 5 | from collections import defaultdict 6 | import numpy as np 7 | 8 | 9 | class TqdmProgressCallback(keras.callbacks.Callback): 10 | 11 | def on_train_begin(self, logs=None): 12 | self.epochs = self.params['epochs'] 13 | 14 | def on_epoch_begin(self, epoch, logs=None): 15 | print('Epoch %d/%d' % (epoch + 1, self.epochs)) 16 | if "steps" in self.params: 17 | self.use_steps = True 18 | self.target = self.params['steps'] 19 | else: 20 | self.use_steps = False 21 | self.target = self.params['samples'] 22 | self.prog_bar = tqdm.tqdm_notebook(total=self.target) 23 | self.log_values_by_metric = defaultdict(list) 24 | 25 | def _set_prog_bar_desc(self, logs): 26 | for k in self.params['metrics']: 27 | if k in logs: 28 | self.log_values_by_metric[k].append(logs[k]) 29 | desc = "; ".join("{0}: {1:.3f}".format(k, np.mean(values)) for k, values in self.log_values_by_metric.items()) 30 | self.prog_bar.set_description(desc) 31 | 32 | def on_batch_end(self, batch, logs=None): 33 | logs = logs or {} 34 | if self.use_steps: 35 | self.prog_bar.update(1) 36 | else: 37 | batch_size = logs.get('size', 0) 38 | self.prog_bar.update(batch_size) 39 | self._set_prog_bar_desc(logs) 40 | 41 | def on_epoch_end(self, epoch, logs=None): 42 | logs = logs or {} 43 | self._set_prog_bar_desc(logs) 44 | self.prog_bar.update(1) # workaround to show description 45 | self.prog_bar.close() 46 | -------------------------------------------------------------------------------- /intro-to-dle/misc/np_convolution.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import convolve2d 3 | my_array = np.array([[0, 0, 0, 0], 4 | [0, 1, 0, 0], 5 | [0, 0, 0, 0], 6 | [0, 0, 0, 0]]) 7 | 8 | kernel = np.array([[0, 1, 0], 9 | [1, 1, 1], 10 | [0, 1, 0]]) 11 | 12 | convolved = convolve2d(my_array, kernel, mode="same") 13 | print(convolved) 14 | -------------------------------------------------------------------------------- /intro-to-dle/week1/kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/kernel.png -------------------------------------------------------------------------------- /intro-to-dle/week1/sgd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/sgd.png -------------------------------------------------------------------------------- /intro-to-dle/week1/target.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/target.npy -------------------------------------------------------------------------------- /intro-to-dle/week1/train.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/train.npy -------------------------------------------------------------------------------- /intro-to-dle/week2/datasets: -------------------------------------------------------------------------------- 1 | /home/jose/.keras/datasets/ -------------------------------------------------------------------------------- /intro-to-dle/week2/matplotlib_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from IPython.display import clear_output, display_html, HTML 4 | import contextlib 5 | import time 6 | import io 7 | import urllib 8 | import base64 9 | 10 | 11 | class SimpleMovieWriter(object): 12 | """ 13 | Usage example: 14 | anim = animation.FuncAnimation(...) 15 | anim.save(None, writer=SimpleMovieWriter(sleep=0.01)) 16 | """ 17 | def __init__(self, sleep=0.1): 18 | self.sleep = sleep 19 | 20 | def setup(self, fig): 21 | self.fig = fig 22 | 23 | def grab_frame(self, **kwargs): 24 | img_data = io.BytesIO() 25 | self.fig.savefig(img_data, format='jpeg') 26 | img_data.seek(0) 27 | uri = 'data:image/jpeg;base64,' + urllib.request.quote(base64.b64encode(img_data.getbuffer())) 28 | img_data.close() 29 | clear_output(wait=True) 30 | display_html(HTML('')) 31 | time.sleep(self.sleep) 32 | 33 | @contextlib.contextmanager 34 | def saving(self, fig, *args, **kwargs): 35 | self.setup(fig) 36 | try: 37 | yield self 38 | finally: 39 | pass 40 | -------------------------------------------------------------------------------- /intro-to-dle/week2/models: -------------------------------------------------------------------------------- 1 | /home/jose/.keras/models/ -------------------------------------------------------------------------------- /intro-to-dle/week2/preprocessed_mnist.py: -------------------------------------------------------------------------------- 1 | import keras 2 | 3 | 4 | def load_dataset(flatten=False): 5 | (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data() 6 | 7 | # normalize x 8 | X_train = X_train.astype(float) / 255. 9 | X_test = X_test.astype(float) / 255. 10 | 11 | # we reserve the last 10000 training examples for validation 12 | X_train, X_val = X_train[:-10000], X_train[-10000:] 13 | y_train, y_val = y_train[:-10000], y_train[-10000:] 14 | 15 | if flatten: 16 | X_train = X_train.reshape([X_train.shape[0], -1]) 17 | X_val = X_val.reshape([X_val.shape[0], -1]) 18 | X_test = X_test.reshape([X_test.shape[0], -1]) 19 | 20 | return X_train, y_train, X_val, y_val, X_test, y_test 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /intro-to-dle/week2/submit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import mean_squared_error 3 | import sys 4 | sys.path.append("..") 5 | import grading 6 | 7 | 8 | def submit_mse(compute_mse, email, token): 9 | ASSIGNMENT_KEY = "SBaWP48eEeeGSBKyliRlgg" 10 | PART_KEY = "u2t7D" 11 | 12 | # First, do rigorous local testing to help the learner 13 | for n in [1, 5, 10, 10**3]: 14 | elems = [np.arange(n), np.arange(n, 0, -1), np.zeros(n), 15 | np.ones(n), np.random.random(n), np.random.randint(100, size=n)] 16 | for el in elems: 17 | for el_2 in elems: 18 | true_mse = np.array(mean_squared_error(el, el_2)) 19 | my_mse = compute_mse(el, el_2) 20 | if not np.allclose(true_mse, my_mse): 21 | print('mse(%s,%s)' % (el, el_2)) 22 | print("should be: %f, but your function returned %f" % (true_mse, my_mse)) 23 | raise ValueError('Wrong result') 24 | # Second, submit some reference values. There is nothing preventing the learner from 25 | # manually submitting numbers computed not via tensorflow, so there is little point 26 | # in comprehensive server-side testing 27 | test_pairs = ( 28 | (np.array([ 29 | 0.85415937, 0.768366, 0.9763879, 0.11861405, 0.21219242]), 30 | np.array([0.27163543, 0.14893905, 0.84616464, 31 | 0.86294942, 0.65509213])), 32 | (np.array([1, 2, 3]), np.array([3, 2, 2])), 33 | (np.array([1]), np.array([1]))) 34 | answers = [] 35 | for pair in test_pairs: 36 | answers.append(compute_mse(pair[0], pair[1])) 37 | grader = grading.Grader(ASSIGNMENT_KEY) 38 | grader.set_answer(PART_KEY, answers) 39 | grader.submit(email, token) 40 | -------------------------------------------------------------------------------- /intro-to-dle/week2/util.py: -------------------------------------------------------------------------------- 1 | """Some auxiliary files used for honor track numpy assignment""" 2 | import numpy as np 3 | 4 | 5 | def eval_numerical_gradient(f, x, verbose=False, h=0.00001): 6 | """Evaluates gradient df/dx via finite differences: 7 | df/dx ~ (f(x+h) - f(x-h)) / 2h 8 | Adopted from https://github.com/ddtm/dl-course/ (our ysda course). 9 | """ 10 | fx = f(x) # evaluate function value at original point 11 | grad = np.zeros_like(x) 12 | # iterate over all indexes in x 13 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 14 | while not it.finished: 15 | 16 | # evaluate function at x+h 17 | ix = it.multi_index 18 | oldval = x[ix] 19 | x[ix] = oldval + h # increment by h 20 | fxph = f(x) # evalute f(x + h) 21 | x[ix] = oldval - h 22 | fxmh = f(x) # evaluate f(x - h) 23 | x[ix] = oldval # restore 24 | 25 | # compute the partial derivative with centered formula 26 | grad[ix] = (fxph - fxmh) / (2 * h) # the slope 27 | if verbose: 28 | print (ix, grad[ix]) 29 | it.iternext() # step to next dimension 30 | 31 | return grad 32 | -------------------------------------------------------------------------------- /intro-to-dle/week3/grading_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | 5 | def model_total_params(model): 6 | """ 7 | Total params for Keras model 8 | """ 9 | summary = [] 10 | model.summary(print_fn=lambda x: summary.append(x)) 11 | for line in summary: 12 | m = re.match("Total params: ([\d,]+)", line) 13 | if m: 14 | return int(re.sub(",", "", m.groups()[0])) 15 | return 0 16 | -------------------------------------------------------------------------------- /intro-to-dle/week3/imagelabels.mat: -------------------------------------------------------------------------------- 1 | /home/jose/Escritorio/advanced_deep_learning/intro-to-dl/readonly/week3/imagelabels.mat -------------------------------------------------------------------------------- /intro-to-dle/week3/images/inceptionv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week3/images/inceptionv3.png -------------------------------------------------------------------------------- /intro-to-dle/week3/weights.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week3/weights.p -------------------------------------------------------------------------------- /intro-to-dle/week4/lfw_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import cv2 4 | import pandas as pd 5 | import tarfile 6 | import tqdm 7 | 8 | 9 | ATTRS_NAME = "lfw_attributes.txt" # http://www.cs.columbia.edu/CAVE/databases/pubfig/download/lfw_attributes.txt 10 | IMAGES_NAME = "lfw-deepfunneled.tgz" # http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz 11 | RAW_IMAGES_NAME = "lfw.tgz" # http://vis-www.cs.umass.edu/lfw/lfw.tgz 12 | 13 | 14 | def decode_image_from_raw_bytes(raw_bytes): 15 | img = cv2.imdecode(np.asarray(bytearray(raw_bytes), dtype=np.uint8), 1) 16 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 17 | return img 18 | 19 | 20 | def load_lfw_dataset( 21 | use_raw=False, 22 | dx=80, dy=80, 23 | dimx=45, dimy=45): 24 | 25 | # read attrs 26 | df_attrs = pd.read_csv(ATTRS_NAME, sep='\t', skiprows=1) 27 | df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values, columns=df_attrs.columns[1:]) 28 | imgs_with_attrs = set(map(tuple, df_attrs[["person", "imagenum"]].values)) 29 | 30 | # read photos 31 | all_photos = [] 32 | photo_ids = [] 33 | 34 | with tarfile.open(RAW_IMAGES_NAME if use_raw else IMAGES_NAME) as f: 35 | for m in tqdm.tqdm_notebook(f.getmembers()): 36 | if m.isfile() and m.name.endswith(".jpg"): 37 | # prepare image 38 | img = decode_image_from_raw_bytes(f.extractfile(m).read()) 39 | img = img[dy:-dy, dx:-dx] 40 | img = cv2.resize(img, (dimx, dimy)) 41 | # parse person 42 | fname = os.path.split(m.name)[-1] 43 | fname_splitted = fname[:-4].replace('_', ' ').split() 44 | person_id = ' '.join(fname_splitted[:-1]) 45 | photo_number = int(fname_splitted[-1]) 46 | if (person_id, photo_number) in imgs_with_attrs: 47 | all_photos.append(img) 48 | photo_ids.append({'person': person_id, 'imagenum': photo_number}) 49 | 50 | photo_ids = pd.DataFrame(photo_ids) 51 | all_photos = np.stack(all_photos).astype('uint8') 52 | 53 | # preserve photo_ids order! 54 | all_attrs = photo_ids.merge(df_attrs, on=('person', 'imagenum')).drop(["person", "imagenum"], axis=1) 55 | 56 | return all_photos, all_attrs 57 | -------------------------------------------------------------------------------- /intro-to-dle/week4/submit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import grading 4 | 5 | 6 | # code_size = 71 7 | # img_shape = (38, 38, 3) 8 | def submit_autoencoder(submission, score, email, token): 9 | grader = grading.Grader("9TShnp1JEeeGGAoCUnhvuA") 10 | encoder, decoder = submission 11 | grader.set_answer("FtBSK", encoder.output_shape[1]) 12 | grader.set_answer("83Glu", decoder.output_shape[1:]) 13 | grader.set_answer("fnM1K", score) 14 | grader.submit(email, token) 15 | -------------------------------------------------------------------------------- /intro-to-dle/week5/data_copyright: -------------------------------------------------------------------------------- 1 | @names 2 | # Copyright (c) January 1991 by Mark Kantrowitz. 3 | # Thanks to Bill Ross for about 1000 additional names. 4 | # Version 1.3 (29-MAR-94) 5 | 6 | @mtg cards 7 | https://mtgjson.com/ 8 | 9 | -------------------------------------------------------------------------------- /intro-to-dle/week5/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week5/rnn.png -------------------------------------------------------------------------------- /intro-to-dle/week5/submit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | sys.path.append("..") 4 | import grading 5 | 6 | 7 | # code_size = 71 8 | # img_shape = (38, 38, 3) 9 | def submit_char_rnn(submission, email, token): 10 | grader = grading.Grader("cULEpp2NEeemQBKZKgu93A") 11 | history, samples = submission 12 | assert len(samples) == 25 13 | grader.set_answer("pttMO", int(np.mean(history[:10]) > np.mean(history[-10:]))) 14 | grader.set_answer("uly0D", len(set(samples))) 15 | grader.submit(email, token) 16 | -------------------------------------------------------------------------------- /intro-to-dle/week6/grading_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | import random 5 | 6 | 7 | def test_vocab(vocab, PAD, UNK, START, END): 8 | return [ 9 | len(vocab), 10 | len(np.unique(list(vocab.values()))), 11 | int(all([_ in vocab for _ in [PAD, UNK, START, END]])) 12 | ] 13 | 14 | 15 | def test_captions_indexing(train_captions_indexed, vocab, UNK): 16 | starts = set() 17 | ends = set() 18 | between = set() 19 | unk_count = 0 20 | for caps in train_captions_indexed: 21 | for cap in caps: 22 | starts.add(cap[0]) 23 | between.update(cap[1:-1]) 24 | ends.add(cap[-1]) 25 | for w in cap: 26 | if w == vocab[UNK]: 27 | unk_count += 1 28 | return [ 29 | len(starts), 30 | len(ends), 31 | len(between), 32 | len(between | starts | ends), 33 | int(all([isinstance(x, int) for x in (between | starts | ends)])), 34 | unk_count 35 | ] 36 | 37 | 38 | def test_captions_batching(batch_captions_to_matrix): 39 | return (batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=None).ravel().tolist() 40 | + batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=2).ravel().tolist() 41 | + batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=10).ravel().tolist()) 42 | 43 | 44 | def get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab): 45 | return { 46 | decoder.img_embeds: np.random.random((32, IMG_EMBED_SIZE)), 47 | decoder.sentences: np.random.randint(0, len(vocab), (32, 20)) 48 | } 49 | 50 | 51 | def test_decoder_shapes(decoder, IMG_EMBED_SIZE, vocab, s): 52 | tensors_to_test = [ 53 | decoder.h0, 54 | decoder.word_embeds, 55 | decoder.flat_hidden_states, 56 | decoder.flat_token_logits, 57 | decoder.flat_ground_truth, 58 | decoder.flat_loss_mask, 59 | decoder.loss 60 | ] 61 | all_shapes = [] 62 | for t in tensors_to_test: 63 | _ = s.run(t, feed_dict=get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab)) 64 | all_shapes.extend(_.shape) 65 | return all_shapes 66 | 67 | 68 | def test_random_decoder_loss(decoder, IMG_EMBED_SIZE, vocab, s): 69 | loss = s.run(decoder.loss, feed_dict=get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab)) 70 | return loss 71 | 72 | 73 | def test_validation_loss(decoder, s, generate_batch, val_img_embeds, val_captions_indexed): 74 | np.random.seed(300) 75 | random.seed(300) 76 | val_loss = 0 77 | for _ in range(1000): 78 | val_loss += s.run(decoder.loss, generate_batch(val_img_embeds, 79 | val_captions_indexed, 80 | 32, 81 | 20)) 82 | val_loss /= 1000. 83 | return val_loss 84 | -------------------------------------------------------------------------------- /intro-to-dle/week6/images/encoder_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/encoder_decoder.png -------------------------------------------------------------------------------- /intro-to-dle/week6/images/encoder_decoder_explained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/encoder_decoder_explained.png -------------------------------------------------------------------------------- /intro-to-dle/week6/images/inceptionv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/inceptionv3.png -------------------------------------------------------------------------------- /intro-to-dle/week6/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import queue 5 | import threading 6 | import zipfile 7 | import tqdm 8 | import cv2 9 | import numpy as np 10 | import pickle 11 | 12 | 13 | def image_center_crop(img): 14 | h, w = img.shape[0], img.shape[1] 15 | pad_left = 0 16 | pad_right = 0 17 | pad_top = 0 18 | pad_bottom = 0 19 | if h > w: 20 | diff = h - w 21 | pad_top = diff - diff // 2 22 | pad_bottom = diff // 2 23 | else: 24 | diff = w - h 25 | pad_left = diff - diff // 2 26 | pad_right = diff // 2 27 | return img[pad_top:h-pad_bottom, pad_left:w-pad_right, :] 28 | 29 | 30 | def decode_image_from_buf(buf): 31 | img = cv2.imdecode(np.asarray(bytearray(buf), dtype=np.uint8), 1) 32 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 33 | return img 34 | 35 | 36 | def crop_and_preprocess(img, input_shape, preprocess_for_model): 37 | img = image_center_crop(img) # take center crop 38 | img = cv2.resize(img, input_shape) # resize for our model 39 | img = img.astype("float32") # prepare for normalization 40 | img = preprocess_for_model(img) # preprocess for model 41 | return img 42 | 43 | 44 | def apply_model(zip_fn, model, preprocess_for_model, extensions=(".jpg",), input_shape=(224, 224), batch_size=32): 45 | # queue for cropped images 46 | q = queue.Queue(maxsize=batch_size * 10) 47 | 48 | # when read thread put all images in queue 49 | read_thread_completed = threading.Event() 50 | 51 | # time for read thread to die 52 | kill_read_thread = threading.Event() 53 | 54 | def reading_thread(zip_fn): 55 | zf = zipfile.ZipFile(zip_fn) 56 | for fn in tqdm.tqdm_notebook(zf.namelist()): 57 | if kill_read_thread.is_set(): 58 | break 59 | if os.path.splitext(fn)[-1] in extensions: 60 | buf = zf.read(fn) # read raw bytes from zip for fn 61 | img = decode_image_from_buf(buf) # decode raw bytes 62 | img = crop_and_preprocess(img, input_shape, preprocess_for_model) 63 | while True: 64 | try: 65 | q.put((os.path.split(fn)[-1], img), timeout=1) # put in queue 66 | except queue.Full: 67 | if kill_read_thread.is_set(): 68 | break 69 | continue 70 | break 71 | 72 | read_thread_completed.set() # read all images 73 | 74 | # start reading thread 75 | t = threading.Thread(target=reading_thread, args=(zip_fn,)) 76 | t.daemon = True 77 | t.start() 78 | 79 | img_fns = [] 80 | img_embeddings = [] 81 | 82 | batch_imgs = [] 83 | 84 | def process_batch(batch_imgs): 85 | batch_imgs = np.stack(batch_imgs, axis=0) 86 | batch_embeddings = model.predict(batch_imgs) 87 | img_embeddings.append(batch_embeddings) 88 | 89 | try: 90 | while True: 91 | try: 92 | fn, img = q.get(timeout=1) 93 | except queue.Empty: 94 | if read_thread_completed.is_set(): 95 | break 96 | continue 97 | img_fns.append(fn) 98 | batch_imgs.append(img) 99 | if len(batch_imgs) == batch_size: 100 | process_batch(batch_imgs) 101 | batch_imgs = [] 102 | q.task_done() 103 | # process last batch 104 | if len(batch_imgs): 105 | process_batch(batch_imgs) 106 | finally: 107 | kill_read_thread.set() 108 | t.join() 109 | 110 | q.join() 111 | 112 | img_embeddings = np.vstack(img_embeddings) 113 | return img_embeddings, img_fns 114 | 115 | 116 | def save_pickle(obj, fn): 117 | with open(fn, "wb") as f: 118 | pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) 119 | 120 | 121 | def read_pickle(fn): 122 | with open(fn, "rb") as f: 123 | return pickle.load(f) 124 | -------------------------------------------------------------------------------- /natural-language-processing/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # Data for assignments 104 | data/ 105 | 106 | week3/starSpaceModel* 107 | -------------------------------------------------------------------------------- /natural-language-processing/AWS-tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial for setting up an AWS Virtual Machine 2 | 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 4 | 5 | ### 1. Register with AWS and launch an EC2 instance 6 | 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them): 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged. 9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH. 10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change. 11 | 12 | Next, you are ready to create your first EC2 instance: 13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) on step 3 choose **Ubuntu Server 16.04 LTS**. 14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH. 15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end. 16 | 17 | ### 2. Set up dependencies and run your project 18 | 19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions. 20 | 21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling: 22 | ```sh 23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com 24 | ``` 25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine. 26 | 27 | - Bring code and data to AWS instance, e.g. 28 | ```sh 29 | scp -i path/to/your_key.pem path/to/local_file ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file 30 | ``` 31 | You might want to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows. 32 | 33 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop. 34 | 35 | -------------------------------------------------------------------------------- /natural-language-processing/Docker-tutorial.md: -------------------------------------------------------------------------------- 1 | # Docker container with course dependencies 2 | 3 | This file describes how to use a Docker container with Jupyter notebook and 4 | all dependencies required for the course. 5 | 6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/. 7 | 8 | ## Install Stable Docker Community Edition (CE) 9 | 10 | - For Mac: 11 | https://docs.docker.com/docker-for-mac/install/ 12 | 13 | - For Ubuntu: 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributives in the menu). 15 | 16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education): 17 | https://docs.docker.com/docker-for-windows/install/ 18 | 19 | - For Windows (older versions): 20 | https://docs.docker.com/toolbox/toolbox_install_windows/ 21 | 22 | 23 | 24 | ## Get container image 25 | 26 | To get the latest version of the container image run: 27 | ```sh 28 | docker pull akashin/coursera-aml-nlp 29 | ``` 30 | It containes Ubuntu 16.04 Linux distirbutive and all dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 31 | 32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add current user to the docker group: 33 | ```sh 34 | sudo usermod -a -G docker $USER 35 | sudo service docker restart 36 | ``` 37 | Then you need to logout and login to the system again (disconnect and connect to your AWS instance if you are setting up a docker on it). 38 | 39 | 40 | ## Run container for the first time 41 | 42 | Now you can start new container from this image with: 43 | ```sh 44 | docker run -it -p 127.0.0.1:8080:8080 --name coursera-aml-nlp akashin/coursera-aml-nlp 45 | ``` 46 | This will start the Ubuntu instance and give you an access to its command line. You can type `run_notebook` to launch IPython notebook server. 47 | 48 | You may find it useful to mount a directory from your local machine within the container using `-v` option: 49 | ```sh 50 | docker run -it -p 127.0.0.1:8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp 51 | ``` 52 | This will use shell alias `$PWD` to mount current directory to the folder `/root/coursera` in the container. Alternatively, you can mount arbitrary directory by replacing `$PWD` with a custom path. 53 | 54 | ## Stop and resume container 55 | 56 | To stop the container use: 57 | ```sh 58 | docker stop coursera-aml-nlp 59 | ``` 60 | All the changes that were made within container will be saved. 61 | 62 | To resume the stopped container use: 63 | ```sh 64 | docker start -i coursera-aml-nlp 65 | ``` 66 | ## Other operations on the container 67 | 68 | There are many other operations that you can perform on the container, to show all of them: 69 | ```sh 70 | docker container 71 | ``` 72 | Some particularly useful would be **showing a list of containers** and **removing container**. 73 | 74 | To show currently running and stopped containers with their status: 75 | ```sh 76 | docker ps -a 77 | ``` 78 | 79 | To remove the container and all data associated with it: 80 | ```sh 81 | docker rm coursera-aml-nlp 82 | ``` 83 | Note, that this will remove all the internal data of the container (e.g. installed packages), but all the data written inside of your local mounted folder (`-v` option) will not be affected. 84 | 85 | ## Install more packages 86 | 87 | You can install more packages in the container if needed: 88 | ```sh 89 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME 90 | ``` 91 | 92 | ## Further reading 93 | 94 | If you are interested to know more about Docker, check out this articles: 95 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/ 96 | - General introduction to Docker: https://docker-curriculum.com/ 97 | 98 | 99 | ## Credits 100 | 101 | The template for this dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker 102 | -------------------------------------------------------------------------------- /natural-language-processing/common/README.md: -------------------------------------------------------------------------------- 1 | # Common utils 2 | 3 | This folder stores collection of functions that are common for different assignments 4 | 5 | - `download_utils.py`: Functions for downloading data for the assignments. 6 | -------------------------------------------------------------------------------- /natural-language-processing/common/download_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import shutil 5 | import tqdm 6 | # Address problem in tqdm library. For details see: https://github.com/tqdm/tqdm/issues/481 7 | tqdm.monitor_interval = 0 8 | import requests 9 | 10 | REPOSITORY_PATH="https://github.com/hse-aml/natural-language-processing" 11 | 12 | 13 | def download_file(url, file_path): 14 | r = requests.get(url, stream=True) 15 | total_size = int(r.headers.get('content-length')) 16 | try: 17 | with open(file_path, 'wb', buffering=16*1024*1024) as f: 18 | bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True) 19 | bar.set_description(os.path.split(file_path)[-1]) 20 | for chunk in r.iter_content(32 * 1024): 21 | f.write(chunk) 22 | bar.update(len(chunk)) 23 | bar.close() 24 | except Exception: 25 | print("Download failed") 26 | finally: 27 | if os.path.getsize(file_path) != total_size: 28 | os.remove(file_path) 29 | print("Removed incomplete download") 30 | 31 | 32 | def download_from_github(version, fn, target_dir, force=False): 33 | url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn) 34 | file_path = os.path.join(target_dir, fn) 35 | if os.path.exists(file_path) and not force: 36 | print("File {} is already downloaded.".format(file_path)) 37 | return 38 | download_file(url, file_path) 39 | 40 | 41 | def sequential_downloader(version, fns, target_dir, force=False): 42 | os.makedirs(target_dir, exist_ok=True) 43 | for fn in fns: 44 | download_from_github(version, fn, target_dir, force=force) 45 | 46 | 47 | def link_all_files_from_dir(src_dir, dst_dir): 48 | os.makedirs(dst_dir, exist_ok=True) 49 | for fn in os.listdir(src_dir): 50 | src_file = os.path.join(src_dir, fn) 51 | dst_file = os.path.join(dst_dir, fn) 52 | if os.name == "nt": 53 | shutil.copyfile(src_file, dst_file) 54 | else: 55 | if not os.path.exists(dst_file): 56 | os.symlink(os.path.abspath(src_file), dst_file) 57 | 58 | 59 | def link_resources(): 60 | link_all_files_from_dir("../readonly/dataset/", ".") 61 | 62 | 63 | def download_week1_resources(force=False): 64 | sequential_downloader( 65 | "week1", 66 | [ 67 | "train.tsv", 68 | "validation.tsv", 69 | "test.tsv", 70 | "text_prepare_tests.tsv", 71 | ], 72 | "data", 73 | force=force 74 | ) 75 | 76 | 77 | def download_week2_resources(force=False): 78 | sequential_downloader( 79 | "week2", 80 | [ 81 | "train.txt", 82 | "validation.txt", 83 | "test.txt", 84 | ], 85 | "data", 86 | force=force 87 | ) 88 | 89 | 90 | def download_week3_resources(force=False): 91 | sequential_downloader( 92 | "week3", 93 | [ 94 | "train.tsv", 95 | "validation.tsv", 96 | "test.tsv", 97 | "test_embeddings.tsv", 98 | ], 99 | "data", 100 | force=force 101 | ) 102 | 103 | 104 | def download_project_resources(force=False): 105 | sequential_downloader( 106 | "project", 107 | [ 108 | "dialogues.tsv", 109 | "tagged_posts.tsv", 110 | ], 111 | "data", 112 | force=force 113 | ) 114 | -------------------------------------------------------------------------------- /natural-language-processing/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | LABEL maintainer="Andrei Kashin " 3 | 4 | RUN apt-get update && apt-get install -yq \ 5 | python3 python3-pip htop nano git wget \ 6 | libglib2.0-0 autoconf automake \ 7 | libtool build-essential unzip \ 8 | libarchive-dev vim 9 | 10 | # Install Starspace. 11 | RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \ 12 | unzip boost_1_63_0.zip && \ 13 | mv boost_1_63_0 /usr/local/bin 14 | 15 | RUN git clone https://github.com/facebookresearch/Starspace.git && \ 16 | cd Starspace && \ 17 | make && \ 18 | cp -Rf starspace /usr/local/bin 19 | 20 | # Install Python dependencies. 21 | ADD requirements.txt / 22 | RUN pip3 install --upgrade pip 23 | RUN pip3 install -r requirements.txt 24 | 25 | # Install Jupyter. 26 | RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension 27 | RUN jupyter contrib nbextension install 28 | RUN jupyter nbextension enable codefolding/main 29 | RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py 30 | RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py 31 | RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py 32 | RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook 33 | 34 | # Welcome message. 35 | ADD welcome_message.txt / 36 | RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \ 37 | >> /etc/bash.bashrc \ 38 | ; cat welcome_message.txt > /etc/motd 39 | 40 | WORKDIR /root 41 | EXPOSE 8080 42 | -------------------------------------------------------------------------------- /natural-language-processing/honor/LSTM reply.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/jose/scratch/venv/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "[nltk_data] Downloading package stopwords to /home/jose/nltk_data...\n", 29 | "[nltk_data] Package stopwords is already up-to-date!\n" 30 | ] 31 | } 32 | ], 33 | "source": [] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 17, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "'Hello something im going to go'" 44 | ] 45 | }, 46 | "execution_count": 17, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "Python 3", 57 | "language": "python", 58 | "name": "python3" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.6.3" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 2 75 | } 76 | -------------------------------------------------------------------------------- /natural-language-processing/honor/README.md: -------------------------------------------------------------------------------- 1 | # Utils to download and read data for chat-bot training 2 | 3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training: 4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size) 5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size) 6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset 7 | - `example.py` - example of reading the dataset 8 | -------------------------------------------------------------------------------- /natural-language-processing/honor/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /natural-language-processing/honor/datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Conchylicultor. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ast 17 | import os 18 | import random 19 | import re 20 | from time import time 21 | 22 | import nltk 23 | from tqdm import tqdm 24 | 25 | """ 26 | Load the cornell movie dialog corpus. 27 | 28 | Available from here: 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html 30 | 31 | """ 32 | 33 | class CornellData: 34 | """ 35 | 36 | """ 37 | 38 | def __init__(self, dirName): 39 | """ 40 | Args: 41 | dirName (string): directory where to load the corpus 42 | """ 43 | self.lines = {} 44 | self.conversations = [] 45 | 46 | MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"] 47 | MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"] 48 | 49 | self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS) 50 | self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS) 51 | 52 | # TODO: Cleaner program (merge copy-paste) !! 53 | 54 | def loadLines(self, fileName, fields): 55 | """ 56 | Args: 57 | fileName (str): file to load 58 | field (set): fields to extract 59 | Return: 60 | dict>: the extracted fields for each line 61 | """ 62 | lines = {} 63 | 64 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 65 | for line in f: 66 | values = line.split(" +++$+++ ") 67 | 68 | # Extract fields 69 | lineObj = {} 70 | for i, field in enumerate(fields): 71 | lineObj[field] = values[i] 72 | 73 | lines[lineObj['lineID']] = lineObj 74 | 75 | return lines 76 | 77 | def loadConversations(self, fileName, fields): 78 | """ 79 | Args: 80 | fileName (str): file to load 81 | field (set): fields to extract 82 | Return: 83 | list>: the extracted fields for each line 84 | """ 85 | conversations = [] 86 | 87 | with open(fileName, 'r', encoding='iso-8859-1') as f: # TODO: Solve Iso encoding pb ! 88 | for line in f: 89 | values = line.split(" +++$+++ ") 90 | 91 | # Extract fields 92 | convObj = {} 93 | for i, field in enumerate(fields): 94 | convObj[field] = values[i] 95 | 96 | # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]") 97 | lineIds = ast.literal_eval(convObj["utteranceIDs"]) 98 | 99 | # Reassemble lines 100 | convObj["lines"] = [] 101 | for lineId in lineIds: 102 | convObj["lines"].append(self.lines[lineId]) 103 | 104 | conversations.append(convObj) 105 | 106 | return conversations 107 | 108 | def getConversations(self): 109 | return self.conversations 110 | 111 | 112 | # Based on code from https://github.com/AlJohri/OpenSubtitles 113 | # by Al Johri 114 | 115 | import xml.etree.ElementTree as ET 116 | import datetime 117 | import os 118 | import sys 119 | import json 120 | import re 121 | import pprint 122 | 123 | from gzip import GzipFile 124 | 125 | """ 126 | Load the opensubtitles dialog corpus. 127 | """ 128 | 129 | class OpensubsData: 130 | """ 131 | """ 132 | 133 | def __init__(self, dirName): 134 | """ 135 | Args: 136 | dirName (string): directory where to load the corpus 137 | """ 138 | 139 | # Hack this to filter on subset of Opensubtitles 140 | # dirName = "%s/en/Action" % dirName 141 | 142 | print("Loading OpenSubtitles conversations in %s." % dirName) 143 | self.conversations = [] 144 | self.tag_re = re.compile(r'(|<[^>]*>)') 145 | self.conversations = self.loadConversations(dirName) 146 | 147 | def loadConversations(self, dirName): 148 | """ 149 | Args: 150 | dirName (str): folder to load 151 | Return: 152 | array(question, answer): the extracted QA pairs 153 | """ 154 | conversations = [] 155 | dirList = self.filesInDir(dirName) 156 | for filepath in tqdm(dirList, "OpenSubtitles data files"): 157 | if filepath.endswith('gz'): 158 | try: 159 | doc = self.getXML(filepath) 160 | conversations.extend(self.genList(doc)) 161 | except ValueError: 162 | tqdm.write("Skipping file %s with errors." % filepath) 163 | except: 164 | print("Unexpected error:", sys.exc_info()[0]) 165 | raise 166 | return conversations 167 | 168 | def getConversations(self): 169 | return self.conversations 170 | 171 | def genList(self, tree): 172 | root = tree.getroot() 173 | 174 | timeFormat = '%H:%M:%S' 175 | maxDelta = datetime.timedelta(seconds=1) 176 | 177 | startTime = datetime.datetime.min 178 | strbuf = '' 179 | sentList = [] 180 | 181 | for child in root: 182 | for elem in child: 183 | if elem.tag == 'time': 184 | elemID = elem.attrib['id'] 185 | elemVal = elem.attrib['value'][:-4] 186 | if elemID[-1] == 'S': 187 | startTime = datetime.datetime.strptime(elemVal, timeFormat) 188 | else: 189 | sentList.append((strbuf.strip(), startTime, datetime.datetime.strptime(elemVal, timeFormat))) 190 | strbuf = '' 191 | else: 192 | try: 193 | strbuf = strbuf + " " + elem.text 194 | except: 195 | pass 196 | 197 | conversations = [] 198 | for idx in range(0, len(sentList) - 1): 199 | cur = sentList[idx] 200 | nxt = sentList[idx + 1] 201 | if nxt[1] - cur[2] <= maxDelta and cur and nxt: 202 | tmp = {} 203 | tmp["lines"] = [] 204 | tmp["lines"].append(self.getLine(cur[0])) 205 | tmp["lines"].append(self.getLine(nxt[0])) 206 | if self.filter(tmp): 207 | conversations.append(tmp) 208 | 209 | return conversations 210 | 211 | def getLine(self, sentence): 212 | line = {} 213 | line["text"] = self.tag_re.sub('', sentence).replace('\\\'','\'').strip().lower() 214 | return line 215 | 216 | def filter(self, lines): 217 | # Use the followint to customize filtering of QA pairs 218 | # 219 | # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will") 220 | # question = lines["lines"][0]["text"] 221 | # if not question.endswith('?'): 222 | # return False 223 | # if not question.split(' ')[0] in startwords: 224 | # return False 225 | # 226 | return True 227 | 228 | def getXML(self, filepath): 229 | fext = os.path.splitext(filepath)[1] 230 | if fext == '.gz': 231 | tmp = GzipFile(filename=filepath) 232 | return ET.parse(tmp) 233 | else: 234 | return ET.parse(filepath) 235 | 236 | def filesInDir(self, dirname): 237 | result = [] 238 | for dirpath, dirs, files in os.walk(dirname): 239 | for filename in files: 240 | fname = os.path.join(dirpath, filename) 241 | result.append(fname) 242 | return result 243 | 244 | 245 | def extractText(line, fast_preprocessing=True): 246 | if fast_preprocessing: 247 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]') 248 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]') 249 | REPLACE_SEVERAL_SPACES = re.compile('\s+') 250 | 251 | line = line.lower() 252 | line = REPLACE_BY_SPACE_RE.sub(' ', line) 253 | line = GOOD_SYMBOLS_RE.sub('', line) 254 | line = REPLACE_SEVERAL_SPACES.sub(' ', line) 255 | return line.strip() 256 | else: 257 | return nltk.word_tokenize(line) 258 | 259 | 260 | def splitConversations(conversations, max_len=20, fast_preprocessing=True): 261 | data = [] 262 | for i, conversation in enumerate(tqdm(conversations)): 263 | lines = conversation['lines'] 264 | for i in range(len(lines) - 1): 265 | request = extractText(lines[i]['text']) 266 | reply = extractText(lines[i + 1]['text']) 267 | if 0 < len(request) <= max_len and 0 < len(reply) <= max_len: 268 | data += [(request, reply)] 269 | return data 270 | 271 | 272 | def readCornellData(path, max_len=20, fast_preprocessing=True): 273 | dataset = CornellData(path) 274 | conversations = dataset.getConversations() 275 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 276 | 277 | 278 | def readOpensubsData(path, max_len=20, fast_preprocessing=True): 279 | dataset = OpensubsData(path) 280 | conversations = dataset.getConversations() 281 | return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing) 282 | -------------------------------------------------------------------------------- /natural-language-processing/honor/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | import tfmodel 7 | from tfmodel import * 8 | 9 | class ThreadRanker(object): 10 | def __init__(self, paths): 11 | self.word_embeddings, self.embeddings_dim = load_embeddings( 12 | paths['WORD_EMBEDDINGS']) 13 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 14 | 15 | def __load_embeddings_by_tag(self, tag_name): 16 | embeddings_path = os.path.join( 17 | self.thread_embeddings_folder, tag_name + ".pkl") 18 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 19 | return thread_ids, thread_embeddings 20 | 21 | def get_best_thread(self, question, tag_name): 22 | """ Returns id of the most similar thread for the question. 23 | The search is performed across the threads with a given tag. 24 | """ 25 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 26 | 27 | # HINT: you have already implemented a similar routine in the 3rd assignment. 28 | 29 | question_vec = question_to_vec( 30 | question, self.word_embeddings, self.embeddings_dim).reshape(-1,self.embeddings_dim) 31 | 32 | 33 | best_thread = pairwise_distances_argmin( 34 | question_vec, thread_embeddings, metric = "cosine")[0] 35 | 36 | return thread_ids.values[best_thread] 37 | 38 | 39 | class DialogueManager(object): 40 | def __init__(self, paths): 41 | print("Loading resources...") 42 | self.sess=load_model() 43 | 44 | # Intent recognition: 45 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 46 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 47 | 48 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 49 | 50 | # Goal-oriented part: 51 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 52 | self.thread_ranker = ThreadRanker(paths) 53 | 54 | def create_chitchat_bot(self): 55 | """Initializes self.chitchat_bot with some conversational model.""" 56 | 57 | # Hint: you might want to create and train chatterbot.ChatBot here. 58 | # It could be done by creating ChatBot with the *trainer* parameter equals 59 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 60 | # and then calling *train* function with "chatterbot.corpus.english" param 61 | 62 | self.chitchat_bot = ChatBot('Botty McBotFace', 63 | trainer='chatterbot.trainers.ChatterBotCorpusTrainer') 64 | 65 | # Train based on the english corpus 66 | self.chitchat_bot.train("chatterbot.corpus.english") 67 | 68 | def generate_answer(self, question): 69 | """Combines stackoverflow and chitchat parts using intent recognition.""" 70 | 71 | # Recognize intent of the question using `intent_recognizer`. 72 | # Don't forget to prepare question and calculate features for the question. 73 | 74 | prepared_question = text_prepare(question) 75 | 76 | features = self.tfidf_vectorizer.transform([prepared_question]) 77 | 78 | intent = self.intent_recognizer.predict(features)[0] 79 | 80 | # Chit-chat part: 81 | if intent == 'dialogue': 82 | # Pass question to chitchat_bot to generate a response. 83 | response = reply("hi",word2id,max_len,id2word,sess) 84 | return response 85 | 86 | # Goal-oriented part: 87 | else: 88 | # Pass features to tag_classifier to get predictions. 89 | tag = self.tag_classifier.predict(features) 90 | 91 | 92 | # Pass prepared_question to thread_ranker to get predictions. 93 | thread_id = self.thread_ranker.get_best_thread(prepared_question,tag[0]) 94 | 95 | return self.ANSWER_TEMPLATE % (tag[0], thread_id) 96 | -------------------------------------------------------------------------------- /natural-language-processing/honor/download_cornell.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/cornell 4 | cd data/cornell 5 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt 6 | wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt 7 | -------------------------------------------------------------------------------- /natural-language-processing/honor/download_opensubs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p data/opensubs 4 | cd data/opensubs 5 | wget -O en.tar.gz http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz 6 | tar -xf en.tar.gz 7 | rm en.tar.gz 8 | -------------------------------------------------------------------------------- /natural-language-processing/honor/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datasets 4 | import argparse 5 | import os 6 | 7 | def main(): 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.") 10 | parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.") 11 | args = parser.parse_args() 12 | 13 | dataset_path = os.path.join("data", args.dataset) 14 | if args.dataset == "cornell": 15 | data = datasets.readCornellData(dataset_path, max_len=args.max_len) 16 | elif args.dataset == "opensubs": 17 | data = datasets.readOpensubsData(dataset_path, max_len=args.max_len) 18 | else: 19 | raise ValueError("Unrecognized dataset: {!r}".format(args.dataset)) 20 | 21 | print("Size of dataset: {}".format(len(data))) 22 | print("First 10 training pairs:") 23 | for item in data[:10]: 24 | print(item) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /natural-language-processing/honor/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | import tensorflow as tf 9 | from requests.compat import urljoin 10 | import dialogue_manager 11 | from utils import * 12 | 13 | 14 | 15 | from dialogue_manager import DialogueManager 16 | 17 | class BotHandler(object): 18 | """ 19 | BotHandler is a class which implements all back-end of the bot. 20 | It has tree main functions: 21 | 'get_updates' — checks for new messages 22 | 'send_message' – posts new message to user 23 | 'get_answer' — computes the most relevant on a user's question 24 | """ 25 | 26 | def __init__(self, token, dialogue_manager): 27 | self.token = token 28 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 29 | self.dialogue_manager = dialogue_manager 30 | 31 | def get_updates(self, offset=None, timeout=30): 32 | params = {"timeout": timeout, "offset": offset} 33 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 34 | try: 35 | resp = raw_resp.json() 36 | except json.decoder.JSONDecodeError as e: 37 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 38 | return [] 39 | 40 | if "result" not in resp: 41 | return [] 42 | return resp["result"] 43 | 44 | def send_message(self, chat_id, text): 45 | params = {"chat_id": chat_id, "text": text} 46 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 47 | 48 | def get_answer(self, question): 49 | if question == '/start': 50 | return "Hi, I am your project bot. How can I help you today?" 51 | return self.dialogue_manager.generate_answer(question) 52 | 53 | 54 | def parse_args(): 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument('--token', type=str, default='') 57 | return parser.parse_args() 58 | 59 | 60 | def is_unicode(text): 61 | return len(text) == len(text.encode()) 62 | 63 | 64 | class SimpleDialogueManager(object): 65 | """ 66 | This is the simplest dialogue manager to test the telegram bot. 67 | Your task is to create a more advanced one in dialogue_manager.py." 68 | """ 69 | 70 | def generate_answer(self, question): 71 | return "Hello, world!" 72 | 73 | 74 | def main(): 75 | args = parse_args() 76 | token = args.token 77 | 78 | if not token: 79 | if not "TELEGRAM_TOKEN" in os.environ: 80 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 81 | return 82 | token = os.environ["TELEGRAM_TOKEN"] 83 | 84 | ################################################################# 85 | 86 | # Your task is to complete dialogue_manager.py and use your 87 | # advanced DialogueManager instead of SimpleDialogueManager. 88 | 89 | # This is the point where you plug it into the Telegram bot. 90 | # Do not forget to import all needed dependencies when you do so. 91 | 92 | # simple_manager = SimpleDialogueManager() 93 | advanced_manager = DialogueManager(RESOURCE_PATH) 94 | advanced_manager.create_chitchat_bot() 95 | bot = BotHandler(token, advanced_manager) 96 | 97 | ############################################################### 98 | 99 | print("Ready to talk!") 100 | offset = 0 101 | while True: 102 | updates = bot.get_updates(offset=offset) 103 | for update in updates: 104 | print("An update received.") 105 | if "message" in update: 106 | chat_id = update["message"]["chat"]["id"] 107 | if "text" in update["message"]: 108 | text = update["message"]["text"] 109 | if is_unicode(text): 110 | print("Update content: {}".format(update)) 111 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 112 | else: 113 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 114 | offset = max(offset, update['update_id'] + 1) 115 | time.sleep(1) 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /natural-language-processing/honor/tfmodel.py: -------------------------------------------------------------------------------- 1 | end_symbol = '$' 2 | padding_symbol = '#' 3 | start_symbol="^" 4 | word2id = {symbol:i for i, symbol in enumerate('^$#abcdefghijklmnopqrstuvwxyz 0123456789+-')} 5 | id2word = {i:symbol for symbol, i in word2id.items()} 6 | max_len = 30 7 | def load_model(): 8 | sess = tf.Session() 9 | new_saver = tf.train.import_meta_graph('model/chatbot_model.meta') 10 | new_saver.restore(sess, tf.train.latest_checkpoint('model/')) 11 | sess.run(tf.local_variables_initializer()) 12 | return sess 13 | def reply(question, word2id, max_len, id2word, session): 14 | input_batch = tf.get_default_graph().get_tensor_by_name("input_batch:0") 15 | input_batch_len = tf.get_default_graph().get_tensor_by_name("input_batch_lengths:0") 16 | infer_predictions = tf.get_default_graph().get_tensor_by_name( 17 | "decode_1/decoder/transpose_1:0") 18 | 19 | question = text_prepare(question) 20 | ids, ids_len = sentence_to_ids(question, word2id, padded_len=max_len) 21 | ids = np.array(ids).reshape(1, len(ids)) 22 | 23 | ids_len = np.array(ids_len).reshape(1) 24 | predictions = session.run([ 25 | infer_predictions 26 | ], feed_dict={input_batch: ids, input_batch_len: ids_len})[0] 27 | return "".join(ids_to_sentence(predictions[0], id2word)).replace("$", "").capitalize() 28 | -------------------------------------------------------------------------------- /natural-language-processing/honor/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | import pandas as pd 6 | 7 | nltk.download('stopwords') 8 | from nltk.corpus import stopwords 9 | 10 | # Paths for all resources for the bot. 11 | RESOURCE_PATH = { 12 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 13 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 14 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 15 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 16 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 17 | } 18 | 19 | 20 | def text_prepare(text): 21 | """Performs tokenization and simple preprocessing.""" 22 | 23 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 24 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 25 | stopwords_set = set(stopwords.words('english')) 26 | 27 | text = text.lower() 28 | text = replace_by_space_re.sub(' ', text) 29 | text = bad_symbols_re.sub('', text) 30 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 31 | 32 | return text.strip() 33 | 34 | 35 | def load_embeddings(embeddings_path): 36 | """Loads pre-trained word embeddings from tsv file. 37 | 38 | Args: 39 | embeddings_path - path to the embeddings file. 40 | 41 | Returns: 42 | embeddings - dict mapping words to vectors; 43 | embeddings_dim - dimension of the vectors. 44 | """ 45 | 46 | # Hint: you have already implemented a similar routine in the 3rd assignment. 47 | # Note that here you also need to know the dimension of the loaded embeddings. 48 | 49 | embeds = pd.read_csv(embeddings_path,sep="\t",header=None) 50 | vals=embeds.iloc[:,1:].values 51 | index=embeds.iloc[:,0].values 52 | embeddings= {i:j for i,j in zip(index,vals)} 53 | return embeddings,vals.shape[1] 54 | 55 | 56 | def question_to_vec(question, embeddings, dim): 57 | """Transforms a string to an embedding by averaging word embeddings.""" 58 | 59 | # Hint: you have already implemented exactly this function in the 3rd assignment. 60 | 61 | if question == "": 62 | return np.zeros(dim) 63 | t = np.array([embeddings[i] 64 | for i in question.split() if i in embeddings.keys()]) 65 | if len(t) == 0: 66 | return np.zeros(dim) 67 | 68 | return(t.mean(axis=0)) 69 | 70 | 71 | def unpickle_file(filename): 72 | """Returns the result of unpickling the file content.""" 73 | with open(filename, 'rb') as f: 74 | return pickle.load(f) 75 | def sentence_to_ids(sentence, word2id, padded_len): 76 | """ Converts a sequence of symbols to a padded sequence of their ids. 77 | 78 | sentence: a string, input/output sequence of symbols. 79 | word2id: a dict, a mapping from original symbols to ids. 80 | padded_len: an integer, a desirable length of the sequence. 81 | 82 | result: a tuple of (a list of ids, an actual length of sentence). 83 | """ 84 | 85 | sent_ids = [word2id[i] for i in sentence] 86 | sent_len = len(sent_ids[:padded_len-1])+1 87 | sent_ids = sent_ids[:padded_len-1]+[word2id["$"]]+[word2id["#"]]*(padded_len-len(sent_ids)-1) 88 | 89 | return (sent_ids, sent_len) 90 | def ids_to_sentence(ids, id2word): 91 | """ Converts a sequence of ids to a sequence of symbols. 92 | 93 | ids: a list, indices for the padded sequence. 94 | id2word: a dict, a mapping from ids to original symbols. 95 | 96 | result: a list of symbols. 97 | """ 98 | 99 | return [id2word[i] for i in ids] 100 | def batch_to_ids(sentences, word2id, max_len): 101 | """Prepares batches of indices. 102 | 103 | Sequences are padded to match the longest sequence in the batch, 104 | if it's longer than max_len, then max_len is used instead. 105 | 106 | sentences: a list of strings, original sequences. 107 | word2id: a dict, a mapping from original symbols to ids. 108 | max_len: an integer, max len of sequences allowed. 109 | 110 | result: a list of lists of ids, a list of actual lengths. 111 | """ 112 | 113 | max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len) 114 | batch_ids, batch_ids_len = [], [] 115 | for sentence in sentences: 116 | ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch) 117 | batch_ids.append(ids) 118 | batch_ids_len.append(ids_len) 119 | return batch_ids, batch_ids_len 120 | def generate_batches(samples, batch_size=64): 121 | X, Y = [], [] 122 | for i, (x, y) in enumerate(samples, 1): 123 | X.append(x) 124 | Y.append(y) 125 | if i % batch_size == 0: 126 | yield X, Y 127 | X, Y = [], [] 128 | if X and Y: 129 | yield X, Y 130 | def reply(question,word2id,max_len,model,id2word): 131 | 132 | ids, ids_len = sentence_to_ids(question,word2id,padded_len=max_len) 133 | ids=np.array(ids).reshape(1,len(ids)) 134 | 135 | ids_len=np.array(ids_len).reshape(1) 136 | predictions = model.predict_for_batch(session, ids, ids_len) 137 | return "".join(ids_to_sentence(predictions[0], id2word)).replace("$","").capitalize() 138 | -------------------------------------------------------------------------------- /natural-language-processing/project/dialogue_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin 3 | 4 | from chatterbot import ChatBot 5 | from utils import * 6 | 7 | 8 | class ThreadRanker(object): 9 | def __init__(self, paths): 10 | self.word_embeddings, self.embeddings_dim = load_embeddings( 11 | paths['WORD_EMBEDDINGS']) 12 | self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER'] 13 | 14 | def __load_embeddings_by_tag(self, tag_name): 15 | embeddings_path = os.path.join( 16 | self.thread_embeddings_folder, tag_name + ".pkl") 17 | thread_ids, thread_embeddings = unpickle_file(embeddings_path) 18 | return thread_ids, thread_embeddings 19 | 20 | def get_best_thread(self, question, tag_name): 21 | """ Returns id of the most similar thread for the question. 22 | The search is performed across the threads with a given tag. 23 | """ 24 | thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name) 25 | 26 | # HINT: you have already implemented a similar routine in the 3rd assignment. 27 | 28 | question_vec = question_to_vec( 29 | question, self.word_embeddings, self.embeddings_dim).reshape(-1,self.embeddings_dim) 30 | 31 | 32 | best_thread = pairwise_distances_argmin( 33 | question_vec, thread_embeddings, metric = "cosine")[0] 34 | 35 | return thread_ids.values[best_thread] 36 | 37 | 38 | class DialogueManager(object): 39 | def __init__(self, paths): 40 | print("Loading resources...") 41 | 42 | # Intent recognition: 43 | self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER']) 44 | self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER']) 45 | 46 | self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' 47 | 48 | # Goal-oriented part: 49 | self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER']) 50 | self.thread_ranker = ThreadRanker(paths) 51 | 52 | def create_chitchat_bot(self): 53 | """Initializes self.chitchat_bot with some conversational model.""" 54 | 55 | # Hint: you might want to create and train chatterbot.ChatBot here. 56 | # It could be done by creating ChatBot with the *trainer* parameter equals 57 | # "chatterbot.trainers.ChatterBotCorpusTrainer" 58 | # and then calling *train* function with "chatterbot.corpus.english" param 59 | 60 | self.chitchat_bot = ChatBot('Botty McBotFace', 61 | trainer='chatterbot.trainers.ChatterBotCorpusTrainer') 62 | 63 | # Train based on the english corpus 64 | self.chitchat_bot.train("chatterbot.corpus.english") 65 | 66 | def generate_answer(self, question): 67 | """Combines stackoverflow and chitchat parts using intent recognition.""" 68 | 69 | # Recognize intent of the question using `intent_recognizer`. 70 | # Don't forget to prepare question and calculate features for the question. 71 | 72 | prepared_question = text_prepare(question) 73 | 74 | features = self.tfidf_vectorizer.transform([prepared_question]) 75 | 76 | intent = self.intent_recognizer.predict(features)[0] 77 | 78 | # Chit-chat part: 79 | if intent == 'dialogue': 80 | # Pass question to chitchat_bot to generate a response. 81 | response = self.chitchat_bot.get_response(question) 82 | return response 83 | 84 | # Goal-oriented part: 85 | else: 86 | # Pass features to tag_classifier to get predictions. 87 | tag = self.tag_classifier.predict(features) 88 | 89 | 90 | # Pass prepared_question to thread_ranker to get predictions. 91 | thread_id = self.thread_ranker.get_best_thread(prepared_question,tag[0]) 92 | 93 | return self.ANSWER_TEMPLATE % (tag[0], thread_id) 94 | -------------------------------------------------------------------------------- /natural-language-processing/project/main_bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import requests 4 | import time 5 | import argparse 6 | import os 7 | import json 8 | 9 | from requests.compat import urljoin 10 | import dialogue_manager 11 | 12 | from dialogue_manager import DialogueManager 13 | from utils import * 14 | class BotHandler(object): 15 | """ 16 | BotHandler is a class which implements all back-end of the bot. 17 | It has tree main functions: 18 | 'get_updates' — checks for new messages 19 | 'send_message' – posts new message to user 20 | 'get_answer' — computes the most relevant on a user's question 21 | """ 22 | 23 | def __init__(self, token, dialogue_manager): 24 | self.token = token 25 | self.api_url = "https://api.telegram.org/bot{}/".format(token) 26 | self.dialogue_manager = dialogue_manager 27 | 28 | def get_updates(self, offset=None, timeout=30): 29 | params = {"timeout": timeout, "offset": offset} 30 | raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params) 31 | try: 32 | resp = raw_resp.json() 33 | except json.decoder.JSONDecodeError as e: 34 | print("Failed to parse response {}: {}.".format(raw_resp.content, e)) 35 | return [] 36 | 37 | if "result" not in resp: 38 | return [] 39 | return resp["result"] 40 | 41 | def send_message(self, chat_id, text): 42 | params = {"chat_id": chat_id, "text": text} 43 | return requests.post(urljoin(self.api_url, "sendMessage"), params) 44 | 45 | def get_answer(self, question): 46 | if question == '/start': 47 | return "Hi, I am your project bot. How can I help you today?" 48 | return self.dialogue_manager.generate_answer(question) 49 | 50 | 51 | def parse_args(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--token', type=str, default='') 54 | return parser.parse_args() 55 | 56 | 57 | def is_unicode(text): 58 | return len(text) == len(text.encode()) 59 | 60 | 61 | class SimpleDialogueManager(object): 62 | """ 63 | This is the simplest dialogue manager to test the telegram bot. 64 | Your task is to create a more advanced one in dialogue_manager.py." 65 | """ 66 | 67 | def generate_answer(self, question): 68 | return "Hello, world!" 69 | 70 | 71 | def main(): 72 | args = parse_args() 73 | token = args.token 74 | 75 | if not token: 76 | if not "TELEGRAM_TOKEN" in os.environ: 77 | print("Please, set bot token through --token or TELEGRAM_TOKEN env variable") 78 | return 79 | token = os.environ["TELEGRAM_TOKEN"] 80 | 81 | ################################################################# 82 | 83 | # Your task is to complete dialogue_manager.py and use your 84 | # advanced DialogueManager instead of SimpleDialogueManager. 85 | 86 | # This is the point where you plug it into the Telegram bot. 87 | # Do not forget to import all needed dependencies when you do so. 88 | 89 | # simple_manager = SimpleDialogueManager() 90 | advanced_manager = DialogueManager(RESOURCE_PATH) 91 | advanced_manager.create_chitchat_bot() 92 | bot = BotHandler(token, advanced_manager) 93 | 94 | ############################################################### 95 | 96 | print("Ready to talk!") 97 | offset = 0 98 | while True: 99 | updates = bot.get_updates(offset=offset) 100 | for update in updates: 101 | print("An update received.") 102 | if "message" in update: 103 | chat_id = update["message"]["chat"]["id"] 104 | if "text" in update["message"]: 105 | text = update["message"]["text"] 106 | if is_unicode(text): 107 | print("Update content: {}".format(update)) 108 | bot.send_message(chat_id, bot.get_answer(update["message"]["text"])) 109 | else: 110 | bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...") 111 | offset = max(offset, update['update_id'] + 1) 112 | time.sleep(1) 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /natural-language-processing/project/utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import pickle 3 | import re 4 | import numpy as np 5 | import pandas as pd 6 | 7 | nltk.download('stopwords') 8 | from nltk.corpus import stopwords 9 | 10 | # Paths for all resources for the bot. 11 | RESOURCE_PATH = { 12 | 'INTENT_RECOGNIZER': 'intent_recognizer.pkl', 13 | 'TAG_CLASSIFIER': 'tag_classifier.pkl', 14 | 'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl', 15 | 'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags', 16 | 'WORD_EMBEDDINGS': 'word_embeddings.tsv', 17 | } 18 | 19 | 20 | def text_prepare(text): 21 | """Performs tokenization and simple preprocessing.""" 22 | 23 | replace_by_space_re = re.compile('[/(){}\[\]\|@,;]') 24 | bad_symbols_re = re.compile('[^0-9a-z #+_]') 25 | stopwords_set = set(stopwords.words('english')) 26 | 27 | text = text.lower() 28 | text = replace_by_space_re.sub(' ', text) 29 | text = bad_symbols_re.sub('', text) 30 | text = ' '.join([x for x in text.split() if x and x not in stopwords_set]) 31 | 32 | return text.strip() 33 | 34 | 35 | def load_embeddings(embeddings_path): 36 | """Loads pre-trained word embeddings from tsv file. 37 | 38 | Args: 39 | embeddings_path - path to the embeddings file. 40 | 41 | Returns: 42 | embeddings - dict mapping words to vectors; 43 | embeddings_dim - dimension of the vectors. 44 | """ 45 | 46 | # Hint: you have already implemented a similar routine in the 3rd assignment. 47 | # Note that here you also need to know the dimension of the loaded embeddings. 48 | 49 | embeds = pd.read_csv(embeddings_path,sep="\t",header=None) 50 | vals=embeds.iloc[:,1:].values 51 | index=embeds.iloc[:,0].values 52 | embeddings= {i:j for i,j in zip(index,vals)} 53 | return embeddings,vals.shape[1] 54 | 55 | 56 | def question_to_vec(question, embeddings, dim): 57 | """Transforms a string to an embedding by averaging word embeddings.""" 58 | 59 | # Hint: you have already implemented exactly this function in the 3rd assignment. 60 | 61 | if question == "": 62 | return np.zeros(dim) 63 | t = np.array([embeddings[i] 64 | for i in question.split() if i in embeddings.keys()]) 65 | if len(t) == 0: 66 | return np.zeros(dim) 67 | 68 | return(t.mean(axis=0)) 69 | 70 | 71 | def unpickle_file(filename): 72 | """Returns the result of unpickling the file content.""" 73 | with open(filename, 'rb') as f: 74 | return pickle.load(f) 75 | -------------------------------------------------------------------------------- /natural-language-processing/week1/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g' 10 | self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 11 | ('hTrz8', 'WordsTagsCount'), 12 | ('0kUjR', 'BagOfWords'), 13 | ('tLJV1', 'MultilabelClassification')]) 14 | self.answers = {key: None for key in self.parts} 15 | 16 | @staticmethod 17 | def ravel_output(output): 18 | ''' 19 | If student accidentally submitted np.array with one 20 | element instead of number, this function will submit 21 | this number instead 22 | ''' 23 | if isinstance(output, np.ndarray) and output.size == 1: 24 | output = output.item(0) 25 | return output 26 | 27 | def submit(self, email, token): 28 | submission = { 29 | "assignmentKey": self.assignment_key, 30 | "submitterEmail": email, 31 | "secret": token, 32 | "parts": {} 33 | } 34 | for part, output in self.answers.items(): 35 | if output is not None: 36 | submission["parts"][part] = {"output": output} 37 | else: 38 | submission["parts"][part] = dict() 39 | request = requests.post(self.submission_page, data=json.dumps(submission)) 40 | response = request.json() 41 | if request.status_code == 201: 42 | print('Submitted to Coursera platform. See results on assignment page!') 43 | elif u'details' in response and u'learnerMessage' in response[u'details']: 44 | print(response[u'details'][u'learnerMessage']) 45 | else: 46 | print("Unknown response from Coursera: {}".format(request.status_code)) 47 | print(response) 48 | 49 | def status(self): 50 | print("You want to submit these parts:") 51 | for part_id, part_name in self.parts.items(): 52 | answer = self.answers[part_id] 53 | if answer is None: 54 | answer = '-'*10 55 | print("Task {}:\n {}".format(part_name, answer[:100] + '...')) 56 | 57 | def submit_part(self, part, output): 58 | self.answers[part] = output 59 | print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...')) 60 | 61 | def submit_tag(self, tag, output): 62 | part_id = [k for k, v in self.parts.items() if v == tag] 63 | if len(part_id) != 1: 64 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 65 | part_id = part_id[0] 66 | self.submit_part(part_id, str(self.ravel_output(output))) 67 | -------------------------------------------------------------------------------- /natural-language-processing/week1/metrics.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.metrics import roc_curve, auc 4 | from scipy import interp 5 | from itertools import cycle 6 | 7 | def roc_auc(y_test, y_score, n_classes): 8 | """Plots ROC curve for micro and macro averaging.""" 9 | 10 | # Compute ROC curve and ROC area for each class 11 | fpr = {} 12 | tpr = {} 13 | roc_auc = {} 14 | for i in range(n_classes): 15 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 16 | roc_auc[i] = auc(fpr[i], tpr[i]) 17 | 18 | # Compute micro-average ROC curve and ROC area 19 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 20 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 21 | 22 | # Compute macro-average ROC curve and ROC area 23 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 24 | mean_tpr = np.zeros_like(all_fpr) 25 | for i in range(n_classes): 26 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 27 | mean_tpr /= n_classes 28 | fpr["macro"] = all_fpr 29 | tpr["macro"] = mean_tpr 30 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 31 | 32 | # Plot all ROC curves 33 | plt.figure() 34 | plt.plot(fpr["micro"], tpr["micro"], 35 | label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), 36 | color='deeppink', linestyle=':', linewidth=4) 37 | 38 | plt.plot(fpr["macro"], tpr["macro"], 39 | label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), 40 | color='navy', linestyle=':', linewidth=4) 41 | 42 | colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) 43 | for i, color in zip(range(0,3), colors): 44 | plt.plot(fpr[i], tpr[i], color=color, lw=2, 45 | label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i])) 46 | 47 | plt.plot([0, 1], [0, 1], 'k--', lw=2) 48 | plt.xlim([0.0, 1.0]) 49 | plt.ylim([0.0, 1.05]) 50 | plt.xlabel('False Positive Rate') 51 | plt.ylabel('True Positive Rate') 52 | plt.title('Some extension of ROC to multi-class') 53 | plt.legend(loc="lower right") 54 | plt.show() -------------------------------------------------------------------------------- /natural-language-processing/week2/evaluation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False): 4 | if candidate == 'B-' + current_tag: 5 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 6 | current_chunk[-1].append(current_pos - 1) 7 | current_chunk.append([current_pos]) 8 | elif candidate == 'I-' + current_tag: 9 | if prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag): 10 | current_chunk.append([current_pos]) 11 | if not prediction and (current_pos == 0 or current_pos > 0 and prev == 'O'): 12 | current_chunk.append([current_pos]) 13 | elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag: 14 | if len(current_chunk) > 0: 15 | current_chunk[-1].append(current_pos - 1) 16 | 17 | def _update_last_chunk(current_chunk, current_pos): 18 | if len(current_chunk) > 0 and len(current_chunk[-1]) == 1: 19 | current_chunk[-1].append(current_pos - 1) 20 | 21 | def _tag_precision_recall_f1(tp, fp, fn): 22 | precision, recall, f1 = 0, 0, 0 23 | if tp + fp > 0: 24 | precision = tp / (tp + fp) * 100 25 | if tp + fn > 0: 26 | recall = tp / (tp + fn) * 100 27 | if precision + recall > 0: 28 | f1 = 2 * precision * recall / (precision + recall) 29 | return precision, recall, f1 30 | 31 | def _aggregate_metrics(results, total_correct): 32 | total_true_entities = 0 33 | total_predicted_entities = 0 34 | total_precision = 0 35 | total_recall = 0 36 | total_f1 = 0 37 | for tag, tag_metrics in results.items(): 38 | n_pred = tag_metrics['n_predicted_entities'] 39 | n_true = tag_metrics['n_true_entities'] 40 | total_true_entities += n_true 41 | total_predicted_entities += n_pred 42 | total_precision += tag_metrics['precision'] * n_pred 43 | total_recall += tag_metrics['recall'] * n_true 44 | 45 | accuracy = total_correct / total_true_entities * 100 46 | if total_predicted_entities > 0: 47 | total_precision = total_precision / total_predicted_entities 48 | total_recall = total_recall / total_true_entities 49 | if total_precision + total_recall > 0: 50 | total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall) 51 | return total_true_entities, total_predicted_entities, \ 52 | total_precision, total_recall, total_f1, accuracy 53 | 54 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct): 55 | print('processed {len} tokens ' \ 56 | 'with {tot_true} phrases; ' \ 57 | 'found: {tot_pred} phrases; ' \ 58 | 'correct: {tot_cor}.\n'.format(len=n_tokens, 59 | tot_true=total_true_entities, 60 | tot_pred=total_predicted_entities, 61 | tot_cor=total_correct)) 62 | 63 | def _print_metrics(accuracy, total_precision, total_recall, total_f1): 64 | print('precision: {tot_prec:.2f}%; ' \ 65 | 'recall: {tot_recall:.2f}%; ' \ 66 | 'F1: {tot_f1:.2f}\n'.format(acc=accuracy, 67 | tot_prec=total_precision, 68 | tot_recall=total_recall, 69 | tot_f1=total_f1)) 70 | 71 | def _print_tag_metrics(tag, tag_results): 72 | print(('\t%12s' % tag) + ': precision: {tot_prec:6.2f}%; ' \ 73 | 'recall: {tot_recall:6.2f}%; ' \ 74 | 'F1: {tot_f1:6.2f}; ' \ 75 | 'predicted: {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'], 76 | tot_recall=tag_results['recall'], 77 | tot_f1=tag_results['f1'], 78 | tot_predicted=tag_results['n_predicted_entities'])) 79 | 80 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False): 81 | # Find all tags 82 | tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O')) 83 | 84 | results = OrderedDict((tag, OrderedDict()) for tag in tags) 85 | n_tokens = len(y_true) 86 | total_correct = 0 87 | 88 | # For eval_conll_try we find all chunks in the ground truth and prediction 89 | # For each chunk we store starting and ending indices 90 | for tag in tags: 91 | true_chunk = list() 92 | predicted_chunk = list() 93 | for position in range(n_tokens): 94 | _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position) 95 | _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True) 96 | 97 | _update_last_chunk(true_chunk, position) 98 | _update_last_chunk(predicted_chunk, position) 99 | 100 | # Then we find all correctly classified intervals 101 | # True positive results 102 | tp = sum(chunk in predicted_chunk for chunk in true_chunk) 103 | total_correct += tp 104 | 105 | # And then just calculate errors of the first and second kind 106 | # False negative 107 | fn = len(true_chunk) - tp 108 | # False positive 109 | fp = len(predicted_chunk) - tp 110 | precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn) 111 | 112 | results[tag]['precision'] = precision 113 | results[tag]['recall'] = recall 114 | results[tag]['f1'] = f1 115 | results[tag]['n_predicted_entities'] = len(predicted_chunk) 116 | results[tag]['n_true_entities'] = len(true_chunk) 117 | 118 | total_true_entities, total_predicted_entities, \ 119 | total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct) 120 | 121 | if print_results: 122 | _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct) 123 | _print_metrics(accuracy, total_precision, total_recall, total_f1) 124 | 125 | if not short_report: 126 | for tag, tag_results in results.items(): 127 | _print_tag_metrics(tag, tag_results) 128 | return results 129 | -------------------------------------------------------------------------------- /natural-language-processing/week3/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | class Grader(object): 7 | def __init__(self): 8 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 9 | self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A' 10 | self.parts = OrderedDict([('98mDT', 'Question2Vec'), 11 | ('nc7RP', 'HitsCount'), 12 | ('bNp90', 'DCGScore'), 13 | ('3gRlQ', 'W2VTokenizedRanks'), 14 | ('mX6wS', 'StarSpaceRanks')]) 15 | self.answers = {key: None for key in self.parts} 16 | 17 | @staticmethod 18 | def ravel_output(output): 19 | ''' 20 | If student accidentally submitted np.array with one 21 | element instead of number, this function will submit 22 | this number instead 23 | ''' 24 | if isinstance(output, np.ndarray) and output.size == 1: 25 | output = output.item(0) 26 | return output 27 | 28 | def submit(self, email, token): 29 | submission = { 30 | "assignmentKey": self.assignment_key, 31 | "submitterEmail": email, 32 | "secret": token, 33 | "parts": {} 34 | } 35 | for part, output in self.answers.items(): 36 | if output is not None: 37 | submission["parts"][part] = {"output": output} 38 | else: 39 | submission["parts"][part] = dict() 40 | request = requests.post(self.submission_page, data=json.dumps(submission)) 41 | response = request.json() 42 | if request.status_code == 201: 43 | print('Submitted to Coursera platform. See results on assignment page!') 44 | elif u'details' in response and u'learnerMessage' in response[u'details']: 45 | print(response[u'details'][u'learnerMessage']) 46 | else: 47 | print("Unknown response from Coursera: {}".format(request.status_code)) 48 | print(response) 49 | 50 | def status(self): 51 | print("You want to submit these parts:") 52 | for part_id, part_name in self.parts.items(): 53 | answer = self.answers[part_id] 54 | if answer is None: 55 | answer = '-'*10 56 | print("Task {}: {}".format(part_name, answer[:100] + '...')) 57 | 58 | def submit_part(self, part, output): 59 | self.answers[part] = output 60 | print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...')) 61 | 62 | def submit_tag(self, tag, output): 63 | part_id = [k for k, v in self.parts.items() if v == tag] 64 | if len(part_id) != 1: 65 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 66 | part_id = part_id[0] 67 | self.submit_part(part_id, str(self.ravel_output(output))) 68 | -------------------------------------------------------------------------------- /natural-language-processing/week3/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.corpus import stopwords 3 | 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]') 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]') 6 | STOPWORDS = set(stopwords.words('english')) 7 | def text_prepare(text): 8 | text = text.lower() 9 | text = REPLACE_BY_SPACE_RE.sub(' ', text) 10 | text = GOOD_SYMBOLS_RE.sub('', text) 11 | text = ' '.join([x for x in text.split() if x and x not in STOPWORDS]) 12 | return text.strip() 13 | 14 | def array_to_string(arr): 15 | return '\n'.join(str(num) for num in arr) 16 | 17 | def matrix_to_string(matrix): 18 | return '\n'.join('\t'.join(str(num) for num in line) for line in matrix) -------------------------------------------------------------------------------- /natural-language-processing/week4/encoder-decoder-pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/natural-language-processing/week4/encoder-decoder-pic.png -------------------------------------------------------------------------------- /reinforcement-learning/.gitignore: -------------------------------------------------------------------------------- 1 | submit.py 2 | grading.py 3 | -------------------------------------------------------------------------------- /reinforcement-learning/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 10 | n_frames=4, dim_order='theano', reward_scale=1,): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height, width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | self.reward_scale = reward_scale 19 | 20 | n_channels = (3 * n_frames) if color else n_frames 21 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 22 | self.observation_space = Box(0.0, 1.0, obs_shape) 23 | self.framebuffer = np.zeros(obs_shape, 'float32') 24 | 25 | def reset(self): 26 | """resets breakout, returns initial frames""" 27 | self.framebuffer = np.zeros_like(self.framebuffer) 28 | self.update_buffer(self.env.reset()) 29 | return self.framebuffer 30 | 31 | def step(self,action): 32 | """plays breakout for 1 step, returns frame buffer""" 33 | new_img, reward, done, info = self.env.step(action) 34 | self.update_buffer(new_img) 35 | return self.framebuffer, reward * self.reward_scale, done, info 36 | 37 | ### image processing ### 38 | 39 | def update_buffer(self,img): 40 | img = self.preproc_image(img) 41 | offset = 3 if self.color else 1 42 | if self.dim_order == 'theano': 43 | axis = 0 44 | cropped_framebuffer = self.framebuffer[:-offset] 45 | else: 46 | axis = -1 47 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 48 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 49 | 50 | def preproc_image(self, img): 51 | """what happens to the observation""" 52 | img = self.crop(img) 53 | img = imresize(img, self.img_size) 54 | if not self.color: 55 | img = img.mean(-1, keepdims=True) 56 | if self.dim_order == 'theano': 57 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 58 | img = img.astype('float32') / 255. 59 | return img 60 | -------------------------------------------------------------------------------- /reinforcement-learning/framebuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.spaces.box import Box 3 | from gym.core import Wrapper 4 | class FrameBuffer(Wrapper): 5 | def __init__(self, env, n_frames=4, dim_order='tensorflow'): 6 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 7 | super(FrameBuffer, self).__init__(env) 8 | self.dim_order = dim_order 9 | if dim_order == 'tensorflow': 10 | height, width, n_channels = env.observation_space.shape 11 | obs_shape = [height, width, n_channels * n_frames] 12 | elif dim_order == 'pytorch': 13 | n_channels, height, width = env.observation_space.shape 14 | obs_shape = [n_channels * n_frames, height, width] 15 | else: 16 | raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order)) 17 | self.observation_space = Box(0.0, 1.0, obs_shape) 18 | self.framebuffer = np.zeros(obs_shape, 'float32') 19 | 20 | def reset(self): 21 | """resets breakout, returns initial frames""" 22 | self.framebuffer = np.zeros_like(self.framebuffer) 23 | self.update_buffer(self.env.reset()) 24 | return self.framebuffer 25 | 26 | def step(self, action): 27 | """plays breakout for 1 step, returns frame buffer""" 28 | new_img, reward, done, info = self.env.step(action) 29 | self.update_buffer(new_img) 30 | return self.framebuffer, reward, done, info 31 | 32 | def update_buffer(self, img): 33 | if self.dim_order == 'tensorflow': 34 | offset = self.env.observation_space.shape[-1] 35 | axis = -1 36 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 37 | elif self.dim_order == 'pytorch': 38 | offset = self.env.observation_space.shape[0] 39 | axis = 0 40 | cropped_framebuffer = self.framebuffer[:-offset] 41 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 42 | -------------------------------------------------------------------------------- /reinforcement-learning/mdp.py: -------------------------------------------------------------------------------- 1 | # most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/ 2 | # all creadit goes to https://github.com/abhishekunique (if i got the author right) 3 | import sys 4 | import random 5 | import numpy as np 6 | def weighted_choice(v, p): 7 | total = sum(p) 8 | r = random.uniform(0, total) 9 | upto = 0 10 | for c, w in zip(v,p): 11 | if upto + w >= r: 12 | return c 13 | upto += w 14 | assert False, "Shouldn't get here" 15 | 16 | class MDP: 17 | def __init__(self, transition_probs, rewards, initial_state=None): 18 | """ 19 | Defines an MDP. Compatible with gym Env. 20 | :param transition_probs: transition_probs[s][a][s_next] = P(s_next | s, a) 21 | A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> prob] 22 | For each state and action, probabilities of next states should sum to 1 23 | If a state has no actions available, it is considered terminal 24 | :param rewards: rewards[s][a][s_next] = r(s,a,s') 25 | A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> reward] 26 | The reward for anything not mentioned here is zero. 27 | :param get_initial_state: a state where agent starts or a callable() -> state 28 | By default, picks initial state at random. 29 | 30 | States and actions can be anything you can use as dict keys, but we recommend that you use strings or integers 31 | 32 | Here's an example from MDP depicted on http://bit.ly/2jrNHNr 33 | transition_probs = { 34 | 's0':{ 35 | 'a0': {'s0': 0.5, 's2': 0.5}, 36 | 'a1': {'s2': 1} 37 | }, 38 | 's1':{ 39 | 'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2}, 40 | 'a1': {'s1': 0.95, 's2': 0.05} 41 | }, 42 | 's2':{ 43 | 'a0': {'s0': 0.4, 's1': 0.6}, 44 | 'a1': {'s0': 0.3, 's1': 0.3, 's2':0.4} 45 | } 46 | } 47 | rewards = { 48 | 's1': {'a0': {'s0': +5}}, 49 | 's2': {'a1': {'s0': -1}} 50 | } 51 | """ 52 | self._check_param_consistency(transition_probs, rewards) 53 | self._transition_probs = transition_probs 54 | self._rewards = rewards 55 | self._initial_state = initial_state 56 | self.n_states = len(transition_probs) 57 | self.reset() 58 | 59 | def get_all_states(self): 60 | """ return a tuple of all possiblestates """ 61 | return tuple(self._transition_probs.keys()) 62 | 63 | def get_possible_actions(self, state): 64 | """ return a tuple of possible actions in a given state """ 65 | return tuple(self._transition_probs.get(state, {}).keys()) 66 | 67 | def is_terminal(self, state): 68 | """ return True if state is terminal or False if it isn't """ 69 | return len(self.get_possible_actions(state)) == 0 70 | 71 | def get_next_states(self, state, action): 72 | """ return a dictionary of {next_state1 : P(next_state1 | state, action), next_state2: ...} """ 73 | assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state) 74 | return self._transition_probs[state][action] 75 | 76 | def get_transition_prob(self, state, action, next_state): 77 | """ return P(next_state | state, action) """ 78 | return self.get_next_states(state, action).get(next_state, 0.0) 79 | 80 | def get_reward(self, state, action, next_state): 81 | """ return the reward you get for taking action in state and landing on next_state""" 82 | assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state) 83 | return self._rewards.get(state, {}).get(action, {}).get(next_state, 0.0) 84 | 85 | def reset(self): 86 | """ reset the game, return the initial state""" 87 | if self._initial_state is None: 88 | self._current_state = random.choice(tuple(self._transition_probs.keys())) 89 | elif self._initial_state in self._transition_probs: 90 | self._current_state = self._initial_state 91 | elif callable(self._initial_state): 92 | self._current_state = self._initial_state() 93 | else: 94 | raise ValueError("initial state %s should be either a state or a function() -> state" % self._initial_state) 95 | return self._current_state 96 | 97 | def step(self, action): 98 | """ take action, return next_state, reward, is_done, empty_info """ 99 | possible_states, probs = zip(*self.get_next_states(self._current_state, action).items()) 100 | next_state = weighted_choice(possible_states, p=probs) 101 | reward = self.get_reward(self._current_state, action, next_state) 102 | is_done = self.is_terminal(next_state) 103 | self._current_state = next_state 104 | return next_state, reward, is_done, {} 105 | 106 | def render(self): 107 | print("Currently at %s" % self._current_state) 108 | 109 | def _check_param_consistency(self, transition_probs, rewards): 110 | for state in transition_probs: 111 | assert isinstance(transition_probs[state], dict), "transition_probs for %s should be a dictionary " \ 112 | "but is instead %s" % ( 113 | state, type(transition_probs[state])) 114 | for action in transition_probs[state]: 115 | assert isinstance(transition_probs[state][action], dict), "transition_probs for %s, %s should be a " \ 116 | "a dictionary but is instead %s" % ( 117 | state, action, 118 | type(transition_probs[state, action])) 119 | next_state_probs = transition_probs[state][action] 120 | assert len(next_state_probs) != 0, "from state %s action %s leads to no next states" % (state, action) 121 | sum_probs = sum(next_state_probs.values()) 122 | assert abs(sum_probs - 1) <= 1e-10, "next state probabilities for state %s action %s " \ 123 | "add up to %f (should be 1)" % (state, action, sum_probs) 124 | for state in rewards: 125 | assert isinstance(rewards[state], dict), "rewards for %s should be a dictionary " \ 126 | "but is instead %s" % (state, type(transition_probs[state])) 127 | for action in rewards[state]: 128 | assert isinstance(rewards[state][action], dict), "rewards for %s, %s should be a " \ 129 | "a dictionary but is instead %s" % ( 130 | state, action, type(transition_probs[state, action])) 131 | msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \ 132 | " you will be sent at the first sign of defiance. " 133 | assert None not in transition_probs, "please do not use None as a state identifier. " + msg 134 | assert None not in rewards, "please do not use None as an action identifier. " + msg 135 | 136 | class FrozenLakeEnv(MDP): 137 | """ 138 | Winter is here. You and your friends were tossing around a frisbee at the park 139 | when you made a wild throw that left the frisbee out in the middle of the lake. 140 | The water is mostly frozen, but there are a few holes where the ice has melted. 141 | If you step into one of those holes, you'll fall into the freezing water. 142 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 143 | you navigate across the lake and retrieve the disc. 144 | However, the ice is slippery, so you won't always move in the direction you intend. 145 | The surface is described using a grid like the following 146 | 147 | SFFF 148 | FHFH 149 | FFFH 150 | HFFG 151 | 152 | S : starting point, safe 153 | F : frozen surface, safe 154 | H : hole, fall to your doom 155 | G : goal, where the frisbee is located 156 | 157 | The episode ends when you reach the goal or fall in a hole. 158 | You receive a reward of 1 if you reach the goal, and zero otherwise. 159 | 160 | """ 161 | 162 | MAPS = { 163 | "4x4": [ 164 | "SFFF", 165 | "FHFH", 166 | "FFFH", 167 | "HFFG" 168 | ], 169 | "8x8": [ 170 | "SFFFFFFF", 171 | "FFFFFFFF", 172 | "FFFHFFFF", 173 | "FFFFFHFF", 174 | "FFFHFFFF", 175 | "FHHFFFHF", 176 | "FHFFHFHF", 177 | "FFFHFFFG" 178 | ], 179 | } 180 | 181 | 182 | def __init__(self, desc=None, map_name="4x4", slip_chance=0.2): 183 | if desc is None and map_name is None: 184 | raise ValueError('Must provide either desc or map_name') 185 | elif desc is None: 186 | desc = self.MAPS[map_name] 187 | assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state" 188 | assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G" 189 | 190 | self.desc = desc = np.asarray(list(map(list,desc)),dtype='str') 191 | self.lastaction = None 192 | 193 | nrow, ncol = desc.shape 194 | states = [(i, j) for i in range(nrow) for j in range(ncol)] 195 | actions = ["left","down","right","up"] 196 | 197 | initial_state = states[np.array(desc == b'S').ravel().argmax()] 198 | 199 | def move(row, col, movement): 200 | if movement== 'left': 201 | col = max(col-1,0) 202 | elif movement== 'down': 203 | row = min(row+1,nrow-1) 204 | elif movement== 'right': 205 | col = min(col+1,ncol-1) 206 | elif movement== 'up': 207 | row = max(row-1,0) 208 | else: 209 | raise("invalid action") 210 | return (row, col) 211 | 212 | transition_probs = {s : {} for s in states} 213 | rewards = {s : {} for s in states} 214 | for (row,col) in states: 215 | if desc[row, col] in "GH": continue 216 | for action_i in range(len(actions)): 217 | action = actions[action_i] 218 | transition_probs[(row, col)][action] = {} 219 | rewards[(row, col)][action] = {} 220 | for movement_i in [(action_i - 1) % len(actions), action_i, (action_i + 1) % len(actions)]: 221 | movement = actions[movement_i] 222 | newrow, newcol = move(row, col, movement) 223 | prob = (1. - slip_chance) if movement == action else (slip_chance / 2.) 224 | if prob == 0: continue 225 | if (newrow, newcol) not in transition_probs[row,col][action]: 226 | transition_probs[row,col][action][newrow, newcol] = prob 227 | else: 228 | transition_probs[row, col][action][newrow, newcol] += prob 229 | if desc[newrow, newcol] == 'G': 230 | rewards[row,col][action][newrow, newcol] = 1.0 231 | 232 | MDP.__init__(self, transition_probs, rewards, initial_state) 233 | 234 | def render(self): 235 | desc_copy = np.copy(self.desc) 236 | desc_copy[self._current_state] = '*' 237 | print('\n'.join(map(''.join,desc_copy)), end='\n\n') 238 | 239 | 240 | -------------------------------------------------------------------------------- /reinforcement-learning/qlearning.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import random, math 3 | import numpy as np 4 | 5 | class QLearningAgent: 6 | def __init__(self, alpha, epsilon, discount, get_legal_actions): 7 | """ 8 | Q-Learning Agent 9 | based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html 10 | Instance variables you have access to 11 | - self.epsilon (exploration prob) 12 | - self.alpha (learning rate) 13 | - self.discount (discount rate aka gamma) 14 | 15 | Functions you should use 16 | - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} 17 | which returns legal actions for a state 18 | - self.get_qvalue(state,action) 19 | which returns Q(state,action) 20 | - self.set_qvalue(state,action,value) 21 | which sets Q(state,action) := value 22 | 23 | !!!Important!!! 24 | Note: please avoid using self._qValues directly. 25 | There's a special self.get_qvalue/set_qvalue for that. 26 | """ 27 | 28 | self.get_legal_actions = get_legal_actions 29 | self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) 30 | self.alpha = alpha 31 | self.epsilon = epsilon 32 | self.discount = discount 33 | 34 | def get_qvalue(self, state, action): 35 | """ Returns Q(state,action) """ 36 | return self._qvalues[state][action] 37 | 38 | def set_qvalue(self,state,action,value): 39 | """ Sets the Qvalue for [state,action] to the given value """ 40 | self._qvalues[state][action] = value 41 | 42 | #---------------------START OF YOUR CODE---------------------# 43 | 44 | def get_value(self, state): 45 | """ 46 | Compute your agent's estimate of V(s) using current q-values 47 | V(s) = max_over_action Q(state,action) over possible actions. 48 | Note: please take into account that q-values can be negative. 49 | """ 50 | possible_actions = self.get_legal_actions(state) 51 | 52 | 53 | #If there are no legal actions, return 0.0 54 | if len(possible_actions) == 0: 55 | return 0.0 56 | 57 | value=max([self.get_qvalue(state,action) for action in possible_actions]) 58 | 59 | return value 60 | 61 | def update(self, state, action, reward, next_state): 62 | """ 63 | You should do your Q-Value update here: 64 | Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) 65 | """ 66 | 67 | #agent parameters 68 | gamma = self.discount 69 | learning_rate = self.alpha 70 | 71 | new_q = ((1-learning_rate)*self.get_qvalue(state,action)+ 72 | learning_rate*(reward+gamma*self.get_value(next_state)) 73 | ) 74 | 75 | self.set_qvalue(state, action, new_q) 76 | 77 | 78 | def get_best_action(self, state): 79 | """ 80 | Compute the best action to take in a state (using current q-values). 81 | """ 82 | possible_actions = self.get_legal_actions(state) 83 | 84 | #If there are no legal actions, return None 85 | if len(possible_actions) == 0: 86 | return None 87 | 88 | best_action = np.argmax([self.get_qvalue(state, action) for action in possible_actions]) 89 | 90 | return best_action 91 | 92 | def get_action(self, state): 93 | """ 94 | Compute the action to take in the current state, including exploration. 95 | With probability self.epsilon, we should take a random action. 96 | otherwise - the best policy action (self.getPolicy). 97 | 98 | Note: To pick randomly from a list, use random.choice(list). 99 | To pick True or False with a given probablity, generate uniform number in [0, 1] 100 | and compare it with your probability 101 | """ 102 | 103 | # Pick Action 104 | possible_actions = self.get_legal_actions(state) 105 | action = None 106 | 107 | #If there are no legal actions, return None 108 | if len(possible_actions) == 0: 109 | return None 110 | 111 | #agent parameters: 112 | epsilon = self.epsilon 113 | 114 | explore_random = np.random.choice([True,False],p=[epsilon,1-epsilon]) 115 | if explore_random: 116 | chosen_action = random.choice(possible_actions) 117 | else: 118 | chosen_action = self.get_best_action(state) 119 | 120 | return chosen_action -------------------------------------------------------------------------------- /reinforcement-learning/replay_buffer.py: -------------------------------------------------------------------------------- 1 | # This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 2 | import numpy as np 3 | import random 4 | 5 | class ReplayBuffer(object): 6 | def __init__(self, size): 7 | """Create Replay buffer. 8 | Parameters 9 | ---------- 10 | size: int 11 | Max number of transitions to store in the buffer. When the buffer 12 | overflows the old memories are dropped. 13 | """ 14 | self._storage = [] 15 | self._maxsize = size 16 | self._next_idx = 0 17 | 18 | def __len__(self): 19 | return len(self._storage) 20 | 21 | def add(self, obs_t, action, reward, obs_tp1, done): 22 | data = (obs_t, action, reward, obs_tp1, done) 23 | 24 | if self._next_idx >= len(self._storage): 25 | self._storage.append(data) 26 | else: 27 | self._storage[self._next_idx] = data 28 | self._next_idx = (self._next_idx + 1) % self._maxsize 29 | 30 | def _encode_sample(self, idxes): 31 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 32 | for i in idxes: 33 | data = self._storage[i] 34 | obs_t, action, reward, obs_tp1, done = data 35 | obses_t.append(np.array(obs_t, copy=False)) 36 | actions.append(np.array(action, copy=False)) 37 | rewards.append(reward) 38 | obses_tp1.append(np.array(obs_tp1, copy=False)) 39 | dones.append(done) 40 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 41 | 42 | def sample(self, batch_size): 43 | """Sample a batch of experiences. 44 | Parameters 45 | ---------- 46 | batch_size: int 47 | How many transitions to sample. 48 | Returns 49 | ------- 50 | obs_batch: np.array 51 | batch of observations 52 | act_batch: np.array 53 | batch of actions executed given obs_batch 54 | rew_batch: np.array 55 | rewards received as results of executing act_batch 56 | next_obs_batch: np.array 57 | next set of observations seen after executing act_batch 58 | done_mask: np.array 59 | done_mask[i] = 1 if executing act_batch[i] resulted in 60 | the end of an episode and 0 otherwise. 61 | """ 62 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 63 | return self._encode_sample(idxes) 64 | -------------------------------------------------------------------------------- /reinforcement-learning/week5/atari_util.py: -------------------------------------------------------------------------------- 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient""" 2 | import numpy as np 3 | import gym 4 | from scipy.misc import imresize 5 | from gym.core import Wrapper 6 | from gym.spaces.box import Box 7 | 8 | class PreprocessAtari(Wrapper): 9 | def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 10 | n_frames=4, dim_order='theano', reward_scale=1,): 11 | """A gym wrapper that reshapes, crops and scales image into the desired shapes""" 12 | super(PreprocessAtari, self).__init__(env) 13 | assert dim_order in ('theano', 'tensorflow') 14 | self.img_size = (height, width) 15 | self.crop=crop 16 | self.color=color 17 | self.dim_order = dim_order 18 | self.reward_scale = reward_scale 19 | 20 | n_channels = (3 * n_frames) if color else n_frames 21 | obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels] 22 | self.observation_space = Box(0.0, 1.0, obs_shape) 23 | self.framebuffer = np.zeros(obs_shape, 'float32') 24 | 25 | def reset(self): 26 | """resets breakout, returns initial frames""" 27 | self.framebuffer = np.zeros_like(self.framebuffer) 28 | self.update_buffer(self.env.reset()) 29 | return self.framebuffer 30 | 31 | def step(self,action): 32 | """plays breakout for 1 step, returns frame buffer""" 33 | new_img, reward, done, info = self.env.step(action) 34 | self.update_buffer(new_img) 35 | return self.framebuffer, reward * self.reward_scale, done, info 36 | 37 | ### image processing ### 38 | 39 | def update_buffer(self,img): 40 | img = self.preproc_image(img) 41 | offset = 3 if self.color else 1 42 | if self.dim_order == 'theano': 43 | axis = 0 44 | cropped_framebuffer = self.framebuffer[:-offset] 45 | else: 46 | axis = -1 47 | cropped_framebuffer = self.framebuffer[:,:,:-offset] 48 | self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis) 49 | 50 | def preproc_image(self, img): 51 | """what happens to the observation""" 52 | img = self.crop(img) 53 | img = imresize(img, self.img_size) 54 | if not self.color: 55 | img = img.mean(-1, keepdims=True) 56 | if self.dim_order == 'theano': 57 | img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w] 58 | img = img.astype('float32') / 255. 59 | return img 60 | -------------------------------------------------------------------------------- /reinforcement-learning/week5/practice_reinforce.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # REINFORCE in TensorFlow 5 | # 6 | # This notebook implements a basic reinforce algorithm a.k.a. policy gradient for CartPole env. 7 | # 8 | # It has been deliberately written to be as simple and human-readable. 9 | # 10 | 11 | # In[3]: 12 | 13 | from IPython.display import FileLink, FileLinks 14 | FileLinks('.') #lists all downloadable files on server 15 | 16 | 17 | # In[1]: 18 | 19 | get_ipython().system('ls') 20 | 21 | 22 | # The notebook assumes that you have [openai gym](https://github.com/openai/gym) installed. 23 | # 24 | # In case you're running on a server, [use xvfb](https://github.com/openai/gym#rendering-on-a-server) 25 | 26 | # In[1]: 27 | 28 | import gym 29 | import numpy as np, pandas as pd 30 | import matplotlib.pyplot as plt 31 | get_ipython().magic('matplotlib inline') 32 | 33 | env = gym.make("CartPole-v0") 34 | 35 | #gym compatibility: unwrap TimeLimit 36 | if hasattr(env,'env'): 37 | env=env.env 38 | 39 | env.reset() 40 | n_actions = env.action_space.n 41 | state_dim = env.observation_space.shape 42 | 43 | plt.imshow(env.render("rgb_array")) 44 | 45 | 46 | # # Building the policy network 47 | 48 | # For REINFORCE algorithm, we'll need a model that predicts action probabilities given states. 49 | # 50 | # For numerical stability, please __do not include the softmax layer into your network architecture__. 51 | # 52 | # We'll use softmax or log-softmax where appropriate. 53 | 54 | # In[2]: 55 | 56 | import tensorflow as tf 57 | 58 | #create input variables. We only need for REINFORCE 59 | states = tf.placeholder('float32',(None,)+state_dim,name="states") 60 | actions = tf.placeholder('int32',name="action_ids") 61 | cumulative_rewards = tf.placeholder('float32', name="cumulative_returns") 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | 68 | 69 | logits = 70 | 71 | policy = tf.nn.softmax(logits) 72 | log_policy = tf.nn.log_softmax(logits) 73 | 74 | 75 | # In[4]: 76 | 77 | #utility function to pick action in one given state 78 | get_action_proba = lambda s: policy.eval({states:[s]})[0] 79 | 80 | 81 | # #### Loss function and updates 82 | # 83 | # We now need to define objective and update over policy gradient. 84 | # 85 | # Our objective function is 86 | # 87 | # $$ J \approx { 1 \over N } \sum _{s_i,a_i} \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$ 88 | # 89 | # 90 | # Following the REINFORCE algorithm, we can define our objective as follows: 91 | # 92 | # $$ \hat J \approx { 1 \over N } \sum _{s_i,a_i} log \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$ 93 | # 94 | # When you compute gradient of that function over network weights $ \theta $, it will become exactly the policy gradient. 95 | # 96 | 97 | # In[ ]: 98 | 99 | #get probabilities for parti 100 | indices = tf.stack([tf.range(tf.shape(log_policy)[0]),actions],axis=-1) 101 | log_policy_for_actions = tf.gather_nd(log_policy,indices) 102 | 103 | 104 | # In[ ]: 105 | 106 | # policy objective as in the last formula. please use mean, not sum. 107 | # note: you need to use log_policy_for_actions to get log probabilities for actions taken 108 | 109 | J = 116 | 117 | 118 | # In[7]: 119 | 120 | #all network weights 121 | all_weights = 122 | 123 | #weight updates. maximizing J is same as minimizing -J. Adding negative entropy. 124 | loss = -J -0.1 * entropy 125 | 126 | update = tf.train.AdamOptimizer().minimize(loss,var_list=all_weights) 127 | 128 | 129 | # ### Computing cumulative rewards 130 | 131 | # In[8]: 132 | 133 | def get_cumulative_rewards(rewards, #rewards at each step 134 | gamma = 0.99 #discount for reward 135 | ): 136 | """ 137 | take a list of immediate rewards r(s,a) for the whole session 138 | compute cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16) 139 | R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ... 140 | 141 | The simple way to compute cumulative rewards is to iterate from last to first time tick 142 | and compute R_t = r_t + gamma*R_{t+1} recurrently 143 | 144 | You must return an array/list of cumulative rewards with as many elements as in the initial rewards. 145 | """ 146 | 147 | 148 | 149 | return 150 | 151 | 152 | 153 | 154 | # In[9]: 155 | 156 | assert len(get_cumulative_rewards(range(100))) == 100 157 | assert np.allclose(get_cumulative_rewards([0,0,1,0,0,1,0],gamma=0.9),[1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0]) 158 | assert np.allclose(get_cumulative_rewards([0,0,1,-2,3,-4,0],gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0]) 159 | assert np.allclose(get_cumulative_rewards([0,0,1,2,3,4,0],gamma=0), [0, 0, 1, 2, 3, 4, 0]) 160 | print("looks good!") 161 | 162 | 163 | # In[10]: 164 | 165 | def train_step(_states,_actions,_rewards): 166 | """given full session, trains agent with policy gradient""" 167 | _cumulative_rewards = get_cumulative_rewards(_rewards) 168 | update.run({states:_states,actions:_actions,cumulative_rewards:_cumulative_rewards}) 169 | 170 | 171 | # ### Playing the game 172 | 173 | # In[11]: 174 | 175 | def generate_session(t_max=1000): 176 | """play env with REINFORCE agent and train at the session end""" 177 | 178 | #arrays to record session 179 | states,actions,rewards = [],[],[] 180 | 181 | s = env.reset() 182 | 183 | for t in range(t_max): 184 | 185 | #action probabilities array aka pi(a|s) 186 | action_probas = get_action_proba(s) 187 | 188 | a = 189 | 190 | new_s,r,done,info = env.step(a) 191 | 192 | #record session history to train later 193 | states.append(s) 194 | actions.append(a) 195 | rewards.append(r) 196 | 197 | s = new_s 198 | if done: break 199 | 200 | train_step(states,actions,rewards) 201 | 202 | return sum(rewards) 203 | 204 | 205 | 206 | # In[12]: 207 | 208 | s = tf.InteractiveSession() 209 | s.run(tf.global_variables_initializer()) 210 | 211 | for i in range(100): 212 | 213 | rewards = [generate_session() for _ in range(100)] #generate new sessions 214 | 215 | print ("mean reward:%.3f"%(np.mean(rewards))) 216 | 217 | if np.mean(rewards) > 300: 218 | print ("You Win!") 219 | break 220 | 221 | 222 | 223 | # ### Results & video 224 | 225 | # In[13]: 226 | 227 | #record sessions 228 | import gym.wrappers 229 | env = gym.wrappers.Monitor(gym.make("CartPole-v0"),directory="videos",force=True) 230 | sessions = [generate_session() for _ in range(100)] 231 | env.close() 232 | 233 | 234 | # In[14]: 235 | 236 | #show video 237 | from IPython.display import HTML 238 | import os 239 | 240 | video_names = list(filter(lambda s:s.endswith(".mp4"),os.listdir("./videos/"))) 241 | 242 | HTML(""" 243 | 246 | """.format("./videos/"+video_names[-1])) #this may or may not be _last_ video. Try other indices 247 | 248 | 249 | # In[ ]: 250 | 251 | from submit import submit_cartpole 252 | submit_cartpole(generate_session, , ) 253 | 254 | 255 | # In[ ]: 256 | 257 | # That's all, thank you for your attention! 258 | # Not having enough? There's an actor-critic waiting for you in the honor section. 259 | # But make sure you've seen the videos first. 260 | 261 | -------------------------------------------------------------------------------- /reinforcement-learning/week6/seq2seq/basic_model_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import keras.layers as L 3 | 4 | 5 | class BasicTranslationModel: 6 | def __init__(self, name, inp_voc, out_voc, 7 | emb_size, hid_size,): 8 | 9 | self.name = name 10 | self.inp_voc = inp_voc 11 | self.out_voc = out_voc 12 | 13 | with tf.variable_scope(name): 14 | self.emb_inp = L.Embedding(len(inp_voc), emb_size) 15 | self.emb_out = L.Embedding(len(out_voc), emb_size) 16 | self.enc0 = tf.nn.rnn_cell.GRUCell(hid_size) 17 | self.dec_start = L.Dense(hid_size) 18 | self.dec0 = tf.nn.rnn_cell.GRUCell(hid_size) 19 | self.logits = L.Dense(len(out_voc)) 20 | 21 | 22 | # run on dummy output to .build all layers (and therefore create weights) 23 | inp = tf.placeholder('int32', [None, None]) 24 | out = tf.placeholder('int32', [None, None]) 25 | h0 = self.encode(inp) 26 | h1 = self.decode(h0,out[:,0]) 27 | # h2 = self.decode(h1,out[:,1]) etc. 28 | 29 | self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 30 | 31 | 32 | def encode(self, inp, **flags): 33 | """ 34 | Takes symbolic input sequence, computes initial state 35 | :param inp: matrix of input tokens [batch, time] 36 | :return: a list of initial decoder state tensors 37 | """ 38 | inp_lengths = infer_length(inp, self.inp_voc.eos_ix) 39 | inp_emb = self.emb_inp(inp) 40 | 41 | _, enc_last = tf.nn.dynamic_rnn( 42 | self.enc0, inp_emb, 43 | sequence_length=inp_lengths, 44 | dtype = inp_emb.dtype) 45 | 46 | dec_start = self.dec_start(enc_last) 47 | return [dec_start] 48 | 49 | def decode(self, prev_state, prev_tokens, **flags): 50 | """ 51 | Takes previous decoder state and tokens, returns new state and logits 52 | :param prev_state: a list of previous decoder state tensors 53 | :param prev_tokens: previous output tokens, an int vector of [batch_size] 54 | :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens] 55 | """ 56 | 57 | [prev_dec] = prev_state 58 | 59 | prev_emb = self.emb_out(prev_tokens[:,None])[:,0] 60 | 61 | new_dec_out,new_dec_state = self.dec0(prev_emb, prev_dec) 62 | 63 | output_logits = self.logits(new_dec_out) 64 | 65 | return [new_dec_state], output_logits 66 | 67 | def symbolic_score(self, inp, out, eps=1e-30, **flags): 68 | """ 69 | Takes symbolic int32 matrices of hebrew words and their english translations. 70 | Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 71 | :param inp: input sequence, int32 matrix of shape [batch,time] 72 | :param out: output sequence, int32 matrix of shape [batch,time] 73 | :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] 74 | 75 | NOTE: log-probabilities time axis is synchronized with out 76 | In other words, logp are probabilities of __current__ output at each tick, not the next one 77 | therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) 78 | """ 79 | first_state = self.encode(inp,**flags) 80 | 81 | batch_size = tf.shape(inp)[0] 82 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 83 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 84 | 85 | def step(blob, y_prev): 86 | h_prev = blob[:-1] 87 | h_new, logits = self.decode(h_prev, y_prev, **flags) 88 | return list(h_new) + [logits] 89 | 90 | results = tf.scan(step,initializer=list(first_state)+[first_logits], 91 | elems=tf.transpose(out)) 92 | 93 | # gather state and logits, each of shape [time,batch,...] 94 | states_seq, logits_seq = results[:-1], results[-1] 95 | 96 | # add initial state and logits 97 | logits_seq = tf.concat((first_logits[None], logits_seq),axis=0) 98 | states_seq = [tf.concat((init[None], states), axis=0) 99 | for init, states in zip(first_state, states_seq)] 100 | 101 | #convert from [time,batch,...] to [batch,time,...] 102 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 103 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 104 | for states in states_seq] 105 | 106 | return tf.nn.log_softmax(logits_seq) 107 | 108 | def symbolic_translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags): 109 | """ 110 | takes symbolic int32 matrix of hebrew words, produces output tokens sampled 111 | from the model and output log-probabilities for all possible tokens at each tick. 112 | :param inp: input sequence, int32 matrix of shape [batch,time] 113 | :param greedy: if greedy, takes token with highest probablity at each tick. 114 | Otherwise samples proportionally to probability. 115 | :param max_len: max length of output, defaults to 2 * input length 116 | :return: output tokens int32[batch,time] and 117 | log-probabilities of all tokens at each tick, [batch,time,n_tokens] 118 | """ 119 | first_state = self.encode(inp, **flags) 120 | 121 | batch_size = tf.shape(inp)[0] 122 | bos = tf.fill([batch_size],self.out_voc.bos_ix) 123 | first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps) 124 | max_len = tf.reduce_max(tf.shape(inp)[1])*2 125 | 126 | def step(blob,t): 127 | h_prev, y_prev = blob[:-2], blob[-1] 128 | h_new, logits = self.decode(h_prev, y_prev, **flags) 129 | y_new = tf.argmax(logits,axis=-1) if greedy else tf.multinomial(logits,1)[:,0] 130 | return list(h_new) + [logits, tf.cast(y_new,y_prev.dtype)] 131 | 132 | results = tf.scan(step, initializer=list(first_state) + [first_logits, bos], 133 | elems=[tf.range(max_len)]) 134 | 135 | # gather state, logits and outs, each of shape [time,batch,...] 136 | states_seq, logits_seq, out_seq = results[:-2], results[-2], results[-1] 137 | 138 | # add initial state, logits and out 139 | logits_seq = tf.concat((first_logits[None],logits_seq),axis=0) 140 | out_seq = tf.concat((bos[None], out_seq), axis=0) 141 | states_seq = [tf.concat((init[None], states), axis=0) 142 | for init, states in zip(first_state, states_seq)] 143 | 144 | #convert from [time,batch,...] to [batch,time,...] 145 | logits_seq = tf.transpose(logits_seq, [1, 0, 2]) 146 | out_seq = tf.transpose(out_seq) 147 | states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims))) 148 | for states in states_seq] 149 | 150 | return out_seq, tf.nn.log_softmax(logits_seq) 151 | 152 | 153 | 154 | ### Utility functions ### 155 | 156 | def initialize_uninitialized(sess = None): 157 | """ 158 | Initialize unitialized variables, doesn't affect those already initialized 159 | :param sess: in which session to initialize stuff. Defaults to tf.get_default_session() 160 | """ 161 | sess = sess or tf.get_default_session() 162 | global_vars = tf.global_variables() 163 | is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) 164 | not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] 165 | 166 | if len(not_initialized_vars): 167 | sess.run(tf.variables_initializer(not_initialized_vars)) 168 | 169 | def infer_length(seq, eos_ix, time_major=False, dtype=tf.int32): 170 | """ 171 | compute length given output indices and eos code 172 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 173 | :param eos_ix: integer index of end-of-sentence token 174 | :returns: lengths, int32 vector of shape [batch] 175 | """ 176 | axis = 0 if time_major else 1 177 | is_eos = tf.cast(tf.equal(seq, eos_ix), dtype) 178 | count_eos = tf.cumsum(is_eos,axis=axis,exclusive=True) 179 | lengths = tf.reduce_sum(tf.cast(tf.equal(count_eos,0),dtype),axis=axis) 180 | return lengths 181 | 182 | def infer_mask(seq, eos_ix, time_major=False, dtype=tf.float32): 183 | """ 184 | compute mask given output indices and eos code 185 | :param seq: tf matrix [time,batch] if time_major else [batch,time] 186 | :param eos_ix: integer index of end-of-sentence token 187 | :returns: mask, float32 matrix with '0's and '1's of same shape as seq 188 | """ 189 | axis = 0 if time_major else 1 190 | lengths = infer_length(seq, eos_ix, time_major=time_major) 191 | mask = tf.sequence_mask(lengths, maxlen=tf.shape(seq)[axis], dtype=dtype) 192 | if time_major: mask = tf.transpose(mask) 193 | return mask 194 | 195 | 196 | def select_values_over_last_axis(values, indices): 197 | """ 198 | Auxiliary function to select logits corresponding to chosen tokens. 199 | :param values: logits for all actions: float32[batch,tick,action] 200 | :param indices: action ids int32[batch,tick] 201 | :returns: values selected for the given actions: float[batch,tick] 202 | """ 203 | assert values.shape.ndims == 3 and indices.shape.ndims == 2 204 | batch_size, seq_len = tf.shape(indices)[0], tf.shape(indices)[1] 205 | batch_i = tf.tile(tf.range(0,batch_size)[:, None],[1,seq_len]) 206 | time_i = tf.tile(tf.range(0,seq_len)[None, :],[batch_size,1]) 207 | indices_nd = tf.stack([batch_i, time_i, indices], axis=-1) 208 | 209 | return tf.gather_nd(values,indices_nd) 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /reinforcement-learning/week6/seq2seq/voc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Vocab: 4 | def __init__(self, tokens, bos="__BOS__", eos="__EOS__", sep=''): 5 | """ 6 | A special class that handles tokenizing and detokenizing 7 | """ 8 | assert bos in tokens, eos in tokens 9 | self.tokens = tokens 10 | self.token_to_ix = {t:i for i,t in enumerate(tokens)} 11 | 12 | self.bos = bos 13 | self.bos_ix = self.token_to_ix[bos] 14 | self.eos = eos 15 | self.eos_ix = self.token_to_ix[eos] 16 | self.sep = sep 17 | 18 | def __len__(self): 19 | return len(self.tokens) 20 | 21 | @staticmethod 22 | def from_lines(lines, bos="__BOS__", eos="__EOS__", sep=''): 23 | flat_lines = sep.join(list(lines)) 24 | flat_lines = list(flat_lines.split(sep)) if sep != '' else list(flat_lines) 25 | tokens = list(set(sep.join(flat_lines))) 26 | tokens = [t for t in tokens if t not in (bos,eos) and len(t) != 0] 27 | tokens = [bos,eos] + tokens 28 | return Vocab(tokens,bos,eos,sep) 29 | 30 | def tokenize(self,string): 31 | """converts string to a list of tokens""" 32 | tokens = list(filter(len,string.split(self.sep))) \ 33 | if self.sep != '' else list(string) 34 | return [self.bos] + tokens + [self.eos] 35 | 36 | def to_matrix(self, lines, max_len=None): 37 | """ 38 | convert variable length token sequences into fixed size matrix 39 | example usage: 40 | >>>print( as_matrix(words[:3],source_to_ix)) 41 | [[15 22 21 28 27 13 -1 -1 -1 -1 -1] 42 | [30 21 15 15 21 14 28 27 13 -1 -1] 43 | [25 37 31 34 21 20 37 21 28 19 13]] 44 | """ 45 | max_len = max_len or max(map(len, lines)) + 2 # 2 for bos and eos 46 | 47 | matrix = np.zeros((len(lines), max_len), dtype='int32') + self.eos_ix 48 | for i, seq in enumerate(lines): 49 | tokens = self.tokenize(seq) 50 | row_ix = list(map(self.token_to_ix.get, tokens))[:max_len] 51 | matrix[i, :len(row_ix)] = row_ix 52 | 53 | return matrix 54 | 55 | def to_lines(self, matrix, crop=True): 56 | """ 57 | Convert matrix of token ids into strings 58 | :param matrix: matrix of tokens of int32, shape=[batch,time] 59 | :param crop: if True, crops BOS and EOS from line 60 | :return: 61 | """ 62 | lines = [] 63 | for line_ix in map(list,matrix): 64 | if crop: 65 | if line_ix[0] == self.bos_ix: 66 | line_ix = line_ix[1:] 67 | if self.eos_ix in line_ix: 68 | line_ix = line_ix[:line_ix.index(self.eos_ix)] 69 | line = self.sep.join(self.tokens[i] for i in line_ix) 70 | lines.append(line) 71 | return lines 72 | --------------------------------------------------------------------------------