├── .gitignore
├── README.md
├── bayesian-methods-for-ml-master
    ├── Coursera-BMML-Final-project
    │   ├── (OPTIONAL) Final project. Training VAE.ipynb
    │   ├── Coursera BMML, Final project.ipynb
    │   └── utils.py
    ├── Week 2
    │   └── Programming assignment
    │   │   ├── Coursera-BMML_-week-2.ipynb
    │   │   └── grader.py
    ├── Week 4
    │   └── Programming assignment
    │   │   ├── Week4. Practical Assignment. MCMC.ipynb
    │   │   └── grader.py
    ├── assignment 5
    │   ├── CVAE.png
    │   ├── VAE.png
    │   ├── assignment.ipynb
    │   └── grader.py
    └── assignment 6
    │   ├── Coursera+BMML%2C+week+6.ipynb
    │   └── grader.py
├── competitive-data-science
    ├── Programming assignment, week 1: Pandas basics
    │   ├── PandasBasics.ipynb
    │   └── grader.py
    ├── Programming assignment, week 2: Data leakages
    │   └── Data leakages.ipynb
    ├── Programming assignment, week 3: Mean encodings
    │   ├── Programming_assignment_week_3.ipynb
    │   └── grader.py
    ├── Programming assignment, week 4: Ensembles
    │   ├── Programming_assignment_week_4.ipynb
    │   └── grader.py
    ├── Programming assignment, week 4: KNN features
    │   ├── Untitled.ipynb
    │   ├── compute_KNN_features-Copy1.ipynb
    │   ├── compute_KNN_features.ipynb
    │   ├── grader.py
    │   └── test multiprocessing.ipynb
    ├── README.md
    ├── Reading materials
    │   ├── EDA_Springleaf_screencast.ipynb
    │   ├── EDA_video2.ipynb
    │   ├── EDA_video3_screencast.ipynb
    │   ├── GBM_drop_tree.ipynb
    │   ├── Hyperparameters_tuning_video2_RF_n_estimators.ipynb
    │   ├── Macros.ipynb
    │   ├── Metrics_video2_constants_for_MSE_and_MAE.ipynb
    │   ├── Metrics_video3_weighted_median.ipynb
    │   └── Metrics_video8_soft_kappa_xgboost.ipynb
    └── kaggle_project
    │   ├── Documentation.pdf
    │   └── README.md
├── intro-to-dle
    ├── README.md
    ├── download_resources.ipynb
    ├── download_utils.py
    ├── grading.py
    ├── keras_utils.py
    ├── misc
    │   └── np_convolution.py
    ├── week1
    │   ├── kernel.png
    │   ├── sgd.png
    │   ├── target.npy
    │   ├── train.npy
    │   └── week01_pa.ipynb
    ├── week2
    │   ├── Keras-task.ipynb
    │   ├── NumpyNN (honor).ipynb
    │   ├── Tensorflow-task.ipynb
    │   ├── datasets
    │   ├── matplotlib_utils.py
    │   ├── models
    │   ├── my1stNN_logreg.ipynb
    │   ├── my1stNN_mlp.ipynb
    │   ├── preprocessed_mnist.py
    │   ├── submit.py
    │   └── util.py
    ├── week3
    │   ├── grading_utils.py
    │   ├── imagelabels.mat
    │   ├── images
    │   │   └── inceptionv3.png
    │   ├── week3_task1_first_cnn_cifar10_clean.ipynb
    │   ├── week3_task2_fine_tuning_clean.ipynb
    │   └── weights.p
    ├── week4
    │   ├── Adversarial-task.ipynb
    │   ├── Autoencoders-task.ipynb
    │   ├── MiniGAN.ipynb
    │   ├── lfw_dataset.py
    │   └── submit.py
    ├── week5
    │   ├── POS-task.ipynb
    │   ├── RNN-task.ipynb
    │   ├── data_copyright
    │   ├── names
    │   ├── rnn.png
    │   └── submit.py
    └── week6
    │   ├── grading_utils.py
    │   ├── images
    │       ├── encoder_decoder.png
    │       ├── encoder_decoder_explained.png
    │       └── inceptionv3.png
    │   ├── utils.py
    │   └── week6_final_project_image_captioning_clean.ipynb
├── natural-language-processing
    ├── .gitignore
    ├── AWS-tutorial.md
    ├── Docker-tutorial.md
    ├── common
    │   ├── README.md
    │   └── download_utils.py
    ├── docker
    │   └── Dockerfile
    ├── honor
    │   ├── LSTM chatbot (character level).ipynb
    │   ├── LSTM chatbot (character level,tf).ipynb
    │   ├── LSTM chatbot (word level).ipynb
    │   ├── LSTM reply.ipynb
    │   ├── README.md
    │   ├── Untitled.ipynb
    │   ├── datasets.py
    │   ├── dialogue_manager.py
    │   ├── download_cornell.sh
    │   ├── download_opensubs.sh
    │   ├── example.py
    │   ├── main_bot.py
    │   ├── tfmodel.py
    │   └── utils.py
    ├── project
    │   ├── dialogue_manager.py
    │   ├── main_bot.py
    │   ├── utils.py
    │   └── week5-project.ipynb
    ├── week1
    │   ├── grader.py
    │   ├── metrics.py
    │   └── week1-MultilabelClassification.ipynb
    ├── week2
    │   ├── evaluation.py
    │   └── week2-NER.ipynb
    ├── week3
    │   ├── grader.py
    │   ├── util.py
    │   └── week3-Embeddings.ipynb
    └── week4
    │   ├── encoder-decoder-pic.png
    │   └── week4-seq2seq.ipynb
└── reinforcement-learning
    ├── .gitignore
    ├── 01a-gym_interface.ipynb
    ├── 01b-crossentropy_method.ipynb
    ├── 01c-hons.ipynb
    ├── 02_qlearning.ipynb
    ├── 02a-practice_vi.ipynb
    ├── atari_util.py
    ├── dqn_atari.ipynb
    ├── experience_replay.ipynb
    ├── framebuffer.py
    ├── mdp.py
    ├── practice_approx_qlearning.ipynb
    ├── qlearning.py
    ├── replay_buffer.py
    ├── sarsa.ipynb
    ├── week5
        ├── REINFORCE.ipynb
        ├── atari_util.py
        ├── practice_a3c.ipynb
        └── practice_reinforce.py
    └── week6
        ├── bandits.ipynb
        ├── practice_mcts.ipynb
        └── seq2seq
            ├── basic_model_tf.py
            ├── practice_tf.ipynb
            └── voc.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | __pycache__
  2 | readonly
  3 | .ipynb_checkpoints
  4 | .DS_Store
  5 | .idea
  6 | *.h5
  7 | *.tgz
  8 | *.pickle
  9 | *.zip
 10 | *.index
 11 | *.meta
 12 | leakage
 13 | submission.csv
 14 | *.csv
 15 | *.csv.gz
 16 | *.dat
 17 | *.txt
 18 | *.npy
 19 | *.npz
 20 | # Byte-compiled / optimized / DLL files
 21 | __pycache__/
 22 | *.py[cod]
 23 | *$py.class
 24 | 
 25 | # C extensions
 26 | *.so
 27 | 
 28 | # Distribution / packaging
 29 | .Python
 30 | build/
 31 | develop-eggs/
 32 | dist/
 33 | downloads/
 34 | eggs/
 35 | .eggs/
 36 | lib/
 37 | lib64/
 38 | parts/
 39 | sdist/
 40 | var/
 41 | wheels/
 42 | *.egg-info/
 43 | .installed.cfg
 44 | *.egg
 45 | MANIFEST
 46 | 
 47 | # PyInstaller
 48 | #  Usually these files are written by a python script from a template
 49 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 50 | *.manifest
 51 | *.spec
 52 | 
 53 | # Installer logs
 54 | pip-log.txt
 55 | pip-delete-this-directory.txt
 56 | 
 57 | # Unit test / coverage reports
 58 | htmlcov/
 59 | .tox/
 60 | .coverage
 61 | .coverage.*
 62 | .cache
 63 | nosetests.xml
 64 | coverage.xml
 65 | *.cover
 66 | .hypothesis/
 67 | 
 68 | # Translations
 69 | *.mo
 70 | *.pot
 71 | 
 72 | # Django stuff:
 73 | *.log
 74 | .static_storage/
 75 | .media/
 76 | local_settings.py
 77 | 
 78 | # Flask stuff:
 79 | instance/
 80 | .webassets-cache
 81 | 
 82 | # Scrapy stuff:
 83 | .scrapy
 84 | 
 85 | # Sphinx documentation
 86 | docs/_build/
 87 | 
 88 | # PyBuilder
 89 | target/
 90 | 
 91 | # Jupyter Notebook
 92 | .ipynb_checkpoints
 93 | 
 94 | # pyenv
 95 | .python-version
 96 | 
 97 | # celery beat schedule file
 98 | celerybeat-schedule
 99 | 
100 | # SageMath parsed files
101 | *.sage.py
102 | 
103 | # Environments
104 | .env
105 | .venv
106 | env/
107 | venv/
108 | ENV/
109 | env.bak/
110 | venv.bak/
111 | 
112 | # Spyder project settings
113 | .spyderproject
114 | .spyproject
115 | 
116 | # Rope project settings
117 | .ropeproject
118 | 
119 | # mkdocs documentation
120 | /site
121 | 
122 | # mypy
123 | .mypy_cache/
124 | 
125 | *.tsv
126 | *.png
127 | starSpaceModel
128 | natural-language-processing/honor/model/*
129 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Advanced Machine Learning course
 2 | 
 3 | This is my GitHub repo for the Advanced Machine Learning (AML) specialisation offered by Yandex and HSE on Coursera.
 4 | 
 5 | ## Modules
 6 | 1. [Intro to Deep Learning](https://www.coursera.org/learn/intro-to-deep-learning/). The course covers the fundamentals of Deep Learning, from the basic ideas of overfitting and underfitting to state-of-the-art CNNs and RNNs.
 7 | 	
 8 | 	- During the course, I coded a neural network in numpy, which helped me understand how backprop really works.
 9 | 		
10 | 	- The assignments are open ended, encouraging experimentation and trial and error, as it would be the case in a real world application.
11 | 		
12 | 	- The assignments have an interesting blend of numpy, keras, and tensorflow. This helps to think of these modules as tools in the same toolbox instead of isolated tools.
13 | 		
14 | 	- The final project is designing a **captioning neural network**, featuring both a CNN for feature extraction (Pretrained InceptionV3) and an RNN. It is trained on a set of (images, captions) and the network learns to caption any image (that resembles the training set, that is)
15 | 
16 | 2. [Competitive Data Science](https://www.coursera.org/learn/competitive-data-science). The course covers exploratory data analysis, feature generation, feature tuning, and model validation, all taught by expert Kaggle competition winners.
17 | 	- The course involved **participating in an actual competition**, and I ranked in the top 10% (Out of ~ 300 participants)
18 | 	- My final model was designed, trained, and run on an Amazon AWS instance, and it included lagged features, mean-encoded features, and features derived from item descriptions using PCA
19 | 	- The assignments involved a range of tasks, but were mostly to build understanding of the actual goals involved in the competition
20 | 3. [Bayesian Methods for Machine Learning](https://www.coursera.org/learn/bayesian-methods-in-machine-learning). The course builds upon preexisting understanding of ML methods, and places them in the context of Bayesian statistics.
21 | 	- Many key concepts are covered: conjugate priors, latent variable models, Gaussian mixtures, expectation maximisation, variational inference, latent Dirichlet allocation, MCMC with Gibbs and Metropolis-Hastings sampling, variational autoencoders, and Bayesian optimization
22 | 	- The final project involved designing an **algorithm to help a user generate faces with certain properties from a variational autoencoder**: initially I show the user different faces, then the user is progressively shown more faces and asked to rate them. Using these ratings and GPyOpt, the code adjusts the latent variables of the VAE to approach the face the user wants.
23 | 
24 | 4. [Natural Language Processing](https://www.coursera.org/learn/language-processing). The course covers a variety of NLP approaches and concepts, from basics such as lemmatising and bag of words to word embeddings; in terms of modelling, it covers Hidden Markov Models and, finally, neural-network-based models.
25 | 	- The final project was to design a simple chatbot that could either answer technical questions (by replying with a relevant answer from Stack Overflow) or just chit-chat. The network itself was implemented in TensorFlow following a character-level seq2seq approach.
26 | 


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/Coursera-BMML-Final-project/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib.pyplot as plt
  3 | import tensorflow as tf
  4 | import GPy
  5 | import GPyOpt
  6 | import tensorflow as tf
  7 | import keras
  8 | from keras.layers import Input, Dense, Lambda, InputLayer, concatenate, Activation, Flatten, Reshape
  9 | from keras.layers.normalization import BatchNormalization
 10 | from keras.layers.convolutional import Conv2D, Deconv2D
 11 | from keras.losses import MSE
 12 | from keras.models import Model, Sequential
 13 | from keras import backend as K
 14 | from keras import metrics
 15 | from keras.datasets import mnist
 16 | from keras.utils import np_utils
 17 | from tensorflow.python.framework import ops
 18 | from tensorflow.python.framework import dtypes
 19 | import os
 20 | 
 21 | 
 22 | 
 23 | 
 24 | class CelebA:
 25 |     def __init__(self, path, sess, train=True, batch_size=32, height=218, width=178, channels=3, threads=1, file_type='.jpg'):
 26 |         image_filenames = [os.path.join(path, img) for img in os.listdir(path) if img.endswith(file_type)]
 27 |         if train:
 28 |             image_filenames = image_filenames[:-5000]
 29 |         else:
 30 |             image_filenames = image_filenames[-5000:]
 31 |         all_images = ops.convert_to_tensor(image_filenames, dtype=dtypes.string)
 32 |         input_queue = tf.train.slice_input_producer([image_filenames], shuffle=False)
 33 |         file_content = tf.read_file(input_queue[0])
 34 |         image = tf.image.decode_jpeg(file_content, channels=3)
 35 |         image.set_shape([height, width, channels])
 36 |         image_cropped = image[45:-45, 25:-25]
 37 |         image_cropped = tf.image.resize_images(image_cropped, (64, 64))
 38 |         batch = tf.train.batch([image_cropped], batch_size=batch_size, num_threads=threads)
 39 |         self.batch = tf.cast(batch, tf.float32)/256
 40 |         self.n_batches = len(image_filenames) // batch_size
 41 |         self.sess = sess
 42 |     
 43 |     def __iter__(self):
 44 |         return self
 45 |     
 46 |     def __next__(self):
 47 |         x = self.sess.run(self.batch)
 48 |         return x, x, None
 49 |     
 50 |     def next(self):
 51 |         return self.__next__()
 52 |     
 53 | def create_encoder(input_dims, base_filters=64, layers=4, latent=512):
 54 |     w = input_dims[0]//2**layers
 55 |     h = input_dims[1]//2**layers
 56 |     c = base_filters*2**(layers-1)
 57 |     encoder = Sequential()
 58 |     encoder.add(InputLayer(input_dims))
 59 |     for i in range(layers):
 60 |         encoder.add(Conv2D(filters=base_filters*2**i, kernel_size=(5, 5), strides=(2, 2), padding='same', bias=False))
 61 |         encoder.add(BatchNormalization(axis=3))
 62 |         encoder.add(Activation(K.relu))
 63 |     encoder.add(Reshape([w*h*c]))
 64 |     encoder.add(Dense(latent*2))
 65 |     return encoder
 66 | 
 67 | def create_decoder(output_dims, base_filters=64, layers=4, latent=512):
 68 |     w = output_dims[0]//2**layers
 69 |     h = output_dims[1]//2**layers
 70 |     c = base_filters*2**(layers-1)
 71 |     decoder = Sequential()
 72 |     decoder.add(InputLayer([latent]))
 73 |     decoder.add(Dense(w*h*c))
 74 |     decoder.add(Reshape([w, h, c]))
 75 |     for i in range(layers-1, 0, -1):
 76 |         decoder.add(Deconv2D(filters=base_filters*2**i, kernel_size=(5, 5), strides=(2, 2), padding='same', bias=False))
 77 |         decoder.add(BatchNormalization(axis=3))
 78 |         decoder.add(Activation(K.relu))
 79 |     decoder.add(Deconv2D(filters=3, kernel_size=(5, 5), strides=(2, 2), padding='same'))
 80 |     return decoder
 81 | 
 82 | def sample(mean_log_var):
 83 |     mean, log_var = mean_log_var
 84 |     eps_shape = mean.get_shape()
 85 |     epsilon = K.random_normal(shape=eps_shape)
 86 |     z = epsilon*K.exp(log_var/2)+mean
 87 |     return z
 88 | 
 89 | def create_vae(batch_size, base_filters=64, latent=8,
 90 |                image_size=64, learning_rate=0.001,
 91 |                reconstruction_weight=1000, layers=4):
 92 |     '''
 93 |     Constructs VAE model with given parameters.
 94 |     :param batch_size: size of a batch (used for placeholder)
 95 |     :param base_filters: number of filters after first layer. Other layers will double this number
 96 |     :param latent: latent space dimension
 97 |     :param image_size: size of input image
 98 |     Returns compiled Keras model along with encoder and decoder
 99 |     '''
100 |     if isinstance(image_size, int):
101 |         image_size = (image_size, image_size)
102 |     x = Input(batch_shape=(batch_size, image_size[0], image_size[1], 3))
103 |     encoder = create_encoder([image_size[0], image_size[1], 3], base_filters=base_filters, latent=latent, layers=layers)
104 |     decoder = create_decoder([image_size[0], image_size[1], 3], base_filters=base_filters, latent=latent, layers=layers)
105 |     mean_log_var = encoder(x)
106 |     mean_size = mean_log_var.shape[1]//2
107 |     mean = Lambda(lambda h: h[:, :mean_size])(mean_log_var)
108 |     log_var = Lambda(lambda h: h[:, mean_size:])(mean_log_var)
109 |     z = Lambda(sample)([mean, log_var])
110 |     reconstruction = decoder(z)
111 |     loss_reconstruction = K.mean(metrics.mean_squared_error(x, reconstruction))
112 |     loss_KL = - K.mean(0.5 * K.sum(1 + log_var - K.square(mean) - K.exp(log_var), axis=1))
113 |     loss = reconstruction_weight*loss_reconstruction + loss_KL
114 | 
115 |     vae = Model(x, reconstruction)
116 |     vae.compile(optimizer=keras.optimizers.Adam(lr=learning_rate), loss=lambda x, y: loss)
117 |     return vae, encoder, decoder


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/Week 2/Programming assignment/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | class Grader(object):
 7 |     def __init__(self):
 8 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 9 |         self.assignment_key = '3ivnq3n_EeexdQ4iFFMrvA'
10 |         self.parts = OrderedDict([
11 |                         ('H3evn', 'Task 1 (E-step)'),
12 |                         ('uD8jo', 'Task 2 (M-step: mu)'),
13 |                         ('zFWgm', 'Task 2 (M-step: sigma)'),
14 |                         ('gTUuu', 'Task 2 (M-step: pi)'),
15 |                         ('0ZlqN', 'Task 3 (VLB)'),
16 |                         ('Olbrx', 'Task 4 (EM)')])
17 |         self.answers = {key: None for key in self.parts}
18 | 
19 |     @staticmethod
20 |     def ravel_output(output):
21 |         '''
22 |            If student accedentally submitted np.array with one
23 |            element instead of number, this function will submit
24 |            this number instead
25 |         '''
26 |         if isinstance(output, np.ndarray) and output.size == 1:
27 |             output = output.item(0)
28 |         return output
29 | 
30 |     def submit(self, email, token):
31 |         submission = {
32 |                     "assignmentKey": self.assignment_key, 
33 |                     "submitterEmail": email, 
34 |                     "secret": token, 
35 |                     "parts": {}
36 |                   }
37 |         for part, output in self.answers.items():
38 |             if output is not None:
39 |                 submission["parts"][part] = {"output": output}
40 |             else:
41 |                 submission["parts"][part] = dict()
42 |         request = requests.post(self.submission_page, data=json.dumps(submission))
43 |         response = request.json()
44 |         if request.status_code == 201:
45 |             print('Submitted to Coursera platform. See results on assignment page!')
46 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
47 |             print(response[u'details'][u'learnerMessage'])
48 |         else:
49 |             print("Unknown response from Coursera: {}".format(request.status_code))
50 |             print(response)
51 | 
52 |     def status(self):
53 |         print("You want to submit these numbers:")
54 |         for part_id, part_name in self.parts.items():
55 |             answer = self.answers[part_id]
56 |             if answer is None:
57 |                 answer = '-'*10
58 |             print("Task {}: {}".format(part_name, answer))
59 |                
60 |     def submit_part(self, part, output):
61 |         self.answers[part] = output
62 |         print("Current answer for task {} is: {}".format(self.parts[part], output))
63 | 
64 |     def submit_e_step(self, output):
65 |         self.submit_part('H3evn', str(self.ravel_output(output[9, 1])))
66 | 
67 |     def submit_m_step(self, pi, mu, sigma):
68 |         self.submit_part('uD8jo', str(self.ravel_output(mu[1, 1])))
69 |         self.submit_part('zFWgm', str(self.ravel_output(sigma[1, 1, 1])))
70 |         self.submit_part('gTUuu', str(self.ravel_output(pi[1])))
71 |         
72 |     def submit_VLB(self, loss):
73 |         self.submit_part('0ZlqN', str(self.ravel_output(loss)))
74 |         
75 |     def submit_EM(self, best_loss):
76 |         self.submit_part('Olbrx', str(self.ravel_output(best_loss)))


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/Week 4/Programming assignment/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | class Grader(object):
 7 |     def __init__(self):
 8 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 9 |         self.assignment_key = 'u85FqY8sEee5cg635EOBeA'
10 |         self.parts = OrderedDict([
11 |                       ('pn017', '1.1 (Alice trajectory)'),
12 |                       ('UUbsF', '1.1 (Bob trajectory)'),
13 |                       ('FFmXD', '1.2 (Alice mean)'), 
14 |                       ('uWPFR', '1.2 (Bob mean)'), 
15 |                       ('nkkem', '1.3 (Bob and Alice prices correlation)'),
16 |                       ('dyuVW', '1.4 (depends on the random data or not)'),
17 |                       ('r1VVR', '2.1 (MAP for age coef)'),
18 |                       ('5wFjO', '2.1 (MAP for aducation coef)'),
19 |                       ('sn9Lu', '2.2 (credible interval lower bound)'),
20 |                       ('JHRF9', '2.2 (credible interval upper bound)'),
21 |                       ('0StUi', '2.3 (does the data suggest gender discrimination?)'),
22 |                       ])
23 |         self.answers = {key: None for key in self.parts}
24 | 
25 |     @staticmethod
26 |     def ravel_output(output):
27 |         '''
28 |            If student accedentally submitted np.array with one
29 |            element instead of number, this function will submit
30 |            this number instead
31 |         '''
32 |         if isinstance(output, np.ndarray) and output.size == 1:
33 |             output = output.item(0)
34 |         return output
35 | 
36 |     def submit(self, email, token):
37 |         submission = {
38 |                     "assignmentKey": self.assignment_key, 
39 |                     "submitterEmail": email, 
40 |                     "secret": token, 
41 |                     "parts": {}
42 |                   }
43 |         for part, output in self.answers.items():
44 |             if output is not None:
45 |                 submission["parts"][part] = {"output": output}
46 |             else:
47 |                 submission["parts"][part] = dict()
48 |         request = requests.post(self.submission_page, data=json.dumps(submission))
49 |         response = request.json()
50 |         if request.status_code == 201:
51 |             print('Submitted to Coursera platform. See results on assignment page!')
52 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
53 |             print(response[u'details'][u'learnerMessage'])
54 |         else:
55 |             print("Unknown response from Coursera: {}".format(request.status_code))
56 |             print(response)
57 | 
58 |     def status(self):
59 |         print("You want to submit these numbers:")
60 |         for part_id, part_name in self.parts.items():
61 |             answer = self.answers[part_id]
62 |             if answer is None:
63 |                 answer = '-'*10
64 |             print("Task {}: {}".format(part_name, answer))
65 |                
66 |     def submit_part(self, part, output):
67 |         self.answers[part] = output
68 |         print("Current answer for task {} is: {}".format(self.parts[part], output))
69 | 
70 |     def submit_simulation_trajectory(self, alice_trajectory, bob_trajectory):
71 |         self.submit_part('pn017', '{}  {}'.format(
72 |             self.ravel_output(alice_trajectory[0]), self.ravel_output(alice_trajectory[1])
73 |             ))
74 |         self.submit_part('UUbsF', '{}  {}'.format(
75 |             self.ravel_output(bob_trajectory[0]), self.ravel_output(bob_trajectory[1])
76 |             ))
77 |     
78 |     def submit_simulation_mean(self, alice_price, bob_price):
79 |         self.submit_part('FFmXD', str(self.ravel_output(alice_price)))
80 |         self.submit_part('uWPFR', str(self.ravel_output(bob_price)))
81 |     
82 |     def submit_simulation_correlation(self, alice_bob_correlation):
83 |         self.submit_part('nkkem', str(self.ravel_output(alice_bob_correlation)))
84 |     
85 |     def submit_simulation_depends(self, answer):
86 |         self.submit_part('dyuVW', answer)
87 |         
88 |     def submit_pymc_map_estimates(self, beta_age_coefficient, beta_education_coefficient):
89 |         self.submit_part('r1VVR', str(self.ravel_output(beta_age_coefficient)))
90 |         self.submit_part('5wFjO', str(self.ravel_output(beta_education_coefficient)))
91 |         
92 |     def submit_pymc_odds_ratio_interval(self, odds_ratio_lower_bound, odds_ratio_upper_bound):
93 |         self.submit_part('sn9Lu', str(self.ravel_output(odds_ratio_lower_bound)))
94 |         self.submit_part('JHRF9', str(self.ravel_output(odds_ratio_upper_bound)))
95 |         
96 |     def submit_is_there_discrimination(self, answer):
97 |         self.submit_part('0StUi', answer)
98 |     
99 | 


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/assignment 5/CVAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/bayesian-methods-for-ml-master/assignment 5/CVAE.png


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/assignment 5/VAE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/bayesian-methods-for-ml-master/assignment 5/VAE.png


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/assignment 5/grader.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import json
  3 | import numpy as np
  4 | from collections import OrderedDict
  5 | from keras.layers import Input
  6 | import tensorflow as tf
  7 | from keras.datasets import mnist
  8 | 
  9 | class Grader(object):
 10 |     def __init__(self):
 11 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 12 |         self.assignment_key = 'Pf_j7noDEeexdQ4iFFMrvA'
 13 |         self.parts = OrderedDict([('S66Mi', '1 (vlb)'),
 14 |                                   ('dXfpy', '2.1 (samples mean)'),
 15 |                                   ('U1gJG', '2.2 (samples var)'),
 16 |                                   ('NRPCA', '3 (best val loss)'),
 17 |                                   ('JEmpp', '4.1 (hallucinating mean)'),
 18 |                                   ('3K3IB', '4.2 (hallucinating var)'),
 19 |                                   ('tYD01', '5.1 (conditional hallucinating mean)'),
 20 |                                   ('CaofU', '5.2 (conditional hallucinating var)'),])
 21 |         self.answers = {key: None for key in self.parts}
 22 | 
 23 |     @staticmethod
 24 |     def ravel_output(output):
 25 |         '''
 26 |            If student accedentally submitted np.array with one
 27 |            element instead of number, this function will submit
 28 |            this number instead
 29 |         '''
 30 |         if isinstance(output, np.ndarray) and output.size == 1:
 31 |             output = output.item(0)
 32 |         return output
 33 | 
 34 |     def submit(self, email, token):
 35 |         submission = {
 36 |                     "assignmentKey": self.assignment_key, 
 37 |                     "submitterEmail": email, 
 38 |                     "secret": token, 
 39 |                     "parts": {}
 40 |                   }
 41 |         for part, output in self.answers.items():
 42 |             if output is not None:
 43 |                 submission["parts"][part] = {"output": output}
 44 |             else:
 45 |                 submission["parts"][part] = dict()
 46 |         request = requests.post(self.submission_page, data=json.dumps(submission))
 47 |         response = request.json()
 48 |         if request.status_code == 201:
 49 |             print('Submitted to Coursera platform. See results on assignment page!')
 50 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
 51 |             print(response[u'details'][u'learnerMessage'])
 52 |         else:
 53 |             print("Unknown response from Coursera: {}".format(request.status_code))
 54 |             print(response)
 55 | 
 56 |     def status(self):
 57 |         print("You want to submit these numbers:")
 58 |         for part_id, part_name in self.parts.items():
 59 |             answer = self.answers[part_id]
 60 |             if answer is None:
 61 |                 answer = '-'*10
 62 |             print("Task {}: {}".format(part_name, answer))
 63 |                
 64 |     def submit_part(self, part, output):
 65 |         self.answers[part] = output
 66 |         print("Current answer for task {} is: {}".format(self.parts[part], output))
 67 | 
 68 |     def submit_vlb(self, sess, vlb_binomial):
 69 |         test_data = np.load('test_data.npz')
 70 |         my_x = Input(batch_shape=(100, 784))
 71 |         my_x_decoded = Input(batch_shape=(100, 784))
 72 |         my_t_mean = Input(batch_shape=(100, 2))
 73 |         my_t_log_var = Input(batch_shape=(100, 2))
 74 |         loss = vlb_binomial(my_x, my_x_decoded, my_t_mean, my_t_log_var)
 75 |         try:
 76 |             output = sess.run(loss, feed_dict={my_x: test_data['x'], my_x_decoded: test_data['x_decoded_mean'],
 77 |                               my_t_mean: test_data['t_mean'], my_t_log_var: test_data['t_log_var']})
 78 |         except Exception as e:
 79 |             print('Sorry, we were not able to run the provided code in `sess`.')
 80 |             raise e
 81 |         self.submit_part('S66Mi', str(self.ravel_output(output)))
 82 | 
 83 |     def submit_samples(self, sess, sampling):
 84 |         test_data = np.load('test_data.npz')
 85 |         my_t_mean = tf.tile(test_data['t_mean'][:1, :], [10000, 1])
 86 |         my_t_log_var = tf.tile(test_data['t_log_var'][:1, :], [10000, 1])
 87 |         samples = sampling([my_t_mean, my_t_log_var])
 88 |         try:
 89 |             samples = sess.run(samples)
 90 |         except Exception as e:
 91 |             print('Sorry, we were not able to run the provided code in `sess`.')
 92 |             raise e
 93 |         mean = np.mean(samples, axis=0)[1]
 94 |         var = np.var(samples, axis=0)[1]
 95 |         self.submit_part('dXfpy', str(self.ravel_output(mean)))
 96 |         self.submit_part('U1gJG', str(self.ravel_output(var)))
 97 | 
 98 |     def submit_best_val_loss(self, hist):
 99 |         self.submit_part('NRPCA', str(self.ravel_output(hist.history['val_loss'][-1])))
100 | 
101 |     def submit_hallucinating(self, sess, sampled_im_mean):
102 |         try:
103 |             imgs = sess.run(sampled_im_mean)
104 |         except Exception as e:
105 |             print('Sorry, we were not able to run the provided code in `sess`.')
106 |             raise e
107 |         self.submit_part('JEmpp', str(self.ravel_output(np.mean(imgs))))
108 |         var_per_channel = np.var(imgs, axis=0)
109 |         self.submit_part('3K3IB', str(self.ravel_output(np.max(var_per_channel))))
110 | 
111 |     def submit_conditional_hallucinating(self, sess, conditional_sampled_im_mean):
112 |         (x_train, y_train), (x_test, y_test) = mnist.load_data()
113 |         x_train = x_train.astype('float32') / 255.
114 |         x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
115 |         
116 |         baseline = np.zeros((10, 784))
117 |         for i in range(10):
118 |             idx = y_train == i
119 |             baseline[i, :] = np.mean(x_train[idx, :], axis=0)
120 |         baseline_repeated = np.repeat(baseline, 5, axis=0)
121 |         
122 |         try:
123 |             imgs = sess.run(conditional_sampled_im_mean)
124 |         except Exception as e:
125 |             print('Sorry, we were not able to run the provided code in `sess`.')
126 |             raise e
127 |             
128 |         diff = np.abs(imgs - baseline_repeated)
129 |         self.submit_part('tYD01', str(self.ravel_output(np.mean(diff))))
130 |         var_per_channel = np.var(diff, axis=0)
131 |         self.submit_part('CaofU', str(self.ravel_output(np.max(var_per_channel))))
132 | 


--------------------------------------------------------------------------------
/bayesian-methods-for-ml-master/assignment 6/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | class Grader(object):
 7 |     def __init__(self):
 8 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 9 |         self.assignment_key = 'ZJzC93UJEeesww5LLQnVZg'
10 |         self.parts = OrderedDict([('P8Xj7', '1.1'), 
11 |                       ('sYdjs', '1.2 (mean)'), 
12 |                       ('Mjy6R', '1.2 (variance)'),
13 |                       ('Wif7t', '1.3'),
14 |                       ('V9yZN', '1.4 (noise)'),
15 |                       ('s4es0', '1.4 (just signal)'),
16 |                       ('ckZSh', '1.5'),
17 |                       ('1Jngf', '2.1'),
18 |                       ('CBiGW', '2.2')])
19 |         self.answers = {key: None for key in self.parts}
20 | 
21 |     @staticmethod
22 |     def ravel_output(output):
23 |         '''
24 |            If student accedentally submitted np.array with one
25 |            element instead of number, this function will submit
26 |            this number instead
27 |         '''
28 |         if isinstance(output, np.ndarray) and output.size == 1:
29 |             output = output.item(0)
30 |         return output
31 | 
32 |     def submit(self, email, token):
33 |         submission = {
34 |                     "assignmentKey": self.assignment_key, 
35 |                     "submitterEmail": email, 
36 |                     "secret": token, 
37 |                     "parts": {}
38 |                   }
39 |         for part, output in self.answers.items():
40 |             if output is not None:
41 |                 submission["parts"][part] = {"output": output}
42 |             else:
43 |                 submission["parts"][part] = dict()
44 |         request = requests.post(self.submission_page, data=json.dumps(submission))
45 |         response = request.json()
46 |         if request.status_code == 201:
47 |             print('Submitted to Coursera platform. See results on assignment page!')
48 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
49 |             print(response[u'details'][u'learnerMessage'])
50 |         else:
51 |             print("Unknown response from Coursera: {}".format(request.status_code))
52 |             print(response)
53 | 
54 |     def status(self):
55 |         print("You want to submit these numbers:")
56 |         for part_id, part_name in self.parts.items():
57 |             answer = self.answers[part_id]
58 |             if answer is None:
59 |                 answer = '-'*10
60 |             print("Task {}: {}".format(part_name, answer))
61 |                
62 |     def submit_part(self, part, output):
63 |         self.answers[part] = output
64 |         print("Current answer for task {} is: {}".format(self.parts[part], output))
65 | 
66 |     def submit_GPy_1(self, output):
67 |         self.submit_part('P8Xj7', str(self.ravel_output(output)))
68 | 
69 |     def submit_GPy_2(self, mean, var):
70 |         self.submit_part('sYdjs', str(self.ravel_output(mean)))
71 |         self.submit_part('Mjy6R', str(self.ravel_output(var)))
72 |         
73 |     def submit_GPy_3(self, output):
74 |         self.submit_part('Wif7t', str(self.ravel_output(output)))
75 | 
76 |     def submit_GPy_4(self, noise, just_signal):
77 |         self.submit_part('V9yZN', str(self.ravel_output(noise)))
78 |         self.submit_part('s4es0', str(self.ravel_output(just_signal)))
79 |         
80 |     def submit_GPy_5(self, output):
81 |         self.submit_part('ckZSh', str(self.ravel_output(output))) 
82 |         
83 |     def submit_GPyOpt_1(self, output):
84 |         self.submit_part('1Jngf', str(self.ravel_output(output)))
85 |         
86 |     def submit_GPyOpt_2(self, output):
87 |         self.submit_part('CBiGW', str(self.ravel_output(output)))


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 1: Pandas basics/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | def array_to_hash(x):
 7 |     x_tupled = None
 8 |     if type(x) == list:
 9 |         x_tupled = tuple(x)
10 |     elif type(x) == np.ndarray:
11 |         x_tupled = tuple(list(x.flatten()))
12 |     elif type(x) == tuple:
13 |         x_tupled = x
14 |     else:
15 |         raise RuntimeError('unexpected type of input: {}'.format(type(x)))
16 |     return hash(tuple(map(float, x_tupled)))
17 | 
def almostEqual(x, y, tol=1e-3):
    """Return True when ``x`` and ``y`` differ by strictly less than ``tol``.

    Args:
        x, y: values supporting subtraction and ``abs``.
        tol: absolute tolerance; defaults to 1e-3.
    """
    return abs(x - y) < tol
20 | 
21 | 
class Grader(object):
    """Collects answers for the Pandas-basics assignment and posts them to Coursera.

    Answers are stored per part id in ``self.answers`` (``None`` until a part
    is submitted) and sent together by :meth:`submit`.
    """

    def __init__(self):
        self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
        self.assignment_key = 'S1UqVXp-EeelpgpYPAO2Og'
        # part id -> tag, kept in display order
        self.parts = OrderedDict([
            ('edAEq', 'max_revenue'),
            ('Xn0Ec', 'category_id_with_max_revenue'),
            ('CZDVZ', 'num_items_constant_price'),
            ('HlAjc', 'total_num_items_sold_var')])
        self.answers = {key: None for key in self.parts}

    @staticmethod
    def ravel_output(output):
        '''
           If student accidentally submitted np.array with one
           element instead of number, this function will submit
           this number instead
        '''
        if isinstance(output, np.ndarray) and output.size == 1:
            output = output.item(0)
        return output

    def submit(self, email, token):
        """Post all recorded answers to Coursera and print the outcome.

        Parts without an answer are sent as empty objects. A 201 status is
        reported as success; otherwise the learner message from the JSON body
        is printed when present, else the status code and the body.

        Args:
            email: submitter e-mail, sent as ``submitterEmail``.
            token: assignment token, sent as ``secret``.
        """
        parts_payload = {}
        for part, output in self.answers.items():
            parts_payload[part] = {"output": output} if output is not None else dict()
        submission = {
            "assignmentKey": self.assignment_key,
            "submitterEmail": email,
            "secret": token,
            "parts": parts_payload,
        }
        request = requests.post(self.submission_page, data=json.dumps(submission))
        if request.status_code == 201:
            print('Submitted to Coursera platform. See results on assignment page!')
            return
        # Error pages are not guaranteed to be JSON; do not let a decode
        # failure hide the status code from the student.
        try:
            response = request.json()
        except ValueError:
            response = None
        details = response.get(u'details') if isinstance(response, dict) else None
        if isinstance(details, dict) and u'learnerMessage' in details:
            print(details[u'learnerMessage'])
        else:
            print("Unknown response from Coursera: {}".format(request.status_code))
            print(response if response is not None else request.text)

    def status(self):
        """Print every tag with its recorded answer (dashes if none)."""
        print("You want to submit these numbers:")
        for part_id, part_name in self.parts.items():
            answer = self.answers[part_id]
            if answer is None:
                answer = '-'*10
            print("Task {}: {}".format(part_name, answer))

    def submit_part(self, part, output):
        """Record ``output`` as the answer for part id ``part``.

        Raises:
            KeyError: if ``part`` is not a known part id. Nothing is stored in
                that case, so a bad id can never end up in the submission.
        """
        if part not in self.parts:
            raise KeyError(part)
        self.answers[part] = output
        print("Current answer for task {} is: {}".format(self.parts[part], output))

    def submit_tag(self, tag, output):
        """Record ``output`` (stringified) for the part whose tag equals ``tag``.

        Raises:
            RuntimeError: if the tag matches zero or several parts.
        """
        part_id = [k for k, v in self.parts.items() if v == tag]
        if len(part_id) != 1:
            raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
        part_id = part_id[0]
        self.submit_part(part_id, str(self.ravel_output(output)))


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 3: Mean encodings/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | def array_to_hash(x):
 7 |     x_tupled = None
 8 |     if type(x) == list:
 9 |         x_tupled = tuple(x)
10 |     elif type(x) == np.ndarray:
11 |         x_tupled = tuple(list(x.flatten()))
12 |     elif type(x) == tuple:
13 |         x_tupled = x
14 |     else:
15 |         raise RuntimeError('unexpected type of input: {}'.format(type(x)))
16 |     return hash(tuple(map(float, x_tupled)))
17 | 
def almostEqual(x, y, tol=1e-5):
    """Return True when ``x`` and ``y`` differ by strictly less than ``tol``.

    Args:
        x, y: values supporting subtraction and ``abs``.
        tol: absolute tolerance; defaults to 1e-5.
    """
    return abs(x - y) < tol
20 | 
21 | 
class Grader(object):
    """Collects answers for the mean-encodings assignment and posts them to Coursera.

    Answers are stored per part id in ``self.answers`` (``None`` until a part
    is submitted) and sent together by :meth:`submit`.
    """

    def __init__(self):
        self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
        self.assignment_key = 'JVyZjZIaEeeXtQpjLCk-0A'
        # part id -> tag, kept in display order
        self.parts = OrderedDict([
            ('9zPRY', 'KFold_scheme'),
            ('xEf0Q', 'Leave-one-out_scheme'),
            ('zuMqo', 'Smoothing_scheme'),
            ('RNfnI', 'Expanding_mean_scheme')])
        self.answers = {key: None for key in self.parts}

    @staticmethod
    def ravel_output(output):
        '''
           If student accidentally submitted np.array with one
           element instead of number, this function will submit
           this number instead
        '''
        if isinstance(output, np.ndarray) and output.size == 1:
            output = output.item(0)
        return output

    def submit(self, email, token):
        """Post all recorded answers to Coursera and print the outcome.

        Parts without an answer are sent as empty objects. A 201 status is
        reported as success; otherwise the learner message from the JSON body
        is printed when present, else the status code and the body.

        Args:
            email: submitter e-mail, sent as ``submitterEmail``.
            token: assignment token, sent as ``secret``.
        """
        parts_payload = {}
        for part, output in self.answers.items():
            parts_payload[part] = {"output": output} if output is not None else dict()
        submission = {
            "assignmentKey": self.assignment_key,
            "submitterEmail": email,
            "secret": token,
            "parts": parts_payload,
        }
        request = requests.post(self.submission_page, data=json.dumps(submission))
        if request.status_code == 201:
            print('Submitted to Coursera platform. See results on assignment page!')
            return
        # Error pages are not guaranteed to be JSON; do not let a decode
        # failure hide the status code from the student.
        try:
            response = request.json()
        except ValueError:
            response = None
        details = response.get(u'details') if isinstance(response, dict) else None
        if isinstance(details, dict) and u'learnerMessage' in details:
            print(details[u'learnerMessage'])
        else:
            print("Unknown response from Coursera: {}".format(request.status_code))
            print(response if response is not None else request.text)

    def status(self):
        """Print every tag with its recorded answer (dashes if none)."""
        print("You want to submit these numbers:")
        for part_id, part_name in self.parts.items():
            answer = self.answers[part_id]
            if answer is None:
                answer = '-'*10
            print("Task {}: {}".format(part_name, answer))

    def submit_part(self, part, output):
        """Record ``output`` as the answer for part id ``part``.

        Raises:
            KeyError: if ``part`` is not a known part id. Nothing is stored in
                that case, so a bad id can never end up in the submission.
        """
        if part not in self.parts:
            raise KeyError(part)
        self.answers[part] = output
        print("Current answer for task {} is: {}".format(self.parts[part], output))

    def submit_tag(self, tag, output):
        """Record ``output`` (stringified) for the part whose tag equals ``tag``.

        Raises:
            RuntimeError: if the tag matches zero or several parts.
        """
        part_id = [k for k, v in self.parts.items() if v == tag]
        if len(part_id) != 1:
            raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
        part_id = part_id[0]
        self.submit_part(part_id, str(self.ravel_output(output)))


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 4: Ensembles/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | def array_to_hash(x):
 7 |     x_tupled = None
 8 |     if type(x) == list:
 9 |         x_tupled = tuple(x)
10 |     elif type(x) == np.ndarray:
11 |         x_tupled = tuple(list(x.flatten()))
12 |     elif type(x) == tuple:
13 |         x_tupled = x
14 |     else:
15 |         raise RuntimeError('unexpected type of input: {}'.format(type(x)))
16 |     return hash(tuple(map(float, x_tupled)))
17 | 
def almostEqual(x, y, tol=1e-5):
    """Return True when ``x`` and ``y`` differ by strictly less than ``tol``.

    Args:
        x, y: values supporting subtraction and ``abs``.
        tol: absolute tolerance; defaults to 1e-5.
    """
    return abs(x - y) < tol
20 | 
21 | 
class Grader(object):
    """Collects answers for the ensembles assignment and posts them to Coursera.

    Answers are stored per part id in ``self.answers`` (``None`` until a part
    is submitted) and sent together by :meth:`submit`.
    """

    def __init__(self):
        self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
        self.assignment_key = 'Lhay-55JEeet3xIBvGMumA'
        # part id -> tag, kept in display order
        self.parts = OrderedDict([
            ('EyiFH', 'best_alpha'),
            ('XH82R', 'r2_train_simple_mix'),
            ('BHeRs', 'r2_test_simple_mix'),
            ('MkwCS', 'r2_train_stacking'),
            ('j4Adb', 'r2_test_stacking'),
        ])
        self.answers = {key: None for key in self.parts}

    @staticmethod
    def ravel_output(output):
        '''
           If student accidentally submitted np.array with one
           element instead of number, this function will submit
           this number instead
        '''
        if isinstance(output, np.ndarray) and output.size == 1:
            output = output.item(0)
        return output

    def submit(self, email, token):
        """Post all recorded answers to Coursera and print the outcome.

        Parts without an answer are sent as empty objects. A 201 status is
        reported as success; otherwise the learner message from the JSON body
        is printed when present, else the status code and the body.

        Args:
            email: submitter e-mail, sent as ``submitterEmail``.
            token: assignment token, sent as ``secret``.
        """
        parts_payload = {}
        for part, output in self.answers.items():
            parts_payload[part] = {"output": output} if output is not None else dict()
        submission = {
            "assignmentKey": self.assignment_key,
            "submitterEmail": email,
            "secret": token,
            "parts": parts_payload,
        }
        request = requests.post(self.submission_page, data=json.dumps(submission))
        if request.status_code == 201:
            print('Submitted to Coursera platform. See results on assignment page!')
            return
        # Error pages are not guaranteed to be JSON; do not let a decode
        # failure hide the status code from the student.
        try:
            response = request.json()
        except ValueError:
            response = None
        details = response.get(u'details') if isinstance(response, dict) else None
        if isinstance(details, dict) and u'learnerMessage' in details:
            print(details[u'learnerMessage'])
        else:
            print("Unknown response from Coursera: {}".format(request.status_code))
            print(response if response is not None else request.text)

    def status(self):
        """Print every tag with its recorded answer (dashes if none)."""
        print("You want to submit these numbers:")
        for part_id, part_name in self.parts.items():
            answer = self.answers[part_id]
            if answer is None:
                answer = '-'*10
            print("Task {}: {}".format(part_name, answer))

    def submit_part(self, part, output):
        """Record ``output`` as the answer for part id ``part``.

        Raises:
            KeyError: if ``part`` is not a known part id. Nothing is stored in
                that case, so a bad id can never end up in the submission.
        """
        if part not in self.parts:
            raise KeyError(part)
        self.answers[part] = output
        print("Current answer for task {} is: {}".format(self.parts[part], output))

    def submit_tag(self, tag, output):
        """Record ``output`` (stringified) for the part whose tag equals ``tag``.

        Raises:
            RuntimeError: if the tag matches zero or several parts.
        """
        part_id = [k for k, v in self.parts.items() if v == tag]
        if len(part_id) != 1:
            raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
        part_id = part_id[0]
        self.submit_part(part_id, str(self.ravel_output(output)))


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 4: KNN features/Untitled.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 6,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 18,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "data": {
 19 |       "text/plain": [
 20 |        "1"
 21 |       ]
 22 |      },
 23 |      "execution_count": 18,
 24 |      "metadata": {},
 25 |      "output_type": "execute_result"
 26 |     }
 27 |    ],
 28 |    "source": [
 29 |     "neighs_y=np.array([0,1,2])\n",
 30 |     "diffs = np.insert(np.diff(neighs_y), 0, 999)\n",
 31 |     "feats = np.unique(neighs_y[diffs == 0], return_counts=True)\n",
 32 |     "if len(feats[0])==0:\n",
 33 |     "    feats=1\n",
 34 |     "else:\n",
 35 |     "    feats=feats[1].max() + 1\n",
 36 |     "feats"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 51,
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "text/plain": [
 47 |        "array([2])"
 48 |       ]
 49 |      },
 50 |      "execution_count": 51,
 51 |      "metadata": {},
 52 |      "output_type": "execute_result"
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "x=np.array([0,0,3,4,0])\n"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 89,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "def func(x):\n",
 66 |     "    x=np.array(x)\n",
 67 |     "    if len(x)==1:\n",
 68 |     "        return 1\n",
 69 |     "    suma=np.where(x==x[0],0,1)\n",
 70 |     "    return suma.cumsum()[np.argwhere(suma)[0]]"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 94,
 76 |    "metadata": {},
 77 |    "outputs": [
 78 |     {
 79 |      "ename": "AssertionError",
 80 |      "evalue": "",
 81 |      "output_type": "error",
 82 |      "traceback": [
 83 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 84 |       "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
 85 |       "\u001b[0;32m<ipython-input-94-b8f4f9a39c7d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
 86 |       "\u001b[0;31mAssertionError\u001b[0m: "
 87 |      ]
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "assert func([0])==1\n",
 92 |     "assert func([0,1,1])==1\n",
 93 |     "assert func([0,1,0,1])==1\n",
 94 |     "assert func([1,1,2,2,1])==2"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 136,
100 |    "metadata": {},
101 |    "outputs": [
102 |     {
103 |      "data": {
104 |       "text/plain": [
105 |        "2"
106 |       ]
107 |      },
108 |      "execution_count": 136,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "x=np.array([0,0,1,0,1])\n",
115 |     "suma=np.where(x==x[0],1,0)\n",
116 |     "np.equal(suma.cumsum(),np.arange(1,6)).sum()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 127,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "array([1, 2, 3, 3, 3])"
128 |       ]
129 |      },
130 |      "execution_count": 127,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "cs"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 119,
142 |    "metadata": {},
143 |    "outputs": [
144 |     {
145 |      "data": {
146 |       "text/plain": [
147 |        "array([2])"
148 |       ]
149 |      },
150 |      "execution_count": 119,
151 |      "metadata": {},
152 |      "output_type": "execute_result"
153 |     }
154 |    ],
155 |    "source": [
156 |     "np.argwhere(np.diff(cs)).ravel()"
157 |    ]
158 |   }
159 |  ],
160 |  "metadata": {
161 |   "kernelspec": {
162 |    "display_name": "Python 3",
163 |    "language": "python",
164 |    "name": "python3"
165 |   },
166 |   "language_info": {
167 |    "codemirror_mode": {
168 |     "name": "ipython",
169 |     "version": 3
170 |    },
171 |    "file_extension": ".py",
172 |    "mimetype": "text/x-python",
173 |    "name": "python",
174 |    "nbconvert_exporter": "python",
175 |    "pygments_lexer": "ipython3",
176 |    "version": "3.6.3"
177 |   }
178 |  },
179 |  "nbformat": 4,
180 |  "nbformat_minor": 2
181 | }
182 | 


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 4: KNN features/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | def array_to_hash(x):
 7 |     x_tupled = None
 8 |     if type(x) == list:
 9 |         x_tupled = tuple(x)
10 |     elif type(x) == np.ndarray:
11 |         x_tupled = tuple(list(x.flatten()))
12 |     elif type(x) == tuple:
13 |         x_tupled = x
14 |     else:
15 |         raise RuntimeError('unexpected type of input: {}'.format(type(x)))
16 |     return hash(tuple(map(float, x_tupled)))
17 | 
def almostEqual(x, y, tol=1e-3):
    """Return True when ``x`` and ``y`` differ by strictly less than ``tol``.

    Args:
        x, y: values supporting subtraction and ``abs``.
        tol: absolute tolerance; defaults to 1e-3.
    """
    return abs(x - y) < tol
20 | 
21 | 
class Grader(object):
    """Collects the answer for the KNN-features assignment and posts it to Coursera.

    Answers are stored per part id in ``self.answers`` (``None`` until a part
    is submitted) and sent together by :meth:`submit`.
    """

    def __init__(self):
        self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
        self.assignment_key = 'r2N4iqFlEeeRFQqEddeEzg'
        # part id -> tag
        self.parts = OrderedDict([
            ('1O8kU', 'statistic')])
        self.answers = {key: None for key in self.parts}

    @staticmethod
    def ravel_output(output):
        '''
           If student accidentally submitted np.array with one
           element instead of number, this function will submit
           this number instead
        '''
        if isinstance(output, np.ndarray) and output.size == 1:
            output = output.item(0)
        return output

    def submit(self, email, token):
        """Post all recorded answers to Coursera and print the outcome.

        Parts without an answer are sent as empty objects. A 201 status is
        reported as success; otherwise the learner message from the JSON body
        is printed when present, else the status code and the body.

        Args:
            email: submitter e-mail, sent as ``submitterEmail``.
            token: assignment token, sent as ``secret``.
        """
        parts_payload = {}
        for part, output in self.answers.items():
            parts_payload[part] = {"output": output} if output is not None else dict()
        submission = {
            "assignmentKey": self.assignment_key,
            "submitterEmail": email,
            "secret": token,
            "parts": parts_payload,
        }
        request = requests.post(self.submission_page, data=json.dumps(submission))
        if request.status_code == 201:
            print('Submitted to Coursera platform. See results on assignment page!')
            return
        # Error pages are not guaranteed to be JSON; do not let a decode
        # failure hide the status code from the student.
        try:
            response = request.json()
        except ValueError:
            response = None
        details = response.get(u'details') if isinstance(response, dict) else None
        if isinstance(details, dict) and u'learnerMessage' in details:
            print(details[u'learnerMessage'])
        else:
            print("Unknown response from Coursera: {}".format(request.status_code))
            print(response if response is not None else request.text)

    def status(self):
        """Print every tag with its recorded answer (dashes if none)."""
        print("You want to submit these numbers:")
        for part_id, part_name in self.parts.items():
            answer = self.answers[part_id]
            if answer is None:
                answer = '-'*10
            print("Task {}: {}".format(part_name, answer))

    def submit_part(self, part, output):
        """Record ``output`` as the answer for part id ``part``.

        Raises:
            KeyError: if ``part`` is not a known part id. Nothing is stored in
                that case, so a bad id can never end up in the submission.
        """
        if part not in self.parts:
            raise KeyError(part)
        self.answers[part] = output
        print("Current answer for task {} is: {}".format(self.parts[part], output))

    def submit_tag(self, tag, output):
        """Record ``output`` (stringified) for the part whose tag equals ``tag``.

        Raises:
            RuntimeError: if the tag matches zero or several parts.
        """
        part_id = [k for k, v in self.parts.items() if v == tag]
        if len(part_id) != 1:
            raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
        part_id = part_id[0]
        self.submit_part(part_id, str(self.ravel_output(output)))


--------------------------------------------------------------------------------
/competitive-data-science/Programming assignment, week 4: KNN features/test multiprocessing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from multiprocessing import Pool"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 12,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "def f2(x):\n",
 19 |     "    for i in range(100000000):\n",
 20 |     "        x+=i\n",
 21 |     "    return x"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 14,
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "name": "stdout",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       "[4999999950000001, 4999999950000002, 4999999950000003, 4999999950000005, 4999999950000006, 4999999950000007]\n"
 34 |      ]
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "if __name__ == '__main__':\n",
 39 |     "    p = Pool(processes=7)\n",
 40 |     "    print(p.map(f2, [1, 2, 3,5,6,7]))"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 20,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "import numpy as np"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 26,
 55 |    "metadata": {},
 56 |    "outputs": [
 57 |     {
 58 |      "name": "stdout",
 59 |      "output_type": "stream",
 60 |      "text": [
 61 |       "[ 0.          0.33333333  0.33333333  0.          0.33333333  0.          0.\n",
 62 |       "  0.          0.          0.        ]\n"
 63 |      ]
 64 |     }
 65 |    ],
 66 |    "source": [
 67 |     "neighs_y=[1,2,4,4,1,2,3]\n",
 68 |     "classes=10\n",
 69 |     "feats = np.bincount(neighs_y[:3],minlength=classes)\n",
 70 |     "            \n",
 71 |     "feats  = feats / feats.sum()\n",
 72 |     "print(feats)\n"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 52,
 78 |    "metadata": {},
 79 |    "outputs": [
 80 |     {
 81 |      "data": {
 82 |       "text/plain": [
 83 |        "array([999,   1,   2,   0,  -3,   1,   1,   0,   0])"
 84 |       ]
 85 |      },
 86 |      "execution_count": 52,
 87 |      "metadata": {},
 88 |      "output_type": "execute_result"
 89 |     }
 90 |    ],
 91 |    "source": []
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 67,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "data": {
100 |       "text/plain": [
101 |        "3"
102 |       ]
103 |      },
104 |      "execution_count": 67,
105 |      "metadata": {},
106 |      "output_type": "execute_result"
107 |     }
108 |    ],
109 |    "source": [
110 |     "x=np.array([1,2,4,4,1,2,3,3,3])\n",
111 |     "diffs=np.insert(np.diff(x),0,999)\n",
112 |     "\n",
113 |     "np.unique(x[diffs==0],return_counts=True)[1].max()+1\n"
114 |    ]
115 |   }
116 |  ],
117 |  "metadata": {
118 |   "kernelspec": {
119 |    "display_name": "Python 3",
120 |    "language": "python",
121 |    "name": "python3"
122 |   },
123 |   "language_info": {
124 |    "codemirror_mode": {
125 |     "name": "ipython",
126 |     "version": 3
127 |    },
128 |    "file_extension": ".py",
129 |    "mimetype": "text/x-python",
130 |    "name": "python",
131 |    "nbconvert_exporter": "python",
132 |    "pygments_lexer": "ipython3",
133 |    "version": "3.6.3"
134 |   }
135 |  },
136 |  "nbformat": 4,
137 |  "nbformat_minor": 2
138 | }
139 | 


--------------------------------------------------------------------------------
/competitive-data-science/README.md:
--------------------------------------------------------------------------------
1 | ## Materials for "How to Win a Data Science Competition: Learn from Top Kagglers" course
2 | 
3 | This repository contains the programming assignment notebooks for the [course](https://www.coursera.org/learn/competitive-data-science/home/welcome) on competitive data science.
4 | 


--------------------------------------------------------------------------------
/competitive-data-science/Reading materials/Macros.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Macros"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "This notebook shows how to use *macro* commands in Jupyter.\n",
 15 |     "\n",
 16 |     "What is a *macro*? It is just a named code snippet. Like functions, macros let us wrap frequently used code. For example, we can define a macro that will load all the libraries for us.\n",
 17 |     "\n",
 18 |     "### Step 1: Define macro \n",
 19 |     "\n",
 20 |     "To save some code as a macro we need to put that code in a cell and run it. "
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {},
 27 |    "outputs": [
 28 |     {
 29 |      "name": "stdout",
 30 |      "output_type": "stream",
 31 |      "text": [
 32 |       "The libraries have been loaded!\n"
 33 |      ]
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "import numpy as np\n",
 38 |     "import pandas as pd \n",
 39 |     "from tqdm import tqdm_notebook\n",
 40 |     "import os\n",
 41 |     "import sys\n",
 42 |     "import os.path\n",
 43 |     "\n",
 44 |     "import matplotlib.pyplot as plt\n",
 45 |     "import matplotlib as mpl\n",
 46 |     "from matplotlib import rc\n",
 47 |     "from cycler import cycler\n",
 48 |     "%matplotlib inline\n",
 49 |     "\n",
 50 |     " \n",
 51 |     "mpl.rcParams['axes.prop_cycle'] = cycler('color', ['#ff0000', '#0000ff',   '#00ffff','#ffA300', '#00ff00', \n",
 52 |     "     '#ff00ff', '#990000', '#009999', '#999900', '#009900', '#009999'])\n",
 53 |     "\n",
 54 |     "rc('font', size=16)\n",
 55 |     "rc('font',**{'family':'serif','serif':['Computer Modern']})\n",
 56 |     "rc('text', usetex=False)\n",
 57 |     "rc('figure', figsize=(12, 10))\n",
 58 |     "rc('axes', linewidth=.5)\n",
 59 |     "rc('lines', linewidth=1.75)\n",
 60 |     "\n",
 61 |     "print('The libraries have been loaded!')"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "Now you need to remember the number inside the square brackets of `In [<number>]`. Then, to save the code from that cell, use the `%macro` magic:\n",
 69 |     "\n",
 70 |     "```\n",
 71 |     "%macro __imp <number>\n",
 72 |     "```"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 2,
 78 |    "metadata": {
 79 |     "collapsed": true
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "%macro -q __imp 1"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "Now try it!"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 3,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stdout",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "The libraries have been loaded!\n"
103 |      ]
104 |     }
105 |    ],
106 |    "source": [
107 |     "__imp"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "### Step 2: save macro\n",
115 |     "\n",
116 |     "So far we've only created a macro, and it will be lost when the kernel is restarted. We need to store it somehow, so that we can load it easily later. This can be done with the `%store` magic."
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 4,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "name": "stdout",
126 |      "output_type": "stream",
127 |      "text": [
128 |       "Stored '__imp' (Macro)\n"
129 |      ]
130 |     }
131 |    ],
132 |    "source": [
133 |     "%store __imp"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "Now `__imp` is saved in a kind of Jupyter's global memory. You can list all the stored variables like that:"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 5,
146 |    "metadata": {},
147 |    "outputs": [
148 |     {
149 |      "name": "stdout",
150 |      "output_type": "stream",
151 |      "text": [
152 |       "Stored variables and their in-db values:\n",
153 |       "__imp             -> IPython.macro.Macro(\"import numpy as np\\nimport pa\n"
154 |      ]
155 |     }
156 |    ],
157 |    "source": [
158 |     "%store"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "Now **restart the kernel** and get back to this cell without running the previous ones. To run the stored macro you need to retrieve the macro first with the following line: "
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 1,
171 |    "metadata": {
172 |     "collapsed": true
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "%store -r __imp"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "And only then call the macro:"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 2,
189 |    "metadata": {},
190 |    "outputs": [
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "The libraries have been loaded!\n"
196 |      ]
197 |     }
198 |    ],
199 |    "source": [
200 |     "__imp"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "### Step 3: auto restore macro"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "So you need to use as many as 2 cells! But, fortunately, Jupyter can load the stored variables (and macros) automatically. To enable it you need to update your IPython profile [config](http://ipython.readthedocs.io/en/stable/development/config.html). If you've never heard of it, then it is not yet created, otherwise you should know where it lives. \n",
215 |     "\n",
216 |     "On Coursera's notebooks we will create it here: `~/.ipython/profile_default/ipython_config.py` and tell IPython that we want it to automatically restore stored variables.\n",
217 |     "\n",
218 |     "```\n",
219 |     "c.StoreMagics.autorestore = True\n",
220 |     "```"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 4,
226 |    "metadata": {},
227 |    "outputs": [
228 |     {
229 |      "name": "stdout",
230 |      "output_type": "stream",
231 |      "text": [
232 |       "c = get_config()\r\n",
233 |       "c.StoreMagics.autorestore = True\r\n"
234 |      ]
235 |     }
236 |    ],
237 |    "source": [
238 |     "!echo \"c = get_config()\\nc.StoreMagics.autorestore = True\" > ~/.ipython/profile_default/ipython_config.py\n",
239 |     "!cat ~/.ipython/profile_default/ipython_config.py"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "markdown",
244 |    "metadata": {},
245 |    "source": [
246 |     "That's it! Now **restart your notebook (kernel)** and **define and store macro** again (step 1 and first code cell from step 2). And finally, to test it, **restart the kernel** again. Now you can immediately access `__imp` macro, so that all the libraries are loaded with a 5 char line of code."
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 1,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "name": "stdout",
256 |      "output_type": "stream",
257 |      "text": [
258 |       "The libraries have been loaded!\n"
259 |      ]
260 |     }
261 |    ],
262 |    "source": [
263 |     "__imp"
264 |    ]
265 |   }
266 |  ],
267 |  "metadata": {
268 |   "kernelspec": {
269 |    "display_name": "Python 3",
270 |    "language": "python",
271 |    "name": "python3"
272 |   },
273 |   "language_info": {
274 |    "codemirror_mode": {
275 |     "name": "ipython",
276 |     "version": 3
277 |    },
278 |    "file_extension": ".py",
279 |    "mimetype": "text/x-python",
280 |    "name": "python",
281 |    "nbconvert_exporter": "python",
282 |    "pygments_lexer": "ipython3",
283 |    "version": "3.6.0"
284 |   }
285 |  },
286 |  "nbformat": 4,
287 |  "nbformat_minor": 1
288 | }
289 | 


--------------------------------------------------------------------------------
/competitive-data-science/Reading materials/Metrics_video8_soft_kappa_xgboost.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Soft Kappa objective"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "In this notebook you can find an implementation for \"soft kappa\" loss and objective from [this paper](https://arxiv.org/abs/1509.07107). "
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 1,
20 |    "metadata": {
21 |     "collapsed": true
22 |    },
23 |    "outputs": [],
24 |    "source": [
25 |     "def soft_kappa_grad_hess(y, p):\n",
26 |     "    '''\n",
27 |     "        Returns first and second derivatives of the objective with respect to predictions `p`. \n",
28 |     "        `y` is a vector of corresponding target labels.  \n",
29 |     "    '''\n",
30 |     "    norm = p.dot(p) + y.dot(y)\n",
31 |     "    \n",
32 |     "    grad = -2 * y / norm + 4 * p * np.dot(y, p) / (norm ** 2)\n",
33 |     "    hess = 8 * p * y / (norm ** 2) + 4 * np.dot(y, p) / (norm ** 2)  - (16 * p ** 2 * np.dot(y, p)) / (norm ** 3)\n",
34 |     "    return grad, hess"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": 2,
40 |    "metadata": {
41 |     "collapsed": true
42 |    },
43 |    "outputs": [],
44 |    "source": [
45 |     "def soft_kappa(preds, dtrain):\n",
46 |     "    '''\n",
47 |     "        Having predictions `preds` and targets `dtrain.get_label()` this function computes soft kappa loss.\n",
48 |     "        NOTE, that it assumes `mean(target) = 0`.\n",
49 |     "        \n",
50 |     "    '''\n",
51 |     "    target = dtrain.get_label()\n",
52 |     "    return 'kappa' ,  -2 * target.dot(preds) / (target.dot(target) + preds.dot(preds))"
53 |    ]
54 |   }
55 |  ],
56 |  "metadata": {
57 |   "kernelspec": {
58 |    "display_name": "Python 3",
59 |    "language": "python",
60 |    "name": "python3"
61 |   },
62 |   "language_info": {
63 |    "codemirror_mode": {
64 |     "name": "ipython",
65 |     "version": 3
66 |    },
67 |    "file_extension": ".py",
68 |    "mimetype": "text/x-python",
69 |    "name": "python",
70 |    "nbconvert_exporter": "python",
71 |    "pygments_lexer": "ipython3",
72 |    "version": "3.6.0"
73 |   }
74 |  },
75 |  "nbformat": 4,
76 |  "nbformat_minor": 1
77 | }
78 | 


--------------------------------------------------------------------------------
/competitive-data-science/kaggle_project/Documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/competitive-data-science/kaggle_project/Documentation.pdf


--------------------------------------------------------------------------------
/competitive-data-science/kaggle_project/README.md:
--------------------------------------------------------------------------------
 1 | # The solution is password protected to avoid spoiling the competition for others :-)
 2 | 
 3 | # How to generate the solution
 4 | 
 5 | Work through the FEAT_ notebooks to generate the required features. Note that the cells that generate the .csv.gz are commented out. Uncomment them if you do want the files.
 6 | 
 7 | Then work through MODEL_final to train the models. **Do not blindly run the notebooks!**
 8 | Throughout the MODEL notebook there are a series of checkpoints that will save the progress so far (To h5 or pickle). This is intended to be used if you don’t have a lot of RAM. You can just work up to that point, restart the notebook, run Cell 1 to import packages, then scroll down, and reload what you just saved, to wipeout unwanted memory.
 9 | 
10 | In the model notebook, you will first have to train the models once on the training set (this generates the ALT_*_TRAIN files), and then generate the ALT_MODEL files, which are the final models. These final models are included. The stacked model tuning is done using the TRAIN models, but at the end you will train the meta-model on the full models.
11 | 
12 | Finally, to predict run the Predict notebook **(again, not blindly).** Optionally you can try to zero out some predictions uncommenting one of the final lines, but my best score was achieved with the results “as they are”.
13 | 
14 | The final solution should score slightly below 0.95, achieving 10/10 in the grader.
15 | 


--------------------------------------------------------------------------------
/intro-to-dle/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction to Deep Learning course resources
 2 | https://www.coursera.org/learn/intro-to-deep-learning
 3 | 
 4 | ## Offline instructions
 5 | Coursera Jupyter Environment can be slow if many learners use it heavily. 
 6 | Our tasks are compute-heavy and we recommend to run them on your hardware for optimal performance.
 7 | 
 8 | You will need a computer with at least 4GB of RAM.
 9 | 
10 | There are two options to set up the Jupyter Notebooks locally: a Docker container or Anaconda.
11 | 
12 | ### Docker container option (best for Mac/Linux)
13 | 
14 | Follow the instructions on https://hub.docker.com/r/zimovnov/coursera-aml-docker/ to install Docker container with all necessary software installed.
15 | 
16 | After that you should see a Jupyter page in your browser.
17 | 
18 | ### Anaconda option (best for Windows)
19 | We highly recommend to install docker environment, but if it's not an option, 
20 | you can try to install the necessary python modules with Anaconda.
21 | 
22 | First, install Anaconda with **Python 3.5+** from [here](https://www.anaconda.com/download).
23 | 
24 | Download `conda_requirements.txt` from [here](https://github.com/ZEMUSHKA/coursera-aml-docker/blob/master/conda_requirements.txt).
25 | 
26 | Open terminal on Mac/Linux or "Anaconda Prompt" in Start Menu on Windows and run:
27 | ```
28 | conda config --append channels conda-forge
29 | conda config --append channels menpo
30 | conda install --yes --file conda_requirements.txt
31 | ```
32 | 
33 | To start Jupyter Notebooks run `jupyter notebook` on Mac/Linux or "Jupyter Notebook" in Start Menu on Windows.
34 | 
35 | After that you should see a Jupyter page in your browser.
36 | 
37 | ### Prepare resources inside Jupyter Notebooks (for local setups only)
38 | 
39 | Click **New -> Terminal** and execute: `git clone https://github.com/hse-aml/intro-to-dl.git`
40 | On Windows you might want to install [Git](https://git-scm.com/download/win). 
41 | You can also download all the resources as zip archive from GitHub page.
42 | 
43 | Close the terminal and refresh Jupyter page, you will see **intro-to-dl** folder, go there, 
44 | all the necessary notebooks are waiting for you.
45 | 
46 | First you need to download necessary resources, to do that open `download_resources.ipynb` 
47 | and run cells for Keras and your week.
48 | 
49 | Now you can open a notebook for the corresponding week and work there just like in Coursera Jupyter Environment.


--------------------------------------------------------------------------------
/intro-to-dle/download_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | import shutil
 5 | import tqdm
 6 | import requests
 7 | 
 8 | 
 9 | def download_file(url, file_path):
10 |     r = requests.get(url, stream=True)
11 |     total_size = int(r.headers.get('content-length'))
12 |     try:
13 |         with open(file_path, 'wb', buffering=16*1024*1024) as f:
14 |             bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
15 |             bar.set_description(os.path.split(file_path)[-1])
16 |             for chunk in r.iter_content(32 * 1024):
17 |                 f.write(chunk)
18 |                 bar.update(len(chunk))
19 |     except Exception:
20 |         print("Download failed")
21 |     finally:
22 |         if os.path.getsize(file_path) != total_size:
23 |             os.remove(file_path)
24 |             print("Removed incomplete download")
25 | 
26 | 
def download_from_github(version, fn, target_dir):
    """Download file `fn` of release `version` of hse-aml/intro-to-dl into `target_dir`."""
    release_url = "https://github.com/hse-aml/intro-to-dl/releases/download/{0}/{1}".format(version, fn)
    download_file(release_url, os.path.join(target_dir, fn))
31 | 
32 | 
def sequential_downloader(version, fns, target_dir):
    """Create `target_dir` if needed, then download each name in `fns` one after another."""
    os.makedirs(target_dir, exist_ok=True)
    for asset_name in fns:
        download_from_github(version, asset_name, target_dir)
37 | 
38 | 
def link_all_files_from_dir(src_dir, dst_dir):
    """Expose every entry of `src_dir` under the same name in `dst_dir`.

    On Windows (`os.name == "nt"`) entries are copied; elsewhere a symlink to
    the absolute source path is created unless the destination already exists.

    Args:
        src_dir: directory whose entries are listed.
        dst_dir: directory that receives the copies/links (created if missing).
    """
    os.makedirs(dst_dir, exist_ok=True)
    for fn in os.listdir(src_dir):
        src_file = os.path.join(src_dir, fn)
        dst_file = os.path.join(dst_dir, fn)
        if os.name == "nt":
            shutil.copyfile(src_file, dst_file)
            continue
        # os.path.exists() follows symlinks, so a dangling link looks "absent"
        # and os.symlink() would then fail with FileExistsError. Replace such
        # stale links instead.
        if os.path.islink(dst_file) and not os.path.exists(dst_file):
            os.remove(dst_file)
        if not os.path.lexists(dst_file):
            os.symlink(os.path.abspath(src_file), dst_file)
49 | 
50 | 
def link_all_keras_resources():
    """Make the read-only Keras datasets and models available under ~/.keras."""
    resource_dirs = (
        ("../readonly/keras/datasets/", "~/.keras/datasets"),
        ("../readonly/keras/models/", "~/.keras/models"),
    )
    for source_dir, keras_dir in resource_dirs:
        link_all_files_from_dir(source_dir, os.path.expanduser(keras_dir))
54 | 
55 | 
def link_week_3_resources():
    """Make the files of ../readonly/week3/ available in the current directory."""
    week_dir = "../readonly/week3/"
    link_all_files_from_dir(week_dir, ".")
58 | 
59 | 
def link_week_4_resources():
    """Make the files of ../readonly/week4/ available in the current directory."""
    week_dir = "../readonly/week4/"
    link_all_files_from_dir(week_dir, ".")
62 | 
63 | 
def link_week_6_resources():
    """Make the files of ../readonly/week6/ available in the current directory."""
    week_dir = "../readonly/week6/"
    link_all_files_from_dir(week_dir, ".")
66 | 


--------------------------------------------------------------------------------
/intro-to-dle/grading.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import re
 4 | import requests
 5 | import json
 6 | 
 7 | 
 8 | class Grader(object):
 9 |     def __init__(self, assignment_key, all_parts=()):
10 |         """
11 |         Assignment key is the way to tell Coursera which problem is being submitted.
12 |         """
13 |         self.submission_page = \
14 |             'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
15 |         self.assignment_key = assignment_key
16 |         self.answers = {part: None for part in all_parts}
17 | 
18 |     def submit(self, email, token):
19 |         submission = {
20 |                     "assignmentKey": self.assignment_key,
21 |                     "submitterEmail": email,
22 |                     "secret": token,
23 |                     "parts": {}
24 |         }
25 |         for part, output in self.answers.items():
26 |             if output is not None:
27 |                 submission["parts"][part] = {"output": output}
28 |             else:
29 |                 submission["parts"][part] = dict()
30 |         request = requests.post(self.submission_page, data=json.dumps(submission))
31 |         response = request.json()
32 |         if request.status_code == 201:
33 |             print('Submitted to Coursera platform. See results on assignment page!')
34 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
35 |             print(response[u'details'][u'learnerMessage'])
36 |         else:
37 |             print("Unknown response from Coursera: {}".format(request.status_code))
38 |             print(response)
39 | 
40 |     def set_answer(self, part, answer):
41 |         """Adds an answer for submission. Answer is expected either as string, number, or
42 |            an iterable of numbers.
43 |            Args:
44 |               part - str, assignment part id
45 |               answer - answer to submit. If non iterable, appends repr(answer). If string,
46 |                 is appended as provided. If an iterable and not string, converted to
47 |                 space-delimited repr() of members.
48 |         """
49 |         if isinstance(answer, str):
50 |             self.answers[part] = answer
51 |         else:
52 |             try:
53 |                 self.answers[part] = " ".join(map(repr, answer))
54 |             except TypeError:
55 |                 self.answers[part] = repr(answer)
56 | 
57 | 
def array_to_grader(array, epsilon=1e-4):
    """Utility function to help preparing Coursera grading conditions descriptions.
    Args:
       array: iterable of numbers, the correct answers
       epsilon: the generated expression will accept the answers with this absolute
         difference with provided values
    Returns:
       String. Space-separated conditions, one per element: `[v, v]` for an `int`
         element, `(v - epsilon, v + epsilon)` for anything else."""
    def _condition(value):
        # ints must match exactly; everything else gets an open tolerance interval
        if isinstance(value, int):
            return "[{0}, {0}]".format(value)
        return "({0}, {1})".format(value - epsilon, value + epsilon)

    return " ".join(_condition(value) for value in array)
74 | 


--------------------------------------------------------------------------------
/intro-to-dle/keras_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import keras
 4 | import tqdm
 5 | from collections import defaultdict
 6 | import numpy as np
 7 | 
 8 | 
 9 | class TqdmProgressCallback(keras.callbacks.Callback):
10 | 
11 |     def on_train_begin(self, logs=None):
12 |         self.epochs = self.params['epochs']
13 | 
14 |     def on_epoch_begin(self, epoch, logs=None):
15 |         print('Epoch %d/%d' % (epoch + 1, self.epochs))
16 |         if "steps" in self.params:
17 |             self.use_steps = True
18 |             self.target = self.params['steps']
19 |         else:
20 |             self.use_steps = False
21 |             self.target = self.params['samples']
22 |         self.prog_bar = tqdm.tqdm_notebook(total=self.target)
23 |         self.log_values_by_metric = defaultdict(list)
24 | 
25 |     def _set_prog_bar_desc(self, logs):
26 |         for k in self.params['metrics']:
27 |             if k in logs:
28 |                 self.log_values_by_metric[k].append(logs[k])
29 |         desc = "; ".join("{0}: {1:.3f}".format(k, np.mean(values)) for k, values in self.log_values_by_metric.items())
30 |         self.prog_bar.set_description(desc)
31 | 
32 |     def on_batch_end(self, batch, logs=None):
33 |         logs = logs or {}
34 |         if self.use_steps:
35 |             self.prog_bar.update(1)
36 |         else:
37 |             batch_size = logs.get('size', 0)
38 |             self.prog_bar.update(batch_size)
39 |         self._set_prog_bar_desc(logs)
40 | 
41 |     def on_epoch_end(self, epoch, logs=None):
42 |         logs = logs or {}
43 |         self._set_prog_bar_desc(logs)
44 |         self.prog_bar.update(1)  # workaround to show description
45 |         self.prog_bar.close()
46 | 


--------------------------------------------------------------------------------
/intro-to-dle/misc/np_convolution.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.signal import convolve2d
 3 | my_array = np.array([[0, 0, 0, 0],
 4 |                      [0, 1, 0, 0],
 5 |                      [0, 0, 0, 0],
 6 |                      [0, 0, 0, 0]])
 7 | 
 8 | kernel = np.array([[0, 1, 0],
 9 |                    [1, 1, 1],
10 |                    [0, 1, 0]])
11 | 
12 | convolved = convolve2d(my_array, kernel, mode="same")
13 | print(convolved)
14 | 


--------------------------------------------------------------------------------
/intro-to-dle/week1/kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/kernel.png


--------------------------------------------------------------------------------
/intro-to-dle/week1/sgd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/sgd.png


--------------------------------------------------------------------------------
/intro-to-dle/week1/target.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/target.npy


--------------------------------------------------------------------------------
/intro-to-dle/week1/train.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week1/train.npy


--------------------------------------------------------------------------------
/intro-to-dle/week2/datasets:
--------------------------------------------------------------------------------
1 | /home/jose/.keras/datasets/


--------------------------------------------------------------------------------
/intro-to-dle/week2/matplotlib_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from IPython.display import clear_output, display_html, HTML
 4 | import contextlib
 5 | import time
 6 | import io
 7 | import urllib
 8 | import base64
 9 | 
10 | 
class SimpleMovieWriter(object):
    """
    Minimal movie "writer" that renders each frame inline in the notebook
    output instead of writing a file.

    Usage example:
        anim = animation.FuncAnimation(...)
        anim.save(None, writer=SimpleMovieWriter(sleep=0.01))
    """
    def __init__(self, sleep=0.1):
        # seconds to pause after displaying each frame
        self.sleep = sleep

    def setup(self, fig):
        """Remember the figure whose frames will be grabbed."""
        self.fig = fig

    def grab_frame(self, **kwargs):
        """Render the figure as JPEG and display it in place of the previous output.

        Keyword arguments are accepted and ignored.
        """
        # A bare `import urllib` does not load the `request` submodule, so
        # `urllib.request.quote` only worked if something else had imported it.
        # `quote` lives in urllib.parse; import it explicitly.
        from urllib.parse import quote

        # `with` guarantees the buffer is released even if savefig raises.
        with io.BytesIO() as img_data:
            self.fig.savefig(img_data, format='jpeg')
            encoded = base64.b64encode(img_data.getvalue())
        uri = 'data:image/jpeg;base64,' + quote(encoded)
        clear_output(wait=True)
        display_html(HTML('<img src="' + uri + '">'))
        time.sleep(self.sleep)

    @contextlib.contextmanager
    def saving(self, fig, *args, **kwargs):
        """Context manager that calls `setup(fig)` and yields this writer;
        extra arguments are ignored and there is nothing to finalize on exit."""
        self.setup(fig)
        try:
            yield self
        finally:
            pass
40 | 


--------------------------------------------------------------------------------
/intro-to-dle/week2/models:
--------------------------------------------------------------------------------
1 | /home/jose/.keras/models/


--------------------------------------------------------------------------------
/intro-to-dle/week2/preprocessed_mnist.py:
--------------------------------------------------------------------------------
 1 | import keras
 2 | 
 3 | 
 4 | def load_dataset(flatten=False):
 5 |     (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()
 6 | 
 7 |     # normalize x
 8 |     X_train = X_train.astype(float) / 255.
 9 |     X_test = X_test.astype(float) / 255.
10 | 
11 |     # we reserve the last 10000 training examples for validation
12 |     X_train, X_val = X_train[:-10000], X_train[-10000:]
13 |     y_train, y_val = y_train[:-10000], y_train[-10000:]
14 | 
15 |     if flatten:
16 |         X_train = X_train.reshape([X_train.shape[0], -1])
17 |         X_val = X_val.reshape([X_val.shape[0], -1])
18 |         X_test = X_test.reshape([X_test.shape[0], -1])
19 | 
20 |     return X_train, y_train, X_val, y_val, X_test, y_test
21 | 
22 | 
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/intro-to-dle/week2/submit.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.metrics import mean_squared_error
 3 | import sys
 4 | sys.path.append("..")
 5 | import grading
 6 | 
 7 | 
 8 | def submit_mse(compute_mse, email, token):
 9 |     ASSIGNMENT_KEY = "SBaWP48eEeeGSBKyliRlgg"
10 |     PART_KEY = "u2t7D"
11 | 
12 |     # First, do rigorous local testing to help the learner
13 |     for n in [1, 5, 10, 10**3]:
14 |         elems = [np.arange(n), np.arange(n, 0, -1), np.zeros(n),
15 |                  np.ones(n), np.random.random(n), np.random.randint(100, size=n)]
16 |         for el in elems:
17 |             for el_2 in elems:
18 |                 true_mse = np.array(mean_squared_error(el, el_2))
19 |                 my_mse = compute_mse(el, el_2)
20 |                 if not np.allclose(true_mse, my_mse):
21 |                     print('mse(%s,%s)' % (el, el_2))
22 |                     print("should be: %f, but your function returned %f" % (true_mse, my_mse))
23 |                     raise ValueError('Wrong result')
24 |     # Second, submit some reference values. There is nothing preventing the learner from
25 |     # manually submitting numbers computed not via tensorflow, so there is little point
26 |     # in comprehensive server-side testing
27 |     test_pairs = (
28 |         (np.array([
29 |             0.85415937, 0.768366, 0.9763879, 0.11861405, 0.21219242]),
30 |          np.array([0.27163543, 0.14893905, 0.84616464,
31 |                    0.86294942, 0.65509213])),
32 |         (np.array([1, 2, 3]), np.array([3, 2, 2])),
33 |         (np.array([1]), np.array([1])))
34 |     answers = []
35 |     for pair in test_pairs:
36 |         answers.append(compute_mse(pair[0], pair[1]))
37 |     grader = grading.Grader(ASSIGNMENT_KEY)
38 |     grader.set_answer(PART_KEY, answers)
39 |     grader.submit(email, token)
40 | 


--------------------------------------------------------------------------------
/intro-to-dle/week2/util.py:
--------------------------------------------------------------------------------
 1 | """Some auxiliary files used for honor track numpy assignment"""
 2 | import numpy as np
 3 | 
 4 | 
 5 | def eval_numerical_gradient(f, x, verbose=False, h=0.00001):
 6 |     """Evaluates gradient df/dx via finite differences:
 7 |     df/dx ~ (f(x+h) - f(x-h)) / 2h
 8 |     Adopted from https://github.com/ddtm/dl-course/ (our ysda course).
 9 |     """
10 |     fx = f(x) # evaluate function value at original point
11 |     grad = np.zeros_like(x)
12 |     # iterate over all indexes in x
13 |     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
14 |     while not it.finished:
15 | 
16 |         # evaluate function at x+h
17 |         ix = it.multi_index
18 |         oldval = x[ix]
19 |         x[ix] = oldval + h # increment by h
20 |         fxph = f(x) # evalute f(x + h)
21 |         x[ix] = oldval - h
22 |         fxmh = f(x) # evaluate f(x - h)
23 |         x[ix] = oldval # restore
24 | 
25 |         # compute the partial derivative with centered formula
26 |         grad[ix] = (fxph - fxmh) / (2 * h) # the slope
27 |         if verbose:
28 |             print (ix, grad[ix])
29 |         it.iternext() # step to next dimension
30 | 
31 |     return grad
32 | 


--------------------------------------------------------------------------------
/intro-to-dle/week3/grading_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import re
 4 | 
 5 | def model_total_params(model):
 6 |     """
 7 |     Total params for Keras model
 8 |     """
 9 |     summary = []
10 |     model.summary(print_fn=lambda x: summary.append(x))
11 |     for line in summary:
12 |         m = re.match("Total params: ([\d,]+)", line)
13 |         if m:
14 |             return int(re.sub(",", "", m.groups()[0]))
15 |     return 0
16 | 


--------------------------------------------------------------------------------
/intro-to-dle/week3/imagelabels.mat:
--------------------------------------------------------------------------------
1 | /home/jose/Escritorio/advanced_deep_learning/intro-to-dl/readonly/week3/imagelabels.mat


--------------------------------------------------------------------------------
/intro-to-dle/week3/images/inceptionv3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week3/images/inceptionv3.png


--------------------------------------------------------------------------------
/intro-to-dle/week3/weights.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week3/weights.p


--------------------------------------------------------------------------------
/intro-to-dle/week4/lfw_dataset.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import os
 3 | import cv2
 4 | import pandas as pd
 5 | import tarfile
 6 | import tqdm
 7 | 
 8 | 
 9 | ATTRS_NAME = "lfw_attributes.txt"  # http://www.cs.columbia.edu/CAVE/databases/pubfig/download/lfw_attributes.txt
10 | IMAGES_NAME = "lfw-deepfunneled.tgz"  # http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz
11 | RAW_IMAGES_NAME = "lfw.tgz"  # http://vis-www.cs.umass.edu/lfw/lfw.tgz
12 | 
13 | 
def decode_image_from_raw_bytes(raw_bytes):
    """Decode an encoded image (raw file bytes) and return it in RGB channel order."""
    encoded = np.asarray(bytearray(raw_bytes), dtype=np.uint8)
    bgr_pixels = cv2.imdecode(encoded, 1)
    # the decoded channels are converted from BGR to RGB before returning
    return cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
18 | 
19 | 
def load_lfw_dataset(
        use_raw=False,
        dx=80, dy=80,
        dimx=45, dimy=45):
    """Load LFW photos together with their attribute rows.

    Args:
        use_raw: read ``RAW_IMAGES_NAME`` instead of ``IMAGES_NAME``.
        dx, dy: number of pixels cut from each horizontal / vertical border.
        dimx, dimy: size passed to ``cv2.resize`` after cropping.

    Returns:
        ``(all_photos, all_attrs)``: a stacked uint8 array of the kept photos
        and a DataFrame of attributes without the ``person`` and ``imagenum``
        columns.
    """
    # attribute table: values are re-labelled with the header shifted by one column
    raw_attrs = pd.read_csv(ATTRS_NAME, sep='\t', skiprows=1)
    df_attrs = pd.DataFrame(raw_attrs.iloc[:, :-1].values, columns=raw_attrs.columns[1:])
    imgs_with_attrs = {tuple(row) for row in df_attrs[["person", "imagenum"]].values}

    archive_name = RAW_IMAGES_NAME if use_raw else IMAGES_NAME
    all_photos = []
    photo_ids = []

    with tarfile.open(archive_name) as archive:
        for member in tqdm.tqdm_notebook(archive.getmembers()):
            if not (member.isfile() and member.name.endswith(".jpg")):
                continue
            # decode, cut the borders off, shrink
            picture = decode_image_from_raw_bytes(archive.extractfile(member).read())
            picture = picture[dy:-dy, dx:-dx]
            picture = cv2.resize(picture, (dimx, dimy))
            # file name looks like "<person words>_<number>.jpg"
            base_name = os.path.split(member.name)[-1]
            tokens = base_name[:-4].replace('_', ' ').split()
            person_id = ' '.join(tokens[:-1])
            photo_number = int(tokens[-1])
            if (person_id, photo_number) in imgs_with_attrs:
                all_photos.append(picture)
                photo_ids.append({'person': person_id, 'imagenum': photo_number})

    photo_ids = pd.DataFrame(photo_ids)
    all_photos = np.stack(all_photos).astype('uint8')

    # merging from photo_ids keeps the photo order
    merged = photo_ids.merge(df_attrs, on=('person', 'imagenum'))
    all_attrs = merged.drop(["person", "imagenum"], axis=1)

    return all_photos, all_attrs
57 | 


--------------------------------------------------------------------------------
/intro-to-dle/week4/submit.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | sys.path.append("..")
 3 | import grading
 4 | 
 5 | 
 6 | # code_size = 71
 7 | # img_shape = (38, 38, 3)
 8 | def submit_autoencoder(submission, score, email, token):
 9 |     grader = grading.Grader("9TShnp1JEeeGGAoCUnhvuA")
10 |     encoder, decoder = submission
11 |     grader.set_answer("FtBSK", encoder.output_shape[1])
12 |     grader.set_answer("83Glu", decoder.output_shape[1:])
13 |     grader.set_answer("fnM1K", score)
14 |     grader.submit(email, token)
15 | 


--------------------------------------------------------------------------------
/intro-to-dle/week5/data_copyright:
--------------------------------------------------------------------------------
1 | @names
2 | # Copyright (c) January 1991 by Mark Kantrowitz.
3 | # Thanks to Bill Ross for about 1000 additional names.
4 | # Version 1.3 (29-MAR-94)
5 | 
6 | @mtg cards
7 | https://mtgjson.com/
8 | 
9 | 


--------------------------------------------------------------------------------
/intro-to-dle/week5/rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week5/rnn.png


--------------------------------------------------------------------------------
/intro-to-dle/week5/submit.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | sys.path.append("..")
 4 | import grading
 5 | 
 6 | 
 7 | # code_size = 71
 8 | # img_shape = (38, 38, 3)
 9 | def submit_char_rnn(submission, email, token):
10 |     grader = grading.Grader("cULEpp2NEeemQBKZKgu93A")
11 |     history, samples = submission
12 |     assert len(samples) == 25
13 |     grader.set_answer("pttMO", int(np.mean(history[:10]) > np.mean(history[-10:])))
14 |     grader.set_answer("uly0D", len(set(samples)))
15 |     grader.submit(email, token)
16 | 


--------------------------------------------------------------------------------
/intro-to-dle/week6/grading_utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import numpy as np
 4 | import random
 5 | 
 6 | 
 7 | def test_vocab(vocab, PAD, UNK, START, END):
 8 |     return [
 9 |         len(vocab),
10 |         len(np.unique(list(vocab.values()))),
11 |         int(all([_ in vocab for _ in [PAD, UNK, START, END]]))
12 |     ]
13 | 
14 | 
15 | def test_captions_indexing(train_captions_indexed, vocab, UNK):
16 |     starts = set()
17 |     ends = set()
18 |     between = set()
19 |     unk_count = 0
20 |     for caps in train_captions_indexed:
21 |         for cap in caps:
22 |             starts.add(cap[0])
23 |             between.update(cap[1:-1])
24 |             ends.add(cap[-1])
25 |             for w in cap:
26 |                 if w == vocab[UNK]:
27 |                     unk_count += 1
28 |     return [
29 |         len(starts),
30 |         len(ends),
31 |         len(between),
32 |         len(between | starts | ends),
33 |         int(all([isinstance(x, int) for x in (between | starts | ends)])),
34 |         unk_count
35 |     ]
36 | 
37 | 
38 | def test_captions_batching(batch_captions_to_matrix):
39 |     return (batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=None).ravel().tolist()
40 |             + batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=2).ravel().tolist()
41 |             + batch_captions_to_matrix([[1, 2, 3], [4, 5]], -1, max_len=10).ravel().tolist())
42 | 
43 | 
def get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab):
    """Build a random feed dict keyed by ``decoder.img_embeds`` and ``decoder.sentences``.

    Embeddings are uniform floats of shape (32, IMG_EMBED_SIZE); sentences are
    integers in [0, len(vocab)) of shape (32, 20).
    """
    feed = {}
    # the two draws stay in this order so a seeded run gives the same values
    feed[decoder.img_embeds] = np.random.random((32, IMG_EMBED_SIZE))
    feed[decoder.sentences] = np.random.randint(0, len(vocab), (32, 20))
    return feed
49 | 
50 | 
def test_decoder_shapes(decoder, IMG_EMBED_SIZE, vocab, s):
    """Evaluate several decoder tensors on random inputs and return all their shape dimensions in one flat list."""
    tensors_to_test = (
        decoder.h0,
        decoder.word_embeds,
        decoder.flat_hidden_states,
        decoder.flat_token_logits,
        decoder.flat_ground_truth,
        decoder.flat_loss_mask,
        decoder.loss,
    )
    all_shapes = []
    for tensor in tensors_to_test:
        # a fresh random feed dict is generated for every tensor
        value = s.run(tensor, feed_dict=get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab))
        all_shapes += list(value.shape)
    return all_shapes
66 | 
67 | 
def test_random_decoder_loss(decoder, IMG_EMBED_SIZE, vocab, s):
    """Return ``decoder.loss`` evaluated by ``s`` on a random feed dict."""
    return s.run(
        decoder.loss,
        feed_dict=get_feed_dict_for_testing(decoder, IMG_EMBED_SIZE, vocab),
    )
71 | 
72 | 
73 | def test_validation_loss(decoder, s, generate_batch, val_img_embeds, val_captions_indexed):
74 |     np.random.seed(300)
75 |     random.seed(300)
76 |     val_loss = 0
77 |     for _ in range(1000):
78 |         val_loss += s.run(decoder.loss, generate_batch(val_img_embeds,
79 |                                                        val_captions_indexed,
80 |                                                        32,
81 |                                                        20))
82 |     val_loss /= 1000.
83 |     return val_loss
84 | 


--------------------------------------------------------------------------------
/intro-to-dle/week6/images/encoder_decoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/encoder_decoder.png


--------------------------------------------------------------------------------
/intro-to-dle/week6/images/encoder_decoder_explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/encoder_decoder_explained.png


--------------------------------------------------------------------------------
/intro-to-dle/week6/images/inceptionv3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/intro-to-dle/week6/images/inceptionv3.png


--------------------------------------------------------------------------------
/intro-to-dle/week6/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | import os
  4 | import queue
  5 | import threading
  6 | import zipfile
  7 | import tqdm
  8 | import cv2
  9 | import numpy as np
 10 | import pickle
 11 | 
 12 | 
 13 | def image_center_crop(img):
 14 |     h, w = img.shape[0], img.shape[1]
 15 |     pad_left = 0
 16 |     pad_right = 0
 17 |     pad_top = 0
 18 |     pad_bottom = 0
 19 |     if h > w:
 20 |         diff = h - w
 21 |         pad_top = diff - diff // 2
 22 |         pad_bottom = diff // 2
 23 |     else:
 24 |         diff = w - h
 25 |         pad_left = diff - diff // 2
 26 |         pad_right = diff // 2
 27 |     return img[pad_top:h-pad_bottom, pad_left:w-pad_right, :]
 28 | 
 29 | 
 30 | def decode_image_from_buf(buf):
 31 |     img = cv2.imdecode(np.asarray(bytearray(buf), dtype=np.uint8), 1)
 32 |     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 33 |     return img
 34 | 
 35 | 
 36 | def crop_and_preprocess(img, input_shape, preprocess_for_model):
 37 |     img = image_center_crop(img)  # take center crop
 38 |     img = cv2.resize(img, input_shape)  # resize for our model
 39 |     img = img.astype("float32")  # prepare for normalization
 40 |     img = preprocess_for_model(img)  # preprocess for model
 41 |     return img
 42 | 
 43 | 
 44 | def apply_model(zip_fn, model, preprocess_for_model, extensions=(".jpg",), input_shape=(224, 224), batch_size=32):
 45 |     # queue for cropped images
 46 |     q = queue.Queue(maxsize=batch_size * 10)
 47 | 
 48 |     # when read thread put all images in queue
 49 |     read_thread_completed = threading.Event()
 50 | 
 51 |     # time for read thread to die
 52 |     kill_read_thread = threading.Event()
 53 | 
 54 |     def reading_thread(zip_fn):
 55 |         zf = zipfile.ZipFile(zip_fn)
 56 |         for fn in tqdm.tqdm_notebook(zf.namelist()):
 57 |             if kill_read_thread.is_set():
 58 |                 break
 59 |             if os.path.splitext(fn)[-1] in extensions:
 60 |                 buf = zf.read(fn)  # read raw bytes from zip for fn
 61 |                 img = decode_image_from_buf(buf)  # decode raw bytes
 62 |                 img = crop_and_preprocess(img, input_shape, preprocess_for_model)
 63 |                 while True:
 64 |                     try:
 65 |                         q.put((os.path.split(fn)[-1], img), timeout=1)  # put in queue
 66 |                     except queue.Full:
 67 |                         if kill_read_thread.is_set():
 68 |                             break
 69 |                         continue
 70 |                     break
 71 | 
 72 |         read_thread_completed.set()  # read all images
 73 | 
 74 |     # start reading thread
 75 |     t = threading.Thread(target=reading_thread, args=(zip_fn,))
 76 |     t.daemon = True
 77 |     t.start()
 78 | 
 79 |     img_fns = []
 80 |     img_embeddings = []
 81 | 
 82 |     batch_imgs = []
 83 | 
 84 |     def process_batch(batch_imgs):
 85 |         batch_imgs = np.stack(batch_imgs, axis=0)
 86 |         batch_embeddings = model.predict(batch_imgs)
 87 |         img_embeddings.append(batch_embeddings)
 88 | 
 89 |     try:
 90 |         while True:
 91 |             try:
 92 |                 fn, img = q.get(timeout=1)
 93 |             except queue.Empty:
 94 |                 if read_thread_completed.is_set():
 95 |                     break
 96 |                 continue
 97 |             img_fns.append(fn)
 98 |             batch_imgs.append(img)
 99 |             if len(batch_imgs) == batch_size:
100 |                 process_batch(batch_imgs)
101 |                 batch_imgs = []
102 |             q.task_done()
103 |         # process last batch
104 |         if len(batch_imgs):
105 |             process_batch(batch_imgs)
106 |     finally:
107 |         kill_read_thread.set()
108 |         t.join()
109 | 
110 |     q.join()
111 | 
112 |     img_embeddings = np.vstack(img_embeddings)
113 |     return img_embeddings, img_fns
114 | 
115 | 
def save_pickle(obj, fn):
    """Serialize ``obj`` into the file ``fn`` using the highest pickle protocol."""
    with open(fn, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
119 | 
120 | 
def read_pickle(fn):
    """Load and return the object pickled in the file ``fn``.

    Unpickling can execute arbitrary code, so only open files you trust.
    """
    with open(fn, "rb") as source:
        loaded = pickle.load(source)
    return loaded
124 | 


--------------------------------------------------------------------------------
/natural-language-processing/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # Data for assignments
104 | data/
105 | 
106 | week3/starSpaceModel*
107 | 


--------------------------------------------------------------------------------
/natural-language-processing/AWS-tutorial.md:
--------------------------------------------------------------------------------
 1 | # Tutorial for setting up an AWS Virtual Machine
 2 | 
 3 | This tutorial will teach you how to set up an AWS Virtual Machine for the final project of our course. 
 4 | 
 5 | ### 1. Register with AWS and launch an EC2 instance
 6 | 
 7 | First, you need to perform several preparatory steps (if you have already done this before, you can skip them):
 8 | - [Sign up for AWS](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#sign-up-for-aws). You will need to specify your credit card details, but for our project we will use Free Tier instances only, so you should not be charged.
 9 | - [Create a key pair for authentication](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-key-pair). If you use Windows, you will also need to install [PuTTY](https://www.chiark.greenend.org.uk/~sgtatham/putty/) to use SSH.
10 | - [Create security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#create-a-base-security-group). You must add rules to a security group to allow you to connect to your future instance from your IP address using SSH. You might want to allow SSH access from all IPv4 addresses (set to 0.0.0.0/0), because your IP might change.
11 | 
12 | Next, you are ready to create your first EC2 instance:
13 | - [Launch a free tier instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance). For Amazon Machine Image (AMI) on step 3 choose **Ubuntu Server 16.04 LTS**.
14 | - [Connect to your instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-connect-to-instance-linux) using SSH.
15 | - Later on you can [start and stop](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Stop_Start.html) your instance when needed, and [terminate](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-clean-up-your-instance) it in the end.
16 | 
17 | ### 2. Set up dependencies and run your project
18 | 
19 | - Install Docker container for Ubuntu with course dependencies. Follow our Docker instructions.
20 | 
21 | - To be able to access IPython notebooks running on AWS, you might want to SSH with port tunneling:
22 | ```sh
23 | ssh -L 8080:localhost:8080 -i path/to/private_key ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com
24 | ```
25 | Then you will be able to see the notebooks on *localhost:8080* from your browser on the local machine.
26 | 
27 | - Bring code and data to AWS instance, e.g.
28 | ```sh
29 | scp -i path/to/your_key.pem path/to/local_file ubuntu@ec2-XX-XXX-X-XX.us-east-2.compute.amazonaws.com:path/to/remote_file
30 | ``` 
31 | You might want to install [WinSCP](https://winscp.net/eng/docs/lang:ru) for data transfer if you are using Windows.
32 | 
33 | - It is also a good practice to use [tmux](https://medium.com/@peterxjang/a-minimalist-guide-to-tmux-13675fb160fa) to keep your remote session running even if you disconnect from the machine, e.g. by closing your laptop.
34 | 
35 | 


--------------------------------------------------------------------------------
/natural-language-processing/Docker-tutorial.md:
--------------------------------------------------------------------------------
  1 | # Docker container with course dependencies
  2 | 
  3 | This file describes how to use a Docker container with Jupyter notebook and
  4 | all dependencies required for the course.
  5 | 
  6 | The image is located at https://hub.docker.com/r/akashin/coursera-aml-nlp/.
  7 | 
  8 | ## Install Stable Docker Community Edition (CE)
  9 | 
 10 | - For Mac: 
 11 | https://docs.docker.com/docker-for-mac/install/
 12 | 
 13 | - For Ubuntu: 
 14 | https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/ (see also other Linux distributions in the menu).
 15 | 
 16 | - For Windows (64bit Windows 10 Pro, Enterprise and Education):
 17 | https://docs.docker.com/docker-for-windows/install/ 
 18 | 
 19 | - For Windows (older versions):
 20 | https://docs.docker.com/toolbox/toolbox_install_windows/
 21 | 
 22 | 
 23 | 
 24 | ## Get container image
 25 | 
 26 | To get the latest version of the container image run:
 27 | ```sh
 28 | docker pull akashin/coursera-aml-nlp
 29 | ```
 30 | It contains the Ubuntu 16.04 Linux distribution and all the dependencies that you need for our course. The downloaded image takes approximately 2.3GB. 
 31 | 
 32 | **Note:** If you are getting an error "Got permission denied while trying to connect to the Docker daemon socket...", you need to add current user to the docker group:
 33 | ```sh
 34 | sudo usermod -a -G docker $USER
 35 | sudo service docker restart
 36 | ```
 37 | Then you need to log out and log in to the system again (disconnect from and reconnect to your AWS instance if you are setting up Docker on it).
 38 | 
 39 | 
 40 | ## Run container for the first time
 41 | 
 42 | Now you can start new container from this image with:
 43 | ```sh
 44 | docker run -it -p 127.0.0.1:8080:8080 --name coursera-aml-nlp akashin/coursera-aml-nlp
 45 | ```
 46 | This will start the Ubuntu instance and give you access to its command line. You can type `run_notebook` to launch the IPython notebook server. 
 47 | 
 48 | You may find it useful to mount a directory from your local machine within the container using `-v` option:
 49 | ```sh
 50 | docker run -it -p 127.0.0.1:8080:8080 --name coursera-aml-nlp -v $PWD:/root/coursera akashin/coursera-aml-nlp
 51 | ```
 52 | This will use the shell variable `$PWD` to mount the current directory to the folder `/root/coursera` in the container. Alternatively, you can mount an arbitrary directory by replacing `$PWD` with a custom path.
 53 | 
 54 | ## Stop and resume container
 55 | 
 56 | To stop the container use:
 57 | ```sh
 58 | docker stop coursera-aml-nlp
 59 | ```
 60 | All the changes that were made within container will be saved.
 61 | 
 62 | To resume the stopped container use:
 63 | ```sh
 64 | docker start -i coursera-aml-nlp
 65 | ```
 66 | ## Other operations on the container
 67 | 
 68 | There are many other operations that you can perform on the container, to show all of them:
 69 | ```sh
 70 | docker container
 71 | ```
 72 | Some particularly useful would be **showing a list of containers** and **removing container**.
 73 | 
 74 | To show currently running and stopped containers with their status:
 75 | ```sh
 76 | docker ps -a
 77 | ```
 78 | 
 79 | To remove the container and all data associated with it:
 80 | ```sh
 81 | docker rm coursera-aml-nlp
 82 | ```
 83 | Note that this will remove all the internal data of the container (e.g. installed packages), but the data written inside your local mounted folder (`-v` option) will not be affected.
 84 | 
 85 | ## Install more packages
 86 | 
 87 | You can install more packages in the container if needed:
 88 | ```sh
 89 | docker exec coursera-aml-nlp pip3 install PACKAGE_NAME
 90 | ```
 91 | 
 92 | ## Further reading
 93 | 
 94 | If you would like to know more about Docker, check out these articles: 
 95 | - Using Jupyter notebook from Docker: https://www.dataquest.io/blog/docker-data-science/
 96 | - General introduction to Docker: https://docker-curriculum.com/
 97 | 
 98 | 
 99 | ## Credits
100 | 
101 | The template for this dockerfile was taken from https://github.com/ZEMUSHKA/coursera-aml-docker
102 | 


--------------------------------------------------------------------------------
/natural-language-processing/common/README.md:
--------------------------------------------------------------------------------
1 | # Common utils
2 | 
3 | This folder stores a collection of functions that are common to different assignments.
4 | 
5 | - `download_utils.py`: Functions for downloading data for the assignments.
6 | 


--------------------------------------------------------------------------------
/natural-language-processing/common/download_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | import os
  4 | import shutil
  5 | import tqdm
  6 | # Address problem in tqdm library. For details see: https://github.com/tqdm/tqdm/issues/481
  7 | tqdm.monitor_interval = 0
  8 | import requests
  9 | 
 10 | REPOSITORY_PATH="https://github.com/hse-aml/natural-language-processing"
 11 | 
 12 | 
 13 | def download_file(url, file_path):
 14 |     r = requests.get(url, stream=True)
 15 |     total_size = int(r.headers.get('content-length'))
 16 |     try:
 17 |         with open(file_path, 'wb', buffering=16*1024*1024) as f:
 18 |             bar = tqdm.tqdm_notebook(total=total_size, unit='B', unit_scale=True)
 19 |             bar.set_description(os.path.split(file_path)[-1])
 20 |             for chunk in r.iter_content(32 * 1024):
 21 |                 f.write(chunk)
 22 |                 bar.update(len(chunk))
 23 |             bar.close()
 24 |     except Exception:
 25 |         print("Download failed")
 26 |     finally:
 27 |         if os.path.getsize(file_path) != total_size:
 28 |             os.remove(file_path)
 29 |             print("Removed incomplete download")
 30 | 
 31 | 
 32 | def download_from_github(version, fn, target_dir, force=False):
 33 |     url = REPOSITORY_PATH + "/releases/download/{0}/{1}".format(version, fn)
 34 |     file_path = os.path.join(target_dir, fn)
 35 |     if os.path.exists(file_path) and not force:
 36 |         print("File {} is already downloaded.".format(file_path))
 37 |         return
 38 |     download_file(url, file_path)
 39 | 
 40 | 
 41 | def sequential_downloader(version, fns, target_dir, force=False):
 42 |     os.makedirs(target_dir, exist_ok=True)
 43 |     for fn in fns:
 44 |         download_from_github(version, fn, target_dir, force=force)
 45 | 
 46 | 
 47 | def link_all_files_from_dir(src_dir, dst_dir):
 48 |     os.makedirs(dst_dir, exist_ok=True)
 49 |     for fn in os.listdir(src_dir):
 50 |         src_file = os.path.join(src_dir, fn)
 51 |         dst_file = os.path.join(dst_dir, fn)
 52 |         if os.name == "nt":
 53 |             shutil.copyfile(src_file, dst_file)
 54 |         else:
 55 |             if not os.path.exists(dst_file):
 56 |                 os.symlink(os.path.abspath(src_file), dst_file)
 57 | 
 58 | 
 59 | def link_resources():
 60 |     link_all_files_from_dir("../readonly/dataset/", ".")
 61 | 
 62 | 
 63 | def download_week1_resources(force=False):
 64 |     sequential_downloader(
 65 |         "week1",
 66 |         [
 67 |             "train.tsv",
 68 |             "validation.tsv",
 69 |             "test.tsv",
 70 |             "text_prepare_tests.tsv",
 71 |         ],
 72 |         "data",
 73 |         force=force
 74 |     )
 75 | 
 76 | 
 77 | def download_week2_resources(force=False):
 78 |     sequential_downloader(
 79 |         "week2",
 80 |         [
 81 |             "train.txt",
 82 |             "validation.txt",
 83 |             "test.txt",
 84 |         ],
 85 |         "data",
 86 |         force=force
 87 |     )
 88 | 
 89 | 
 90 | def download_week3_resources(force=False):
 91 |     sequential_downloader(
 92 |         "week3",
 93 |         [
 94 |             "train.tsv",
 95 |             "validation.tsv",
 96 |             "test.tsv",
 97 |             "test_embeddings.tsv",
 98 |         ],
 99 |         "data",
100 |         force=force
101 |     )
102 | 
103 | 
def download_project_resources(force=False):
    """Download the ``project`` release files into the ``data`` directory."""
    project_files = [
        "dialogues.tsv",
        "tagged_posts.tsv",
    ]
    sequential_downloader("project", project_files, "data", force=force)
114 | 


--------------------------------------------------------------------------------
/natural-language-processing/docker/Dockerfile:
--------------------------------------------------------------------------------
# Image with the course dependencies: Python 3, Starspace and a Jupyter
# notebook server listening on port 8080.
FROM ubuntu:16.04
LABEL maintainer="Andrei Kashin <kashin.andrej@gmail.com>"

# System packages: Python, build toolchain and a few command-line utilities.
RUN apt-get update && apt-get install -yq \
                        python3 python3-pip htop nano git wget \
                        libglib2.0-0 autoconf automake \
                        libtool build-essential unzip \
                        libarchive-dev vim

# Install Starspace.
# Boost 1.63.0 sources are unpacked under /usr/local/bin
# (presumably where the Starspace Makefile looks for Boost — confirm).
# NOTE(review): dl.bintray.com appears to have been shut down; verify that this
# URL still resolves and switch to a current Boost download location if not.
RUN wget https://dl.bintray.com/boostorg/release/1.63.0/source/boost_1_63_0.zip && \
    unzip boost_1_63_0.zip && \
    mv boost_1_63_0 /usr/local/bin

# Build Starspace from the default branch and copy the binary onto the PATH.
RUN git clone https://github.com/facebookresearch/Starspace.git && \
    cd Starspace && \
    make && \
    cp -Rf starspace /usr/local/bin

# Install Python dependencies.
ADD requirements.txt /
RUN pip3 install --upgrade pip
RUN pip3 install -r requirements.txt

# Install Jupyter.
# Enable notebook extensions, then configure the server: listen on every
# interface, port 8080, empty access token (no authentication).
RUN jupyter nbextension enable --py --sys-prefix widgetsnbextension
RUN jupyter contrib nbextension install
RUN jupyter nbextension enable codefolding/main
RUN echo "c.NotebookApp.ip = '*'" >> /root/.jupyter/jupyter_notebook_config.py
RUN echo "c.NotebookApp.port = 8080" >> /root/.jupyter/jupyter_notebook_config.py
RUN echo "c.NotebookApp.token = ''" >> /root/.jupyter/jupyter_notebook_config.py
# `run_notebook` helper command that starts the notebook server as root.
RUN echo "jupyter notebook --no-browser --allow-root" >> /usr/local/bin/run_notebook && chmod +x /usr/local/bin/run_notebook

# Welcome message.
# The message becomes /etc/motd, and bash.bashrc is extended to print it.
ADD welcome_message.txt /
RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/motd' \
        >> /etc/bash.bashrc \
        ; cat welcome_message.txt > /etc/motd

WORKDIR /root
EXPOSE 8080
42 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/LSTM reply.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stderr",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "/home/jose/scratch/venv/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 |       "  from ._conv import register_converters as _register_converters\n"
14 |      ]
15 |     }
16 |    ],
17 |    "source": []
18 |   },
19 |   {
20 |    "cell_type": "code",
21 |    "execution_count": 2,
22 |    "metadata": {},
23 |    "outputs": [
24 |     {
25 |      "name": "stdout",
26 |      "output_type": "stream",
27 |      "text": [
28 |       "[nltk_data] Downloading package stopwords to /home/jose/nltk_data...\n",
29 |       "[nltk_data]   Package stopwords is already up-to-date!\n"
30 |      ]
31 |     }
32 |    ],
33 |    "source": []
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 17,
38 |    "metadata": {},
39 |    "outputs": [
40 |     {
41 |      "data": {
42 |       "text/plain": [
43 |        "'Hello something im going to go'"
44 |       ]
45 |      },
46 |      "execution_count": 17,
47 |      "metadata": {},
48 |      "output_type": "execute_result"
49 |     }
50 |    ],
51 |    "source": []
52 |   }
53 |  ],
54 |  "metadata": {
55 |   "kernelspec": {
56 |    "display_name": "Python 3",
57 |    "language": "python",
58 |    "name": "python3"
59 |   },
60 |   "language_info": {
61 |    "codemirror_mode": {
62 |     "name": "ipython",
63 |     "version": 3
64 |    },
65 |    "file_extension": ".py",
66 |    "mimetype": "text/x-python",
67 |    "name": "python",
68 |    "nbconvert_exporter": "python",
69 |    "pygments_lexer": "ipython3",
70 |    "version": "3.6.3"
71 |   }
72 |  },
73 |  "nbformat": 4,
74 |  "nbformat_minor": 2
75 | }
76 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/README.md:
--------------------------------------------------------------------------------
1 | # Utils to download and read data for chat-bot training
2 | 
3 | This folder contains scripts for downloading, reading and preprocessing data for chat-bot training:
4 | - `download_cornell.sh` - downloads Cornell movie dialogues dataset (small size)
5 | - `download_opensubs.sh` - downloads Opensubs movie subtitles dataset (large size)
6 | - `datasets.py` - module to be imported in your scripts, that exports functions for reading a dataset
7 | - `example.py` - example of reading the dataset
8 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/datasets.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2015 Conchylicultor. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | # ==============================================================================
 15 | 
 16 | import ast
 17 | import os
 18 | import random
 19 | import re
 20 | from time import time
 21 | 
 22 | import nltk
 23 | from tqdm import tqdm
 24 | 
 25 | """
 26 | Load the cornell movie dialog corpus.
 27 | 
 28 | Available from here:
 29 | http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
 30 | 
 31 | """
 32 | 
 33 | class CornellData:
 34 |     """
 35 | 
 36 |     """
 37 | 
 38 |     def __init__(self, dirName):
 39 |         """
 40 |         Args:
 41 |             dirName (string): directory where to load the corpus
 42 |         """
 43 |         self.lines = {}
 44 |         self.conversations = []
 45 | 
 46 |         MOVIE_LINES_FIELDS = ["lineID","characterID","movieID","character","text"]
 47 |         MOVIE_CONVERSATIONS_FIELDS = ["character1ID","character2ID","movieID","utteranceIDs"]
 48 | 
 49 |         self.lines = self.loadLines(os.path.join(dirName, "movie_lines.txt"), MOVIE_LINES_FIELDS)
 50 |         self.conversations = self.loadConversations(os.path.join(dirName, "movie_conversations.txt"), MOVIE_CONVERSATIONS_FIELDS)
 51 | 
 52 |         # TODO: Cleaner program (merge copy-paste) !!
 53 | 
 54 |     def loadLines(self, fileName, fields):
 55 |         """
 56 |         Args:
 57 |             fileName (str): file to load
 58 |             field (set<str>): fields to extract
 59 |         Return:
 60 |             dict<dict<str>>: the extracted fields for each line
 61 |         """
 62 |         lines = {}
 63 | 
 64 |         with open(fileName, 'r', encoding='iso-8859-1') as f:  # TODO: Solve Iso encoding pb !
 65 |             for line in f:
 66 |                 values = line.split(" +++$+++ ")
 67 | 
 68 |                 # Extract fields
 69 |                 lineObj = {}
 70 |                 for i, field in enumerate(fields):
 71 |                     lineObj[field] = values[i]
 72 | 
 73 |                 lines[lineObj['lineID']] = lineObj
 74 | 
 75 |         return lines
 76 | 
 77 |     def loadConversations(self, fileName, fields):
 78 |         """
 79 |         Args:
 80 |             fileName (str): file to load
 81 |             field (set<str>): fields to extract
 82 |         Return:
 83 |             list<dict<str>>: the extracted fields for each line
 84 |         """
 85 |         conversations = []
 86 | 
 87 |         with open(fileName, 'r', encoding='iso-8859-1') as f:  # TODO: Solve Iso encoding pb !
 88 |             for line in f:
 89 |                 values = line.split(" +++$+++ ")
 90 | 
 91 |                 # Extract fields
 92 |                 convObj = {}
 93 |                 for i, field in enumerate(fields):
 94 |                     convObj[field] = values[i]
 95 | 
 96 |                 # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
 97 |                 lineIds = ast.literal_eval(convObj["utteranceIDs"])
 98 | 
 99 |                 # Reassemble lines
100 |                 convObj["lines"] = []
101 |                 for lineId in lineIds:
102 |                     convObj["lines"].append(self.lines[lineId])
103 | 
104 |                 conversations.append(convObj)
105 | 
106 |         return conversations
107 | 
108 |     def getConversations(self):
109 |         return self.conversations
110 | 
111 | 
112 | # Based on code from https://github.com/AlJohri/OpenSubtitles
113 | # by Al Johri <al.johri@gmail.com>
114 | 
115 | import xml.etree.ElementTree as ET
116 | import datetime
117 | import os
118 | import sys
119 | import json
120 | import re
121 | import pprint
122 | 
123 | from gzip import GzipFile
124 | 
125 | """
126 | Load the opensubtitles dialog corpus.
127 | """
128 | 
class OpensubsData:
    """Loader that turns OpenSubtitles XML files into two-line conversations.

    After construction ``self.conversations`` is a list of dicts of the form
    ``{"lines": [{"text": ...}, {"text": ...}]}``, one per pair of consecutive
    subtitle sentences that follow each other within one second.
    """

    def __init__(self, dirName):
        """
        Args:
            dirName (string): directory where to load the corpus
        """

        # Hack this to filter on subset of Opensubtitles
        # dirName = "%s/en/Action" % dirName

        print("Loading OpenSubtitles conversations in %s." % dirName)
        self.conversations = []
        # Matches XML/HTML comments and tags so they can be stripped from text.
        self.tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
        self.conversations = self.loadConversations(dirName)

    def loadConversations(self, dirName):
        """
        Args:
            dirName (str): folder to load
        Return:
            list<dict>: the extracted two-line conversations
        """
        conversations = []
        dirList = self.filesInDir(dirName)
        for filepath in tqdm(dirList, "OpenSubtitles data files"):
            if filepath.endswith('gz'):
                try:
                    doc = self.getXML(filepath)
                    conversations.extend(self.genList(doc))
                except ValueError:
                    # e.g. a timestamp that does not match the expected format
                    tqdm.write("Skipping file %s with errors." % filepath)
                except BaseException:
                    # Report what went wrong, then let the error propagate.
                    print("Unexpected error:", sys.exc_info()[0])
                    raise
        return conversations

    def getConversations(self):
        """Return the conversations collected by the constructor."""
        return self.conversations

    def genList(self, tree):
        """Build conversation pairs from one parsed subtitle document.

        Args:
            tree: parsed XML document exposing ``getroot()``
        Return:
            list<dict>: ``{"lines": [first, second]}`` for each pair of
                consecutive sentences whose gap is at most one second
        """
        root = tree.getroot()

        timeFormat = '%H:%M:%S'
        maxDelta = datetime.timedelta(seconds=1)

        startTime = datetime.datetime.min
        strbuf = ''
        sentList = []

        for child in root:
            for elem in child:
                if elem.tag == 'time':
                    elemID = elem.attrib['id']
                    # Drop the last four characters of the value before parsing
                    # (presumably the ",mmm" millisecond suffix).
                    elemVal = elem.attrib['value'][:-4]
                    if elemID[-1] == 'S':
                        startTime = datetime.datetime.strptime(elemVal, timeFormat)
                    else:
                        endTime = datetime.datetime.strptime(elemVal, timeFormat)
                        sentList.append((strbuf.strip(), startTime, endTime))
                        strbuf = ''
                elif elem.text is not None:
                    # Elements without text are skipped explicitly instead of
                    # relying on a swallowed TypeError.
                    strbuf = strbuf + " " + elem.text

        conversations = []
        for cur, nxt in zip(sentList, sentList[1:]):
            # nxt[1] is the next sentence's start, cur[2] the current one's end.
            if nxt[1] - cur[2] <= maxDelta:
                pair = {"lines": [self.getLine(cur[0]), self.getLine(nxt[0])]}
                if self.filter(pair):
                    conversations.append(pair)

        return conversations

    def getLine(self, sentence):
        """Return ``{"text": ...}`` with tags removed, ``\\'`` unescaped,
        surrounding whitespace stripped and the text lower-cased."""
        line = {}
        line["text"] = self.tag_re.sub('', sentence).replace('\\\'', '\'').strip().lower()
        return line

    def filter(self, lines):
        """Decide whether a conversation pair is kept; currently always True."""
        # Use the following to customize filtering of QA pairs
        #
        # startwords = ("what", "how", "when", "why", "where", "do", "did", "is", "are", "can", "could", "would", "will")
        # question = lines["lines"][0]["text"]
        # if not question.endswith('?'):
        #     return False
        # if not question.split(' ')[0] in startwords:
        #     return False
        #
        return True

    def getXML(self, filepath):
        """Parse ``filepath`` as XML, transparently decompressing ``.gz`` files.

        The gzip stream is closed once parsing has finished.
        """
        fext = os.path.splitext(filepath)[1]
        if fext == '.gz':
            with GzipFile(filename=filepath) as stream:
                return ET.parse(stream)
        return ET.parse(filepath)

    def filesInDir(self, dirname):
        """Return the paths of all files found under ``dirname``, recursively."""
        return [
            os.path.join(dirpath, filename)
            for dirpath, dirs, files in os.walk(dirname)
            for filename in files
        ]
243 | 
244 | 
245 | def extractText(line, fast_preprocessing=True):
246 |     if fast_preprocessing:
247 |         GOOD_SYMBOLS_RE = re.compile('[^0-9a-z ]')
248 |         REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#+_]')
249 |         REPLACE_SEVERAL_SPACES = re.compile('\s+')
250 | 
251 |         line = line.lower()
252 |         line = REPLACE_BY_SPACE_RE.sub(' ', line)
253 |         line = GOOD_SYMBOLS_RE.sub('', line)
254 |         line = REPLACE_SEVERAL_SPACES.sub(' ', line)
255 |         return line.strip()
256 |     else:
257 |         return nltk.word_tokenize(line)
258 | 
259 | 
def splitConversations(conversations, max_len=20, fast_preprocessing=True):
    """Turn conversations into (request, reply) pairs of adjacent lines.

    Args:
        conversations: iterable of dicts whose ``'lines'`` entry is a list of
            dicts with a ``'text'`` key
        max_len (int): maximum ``len()`` allowed for both request and reply;
            empty results are dropped as well
        fast_preprocessing (bool): forwarded to ``extractText`` (it was
            previously accepted but silently ignored)
    Return:
        list<tuple>: the retained (request, reply) pairs
    """
    data = []
    for conversation in tqdm(conversations):
        lines = conversation['lines']
        for pos in range(len(lines) - 1):
            request = extractText(lines[pos]['text'], fast_preprocessing)
            reply = extractText(lines[pos + 1]['text'], fast_preprocessing)
            if 0 < len(request) <= max_len and 0 < len(reply) <= max_len:
                data.append((request, reply))
    return data
270 | 
271 | 
def readCornellData(path, max_len=20, fast_preprocessing=True):
    """Load the Cornell corpus from ``path`` and split it into (request, reply) pairs."""
    return splitConversations(
        CornellData(path).getConversations(),
        max_len=max_len,
        fast_preprocessing=fast_preprocessing,
    )
276 | 
277 | 
def readOpensubsData(path, max_len=20, fast_preprocessing=True):
    """Load the OpenSubtitles corpus from ``path`` and split it into (request, reply) pairs."""
    return splitConversations(
        OpensubsData(path).getConversations(),
        max_len=max_len,
        fast_preprocessing=fast_preprocessing,
    )
282 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/dialogue_manager.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin
 3 | 
 4 | from chatterbot import ChatBot
 5 | from utils import *
 6 | import tfmodel
 7 | from tfmodel import *
 8 | 
 9 | class ThreadRanker(object):
10 |     def __init__(self, paths):
11 |         self.word_embeddings, self.embeddings_dim = load_embeddings(
12 |             paths['WORD_EMBEDDINGS'])
13 |         self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']
14 | 
15 |     def __load_embeddings_by_tag(self, tag_name):
16 |         embeddings_path = os.path.join(
17 |             self.thread_embeddings_folder, tag_name + ".pkl")
18 |         thread_ids, thread_embeddings = unpickle_file(embeddings_path)
19 |         return thread_ids, thread_embeddings
20 | 
21 |     def get_best_thread(self, question, tag_name):
22 |         """ Returns id of the most similar thread for the question.
23 |             The search is performed across the threads with a given tag.
24 |         """
25 |         thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
26 | 
27 |         # HINT: you have already implemented a similar routine in the 3rd assignment.
28 |        
29 |         question_vec = question_to_vec(
30 |             question, self.word_embeddings, self.embeddings_dim).reshape(-1,self.embeddings_dim)
31 |        
32 |         
33 |         best_thread = pairwise_distances_argmin(
34 |             question_vec, thread_embeddings, metric = "cosine")[0]
35 |               
36 |         return thread_ids.values[best_thread]
37 | 
38 | 
class DialogueManager(object):
    """Routes a question either to the chit-chat model or to StackOverflow search.

    Intent is predicted from TF-IDF features. 'dialogue' questions are answered
    by the restored TensorFlow model; everything else is answered with a link
    to the best matching thread for the predicted tag.
    """

    def __init__(self, paths):
        """Load every resource the manager needs.

        Args:
            paths: mapping with the 'INTENT_RECOGNIZER', 'TFIDF_VECTORIZER' and
                'TAG_CLASSIFIER' pickle paths plus the entries ThreadRanker reads.
        """
        print("Loading resources...")
        # This line was tab-indented inside a space-indented body, which
        # Python 3 rejects with a TabError.
        self.sess = load_model()

        # Intent recognition:
        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])

        self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s'

        # Goal-oriented part:
        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
        self.thread_ranker = ThreadRanker(paths)

    def create_chitchat_bot(self):
        """Initializes self.chitchat_bot with some conversational model."""

        # Hint: you might want to create and train chatterbot.ChatBot here.
        # It could be done by creating ChatBot with the *trainer* parameter equals
        # "chatterbot.trainers.ChatterBotCorpusTrainer"
        # and then calling *train* function with "chatterbot.corpus.english" param

        self.chitchat_bot = ChatBot('Botty McBotFace',
                                    trainer='chatterbot.trainers.ChatterBotCorpusTrainer')

        # Train based on the english corpus
        self.chitchat_bot.train("chatterbot.corpus.english")

    def generate_answer(self, question):
        """Combines stackoverflow and chitchat parts using intent recognition.

        Args:
            question (str): raw user message.
        Returns:
            The model's reply for 'dialogue' intents, otherwise ANSWER_TEMPLATE
            filled with the predicted tag and the best thread id.
        """

        # Recognize intent of the question using `intent_recognizer`.
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])
        intent = self.intent_recognizer.predict(features)[0]

        # Chit-chat part:
        if intent == 'dialogue':
            # Answer the user's actual question (it used to be the constant
            # "hi") with the session restored in __init__ (it used to be the
            # undefined name `sess`). `reply` prepares the text itself.
            return reply(question, word2id, max_len, id2word, self.sess)

        # Goal-oriented part:
        # Pass features to tag_classifier to get predictions.
        tag = self.tag_classifier.predict(features)

        # Pass prepared_question to thread_ranker to get predictions.
        thread_id = self.thread_ranker.get_best_thread(prepared_question, tag[0])

        return self.ANSWER_TEMPLATE % (tag[0], thread_id)
96 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/download_cornell.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Downloads the Cornell movie dialogues files into data/cornell.

# Stop at the first failure so a failed mkdir/cd never leaves the downloads
# in the wrong directory and a failed download is not silently ignored.
set -euo pipefail

mkdir -p data/cornell
cd data/cornell
wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_conversations.txt
wget https://github.com/Conchylicultor/DeepQA/raw/master/data/cornell/movie_lines.txt
7 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/download_opensubs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Downloads and unpacks the OpenSubtitles archive into data/opensubs.

# Stop at the first failure: without this a failed cd would run tar/rm in the
# caller's directory, and a failed download would still be "extracted".
set -euo pipefail

mkdir -p data/opensubs
cd data/opensubs
# The URL is quoted because it contains '?', which the shell treats as a glob character.
wget -O en.tar.gz "http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz"
tar -xf en.tar.gz
rm en.tar.gz
8 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/example.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import datasets
 4 | import argparse
 5 | import os
 6 | 
 7 | def main():
 8 |     parser = argparse.ArgumentParser()
 9 |     parser.add_argument("dataset", choices=["cornell", "opensubs"], help="Name of the dataset.")
10 |     parser.add_argument("--max_len", type=int, default=10, help="Max length of sentences to consider.")
11 |     args = parser.parse_args()
12 | 
13 |     dataset_path = os.path.join("data", args.dataset)
14 |     if args.dataset == "cornell":
15 |         data = datasets.readCornellData(dataset_path, max_len=args.max_len)
16 |     elif args.dataset == "opensubs":
17 |         data = datasets.readOpensubsData(dataset_path, max_len=args.max_len)
18 |     else:
19 |         raise ValueError("Unrecognized dataset: {!r}".format(args.dataset))
20 | 
21 |     print("Size of dataset: {}".format(len(data)))
22 |     print("First 10 training pairs:")
23 |     for item in data[:10]:
24 |         print(item)
25 | 
26 | if __name__ == "__main__":
27 |     main()
28 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/main_bot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import requests
  4 | import time
  5 | import argparse
  6 | import os
  7 | import json
  8 | import tensorflow as tf
  9 | from requests.compat import urljoin
 10 | import dialogue_manager
 11 | from utils import *
 12 | 
 13 | 
 14 | 
 15 | from dialogue_manager import DialogueManager
 16 | 
 17 | class BotHandler(object):
 18 |     """
 19 |         BotHandler is a class which implements all back-end of the bot.
 20 |         It has tree main functions:
 21 |             'get_updates' — checks for new messages
 22 |             'send_message' – posts new message to user
 23 |             'get_answer' — computes the most relevant on a user's question
 24 |     """
 25 | 
 26 |     def __init__(self, token, dialogue_manager):
 27 |         self.token = token
 28 |         self.api_url = "https://api.telegram.org/bot{}/".format(token)
 29 |         self.dialogue_manager = dialogue_manager
 30 | 
 31 |     def get_updates(self, offset=None, timeout=30):
 32 |         params = {"timeout": timeout, "offset": offset}
 33 |         raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params)
 34 |         try:
 35 |             resp = raw_resp.json()
 36 |         except json.decoder.JSONDecodeError as e:
 37 |             print("Failed to parse response {}: {}.".format(raw_resp.content, e))
 38 |             return []
 39 | 
 40 |         if "result" not in resp:
 41 |             return []
 42 |         return resp["result"]
 43 | 
 44 |     def send_message(self, chat_id, text):
 45 |         params = {"chat_id": chat_id, "text": text}
 46 |         return requests.post(urljoin(self.api_url, "sendMessage"), params)
 47 | 
 48 |     def get_answer(self, question):
 49 |         if question == '/start':
 50 |             return "Hi, I am your project bot. How can I help you today?"
 51 |         return self.dialogue_manager.generate_answer(question)
 52 | 
 53 | 
 54 | def parse_args():
 55 |     parser = argparse.ArgumentParser()
 56 |     parser.add_argument('--token', type=str, default='')
 57 |     return parser.parse_args()
 58 | 
 59 | 
 60 | def is_unicode(text):
 61 |     return len(text) == len(text.encode())
 62 | 
 63 | 
 64 | class SimpleDialogueManager(object):
 65 |     """
 66 |     This is the simplest dialogue manager to test the telegram bot.
 67 |     Your task is to create a more advanced one in dialogue_manager.py."
 68 |     """
 69 |     
 70 |     def generate_answer(self, question): 
 71 |         return "Hello, world!" 
 72 |         
 73 | 
 74 | def main():
 75 |     args = parse_args()
 76 |     token = args.token
 77 | 
 78 |     if not token:
 79 |         if not "TELEGRAM_TOKEN" in os.environ:
 80 |             print("Please, set bot token through --token or TELEGRAM_TOKEN env variable")
 81 |             return
 82 |         token = os.environ["TELEGRAM_TOKEN"]
 83 | 
 84 |     #################################################################
 85 |     
 86 |     # Your task is to complete dialogue_manager.py and use your 
 87 |     # advanced DialogueManager instead of SimpleDialogueManager. 
 88 |     
 89 |     # This is the point where you plug it into the Telegram bot. 
 90 |     # Do not forget to import all needed dependencies when you do so.
 91 |     
 92 |    # simple_manager = SimpleDialogueManager()
 93 |     advanced_manager = DialogueManager(RESOURCE_PATH)
 94 |     advanced_manager.create_chitchat_bot()
 95 |     bot = BotHandler(token, advanced_manager)
 96 |     
 97 |     ###############################################################
 98 | 
 99 |     print("Ready to talk!")
100 |     offset = 0
101 |     while True:
102 |         updates = bot.get_updates(offset=offset)
103 |         for update in updates:
104 |             print("An update received.")
105 |             if "message" in update:
106 |                 chat_id = update["message"]["chat"]["id"]
107 |                 if "text" in update["message"]:
108 |                     text = update["message"]["text"]
109 |                     if is_unicode(text):
110 |                         print("Update content: {}".format(update))
111 |                         bot.send_message(chat_id, bot.get_answer(update["message"]["text"]))
112 |                     else:
113 |                         bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...")
114 |             offset = max(offset, update['update_id'] + 1)
115 |         time.sleep(1)
116 | 
117 | if __name__ == "__main__":
118 |     main()
119 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/tfmodel.py:
--------------------------------------------------------------------------------
 1 | end_symbol = '$'
 2 | padding_symbol = '#'
 3 | start_symbol="^"
 4 | word2id = {symbol:i for i, symbol in enumerate('^$#abcdefghijklmnopqrstuvwxyz 0123456789+-')}
 5 | id2word = {i:symbol for symbol, i in word2id.items()}
 6 | max_len = 30
 7 | def load_model():
 8 | 	sess = tf.Session()
 9 | 	new_saver = tf.train.import_meta_graph('model/chatbot_model.meta')
10 | 	new_saver.restore(sess, tf.train.latest_checkpoint('model/'))
11 | 	sess.run(tf.local_variables_initializer())
12 | 	return sess
def reply(question, word2id, max_len, id2word, session):
    """Generate the model's answer to ``question``.

    Args:
        question (str): raw user text; it is passed through ``text_prepare``.
        word2id (dict): symbol -> id mapping.
        max_len (int): padded length of the encoded question.
        id2word (dict): id -> symbol mapping.
        session: TensorFlow session holding the restored graph.
    Returns:
        str: the decoded prediction with '$' removed, capitalized.
    """
    # Imported here because this module has no top-level imports; these names
    # were previously unresolved and raised NameError on the first call.
    import numpy as np
    import tensorflow as tf
    from utils import ids_to_sentence, sentence_to_ids, text_prepare

    graph = tf.get_default_graph()
    input_batch = graph.get_tensor_by_name("input_batch:0")
    input_batch_len = graph.get_tensor_by_name("input_batch_lengths:0")
    infer_predictions = graph.get_tensor_by_name("decode_1/decoder/transpose_1:0")

    question = text_prepare(question)
    # text_prepare keeps symbols such as '_' that the vocabulary does not
    # contain; drop them so user input cannot raise KeyError during encoding.
    question = "".join(symbol for symbol in question if symbol in word2id)

    ids, ids_len = sentence_to_ids(question, word2id, padded_len=max_len)
    ids = np.array(ids).reshape(1, len(ids))
    ids_len = np.array(ids_len).reshape(1)

    predictions = session.run(
        [infer_predictions],
        feed_dict={input_batch: ids, input_batch_len: ids_len},
    )[0]
    return "".join(ids_to_sentence(predictions[0], id2word)).replace("$", "").capitalize()
28 | 


--------------------------------------------------------------------------------
/natural-language-processing/honor/utils.py:
--------------------------------------------------------------------------------
  1 | import nltk
  2 | import pickle
  3 | import re
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | nltk.download('stopwords')
  8 | from nltk.corpus import stopwords
  9 | 
 10 | # Paths for all resources for the bot.
 11 | RESOURCE_PATH = {
 12 |     'INTENT_RECOGNIZER': 'intent_recognizer.pkl',
 13 |     'TAG_CLASSIFIER': 'tag_classifier.pkl',
 14 |     'TFIDF_VECTORIZER': 'tfidf_vectorizer.pkl',
 15 |     'THREAD_EMBEDDINGS_FOLDER': 'thread_embeddings_by_tags',
 16 |     'WORD_EMBEDDINGS': 'word_embeddings.tsv',
 17 | }
 18 | 
 19 | 
 20 | def text_prepare(text):
 21 |     """Performs tokenization and simple preprocessing."""
 22 |     
 23 |     replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
 24 |     bad_symbols_re = re.compile('[^0-9a-z #+_]')
 25 |     stopwords_set = set(stopwords.words('english'))
 26 | 
 27 |     text = text.lower()
 28 |     text = replace_by_space_re.sub(' ', text)
 29 |     text = bad_symbols_re.sub('', text)
 30 |     text = ' '.join([x for x in text.split() if x and x not in stopwords_set])
 31 | 
 32 |     return text.strip()
 33 | 
 34 | 
 35 | def load_embeddings(embeddings_path):
 36 | 	"""Loads pre-trained word embeddings from tsv file.
 37 | 
 38 | 	Args:
 39 | 	embeddings_path - path to the embeddings file.
 40 | 
 41 | 	Returns:
 42 | 	embeddings - dict mapping words to vectors;
 43 | 	embeddings_dim - dimension of the vectors.
 44 | 	"""
 45 | 
 46 | 	# Hint: you have already implemented a similar routine in the 3rd assignment.
 47 | 	# Note that here you also need to know the dimension of the loaded embeddings.
 48 | 
 49 | 	embeds = pd.read_csv(embeddings_path,sep="\t",header=None)
 50 | 	vals=embeds.iloc[:,1:].values
 51 | 	index=embeds.iloc[:,0].values
 52 | 	embeddings= {i:j for i,j in zip(index,vals)}
 53 | 	return embeddings,vals.shape[1]
 54 | 
 55 |        
 56 | def question_to_vec(question, embeddings, dim):
 57 |     """Transforms a string to an embedding by averaging word embeddings."""
 58 |     
 59 |     # Hint: you have already implemented exactly this function in the 3rd assignment.
 60 | 
 61 |     if question == "":
 62 |         return np.zeros(dim)
 63 |     t = np.array([embeddings[i]
 64 |                   for i in question.split() if i in embeddings.keys()])
 65 |     if len(t) == 0:
 66 |         return np.zeros(dim)
 67 | 
 68 |     return(t.mean(axis=0))
 69 | 
 70 | 
 71 | def unpickle_file(filename):
 72 |     """Returns the result of unpickling the file content."""
 73 |     with open(filename, 'rb') as f:
 74 |         return pickle.load(f)
 75 | def sentence_to_ids(sentence, word2id, padded_len):
 76 |     """ Converts a sequence of symbols to a padded sequence of their ids.
 77 |     
 78 |       sentence: a string, input/output sequence of symbols.
 79 |       word2id: a dict, a mapping from original symbols to ids.
 80 |       padded_len: an integer, a desirable length of the sequence.
 81 | 
 82 |       result: a tuple of (a list of ids, an actual length of sentence).
 83 |     """
 84 |     
 85 |     sent_ids = [word2id[i] for i in sentence]
 86 |     sent_len = len(sent_ids[:padded_len-1])+1
 87 |     sent_ids = sent_ids[:padded_len-1]+[word2id["$"]]+[word2id["#"]]*(padded_len-len(sent_ids)-1)
 88 |     
 89 |     return (sent_ids, sent_len)
 90 | def ids_to_sentence(ids, id2word):
 91 |     """ Converts a sequence of ids to a sequence of symbols.
 92 |     
 93 |           ids: a list, indices for the padded sequence.
 94 |           id2word:  a dict, a mapping from ids to original symbols.
 95 | 
 96 |           result: a list of symbols.
 97 |     """
 98 |  
 99 |     return [id2word[i] for i in ids] 
def batch_to_ids(sentences, word2id, max_len):
    """Prepares batches of indices.

       Sequences are padded to match the longest sequence in the batch
       (plus one position for the end symbol); if that is longer than
       max_len, then max_len is used instead.

        sentences: a list of strings, original sequences.
        word2id: a dict, a mapping from original symbols to ids.
        max_len: an integer, max len of sequences allowed.

        result: a list of lists of ids, a list of actual lengths.
    """
    longest = max(len(s) for s in sentences)
    width = min(longest + 1, max_len)

    encoded = [sentence_to_ids(sentence, word2id, width) for sentence in sentences]
    batch_ids = [ids for ids, _ in encoded]
    batch_ids_len = [length for _, length in encoded]
    return batch_ids, batch_ids_len
def generate_batches(samples, batch_size=64):
    """Yield (X, Y) lists of up to ``batch_size`` items taken from the (x, y) pairs
    in ``samples``; a final, shorter batch is yielded when items remain."""
    inputs, targets = [], []
    seen = 0
    for x, y in samples:
        seen += 1
        inputs.append(x)
        targets.append(y)
        if seen % batch_size == 0:
            yield inputs, targets
            inputs, targets = [], []
    if inputs and targets:
        yield inputs, targets
def reply(question, word2id, max_len, model, id2word, session=None):
    """Generate the model's answer to ``question``.

    Args:
        question (str): input sequence of symbols, all present in ``word2id``.
        word2id (dict): symbol -> id mapping.
        max_len (int): padded length of the encoded question.
        model: object exposing ``predict_for_batch(session, ids, ids_len)``.
        id2word (dict): id -> symbol mapping.
        session: session handed to ``model.predict_for_batch``. The function
            used to read an undefined name ``session``; when this argument is
            omitted, a module-level ``session`` is used if one exists.
    Returns:
        str: the decoded prediction with '$' removed, capitalized.
    Raises:
        ValueError: if no session is supplied and none is defined at module level.
    """
    if session is None:
        session = globals().get("session")
    if session is None:
        raise ValueError("reply() needs a session: pass it via the `session` argument")

    ids, ids_len = sentence_to_ids(question, word2id, padded_len=max_len)
    ids = np.array(ids).reshape(1, len(ids))
    ids_len = np.array(ids_len).reshape(1)

    predictions = model.predict_for_batch(session, ids, ids_len)
    return "".join(ids_to_sentence(predictions[0], id2word)).replace("$", "").capitalize()
138 | 


--------------------------------------------------------------------------------
/natural-language-processing/project/dialogue_manager.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from sklearn.metrics.pairwise import pairwise_distances_argmin
 3 | 
 4 | from chatterbot import ChatBot
 5 | from utils import *
 6 | 
 7 | 
 8 | class ThreadRanker(object):
 9 |     def __init__(self, paths):
10 |         self.word_embeddings, self.embeddings_dim = load_embeddings(
11 |             paths['WORD_EMBEDDINGS'])
12 |         self.thread_embeddings_folder = paths['THREAD_EMBEDDINGS_FOLDER']
13 | 
14 |     def __load_embeddings_by_tag(self, tag_name):
15 |         embeddings_path = os.path.join(
16 |             self.thread_embeddings_folder, tag_name + ".pkl")
17 |         thread_ids, thread_embeddings = unpickle_file(embeddings_path)
18 |         return thread_ids, thread_embeddings
19 | 
20 |     def get_best_thread(self, question, tag_name):
21 |         """ Returns id of the most similar thread for the question.
22 |             The search is performed across the threads with a given tag.
23 |         """
24 |         thread_ids, thread_embeddings = self.__load_embeddings_by_tag(tag_name)
25 | 
26 |         # HINT: you have already implemented a similar routine in the 3rd assignment.
27 |        
28 |         question_vec = question_to_vec(
29 |             question, self.word_embeddings, self.embeddings_dim).reshape(-1,self.embeddings_dim)
30 |        
31 |         
32 |         best_thread = pairwise_distances_argmin(
33 |             question_vec, thread_embeddings, metric = "cosine")[0]
34 |               
35 |         return thread_ids.values[best_thread]
36 | 
37 | 
class DialogueManager(object):
    """Routes a question either to the chit-chat bot or to the thread search."""

    def __init__(self, paths):
        """Unpickle the classifiers/vectorizer named in `paths` and build the ranker.

        `create_chitchat_bot` must be called separately before a question
        classified as 'dialogue' can be answered.
        """
        print("Loading resources...")

        # Intent recognition models.
        self.intent_recognizer = unpickle_file(paths['INTENT_RECOGNIZER'])
        self.tfidf_vectorizer = unpickle_file(paths['TFIDF_VECTORIZER'])

        # Filled with (tag, thread id) for goal-oriented answers.
        self.ANSWER_TEMPLATE = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s'

        # Goal-oriented models.
        self.tag_classifier = unpickle_file(paths['TAG_CLASSIFIER'])
        self.thread_ranker = ThreadRanker(paths)

    def create_chitchat_bot(self):
        """Initializes self.chitchat_bot with some conversational model."""
        bot = ChatBot(
            'Botty McBotFace',
            trainer='chatterbot.trainers.ChatterBotCorpusTrainer',
        )
        self.chitchat_bot = bot

        # Train on the English corpus.
        bot.train("chatterbot.corpus.english")

    def generate_answer(self, question):
        """Combines stackoverflow and chitchat parts using intent recognition."""
        # The same TF-IDF features feed both the intent and the tag classifier.
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])
        intent = self.intent_recognizer.predict(features)[0]

        if intent != 'dialogue':
            # Goal-oriented part: predict the tag, then find the closest
            # thread among those stored for that tag.
            tag = self.tag_classifier.predict(features)[0]
            thread_id = self.thread_ranker.get_best_thread(prepared_question, tag)
            return self.ANSWER_TEMPLATE % (tag, thread_id)

        # Chit-chat part: the raw (unprepared) question goes to the bot.
        return self.chitchat_bot.get_response(question)
94 | 


--------------------------------------------------------------------------------
/natural-language-processing/project/main_bot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import requests
  4 | import time
  5 | import argparse
  6 | import os
  7 | import json
  8 | 
  9 | from requests.compat import urljoin
 10 | import dialogue_manager
 11 | 
 12 | from dialogue_manager import DialogueManager
 13 | from utils import *
 14 | class BotHandler(object):
 15 |     """
 16 |         BotHandler is a class which implements all back-end of the bot.
 17 |         It has tree main functions:
 18 |             'get_updates' — checks for new messages
 19 |             'send_message' – posts new message to user
 20 |             'get_answer' — computes the most relevant on a user's question
 21 |     """
 22 | 
 23 |     def __init__(self, token, dialogue_manager):
 24 |         self.token = token
 25 |         self.api_url = "https://api.telegram.org/bot{}/".format(token)
 26 |         self.dialogue_manager = dialogue_manager
 27 | 
 28 |     def get_updates(self, offset=None, timeout=30):
 29 |         params = {"timeout": timeout, "offset": offset}
 30 |         raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params)
 31 |         try:
 32 |             resp = raw_resp.json()
 33 |         except json.decoder.JSONDecodeError as e:
 34 |             print("Failed to parse response {}: {}.".format(raw_resp.content, e))
 35 |             return []
 36 | 
 37 |         if "result" not in resp:
 38 |             return []
 39 |         return resp["result"]
 40 | 
 41 |     def send_message(self, chat_id, text):
 42 |         params = {"chat_id": chat_id, "text": text}
 43 |         return requests.post(urljoin(self.api_url, "sendMessage"), params)
 44 | 
 45 |     def get_answer(self, question):
 46 |         if question == '/start':
 47 |             return "Hi, I am your project bot. How can I help you today?"
 48 |         return self.dialogue_manager.generate_answer(question)
 49 | 
 50 | 
 51 | def parse_args():
 52 |     parser = argparse.ArgumentParser()
 53 |     parser.add_argument('--token', type=str, default='')
 54 |     return parser.parse_args()
 55 | 
 56 | 
 57 | def is_unicode(text):
 58 |     return len(text) == len(text.encode())
 59 | 
 60 | 
 61 | class SimpleDialogueManager(object):
 62 |     """
 63 |     This is the simplest dialogue manager to test the telegram bot.
 64 |     Your task is to create a more advanced one in dialogue_manager.py."
 65 |     """
 66 |     
 67 |     def generate_answer(self, question): 
 68 |         return "Hello, world!" 
 69 |         
 70 | 
 71 | def main():
 72 |     args = parse_args()
 73 |     token = args.token
 74 | 
 75 |     if not token:
 76 |         if not "TELEGRAM_TOKEN" in os.environ:
 77 |             print("Please, set bot token through --token or TELEGRAM_TOKEN env variable")
 78 |             return
 79 |         token = os.environ["TELEGRAM_TOKEN"]
 80 | 
 81 |     #################################################################
 82 |     
 83 |     # Your task is to complete dialogue_manager.py and use your 
 84 |     # advanced DialogueManager instead of SimpleDialogueManager. 
 85 |     
 86 |     # This is the point where you plug it into the Telegram bot. 
 87 |     # Do not forget to import all needed dependencies when you do so.
 88 |     
 89 |    # simple_manager = SimpleDialogueManager()
 90 |     advanced_manager = DialogueManager(RESOURCE_PATH)
 91 |     advanced_manager.create_chitchat_bot()
 92 |     bot = BotHandler(token, advanced_manager)
 93 |     
 94 |     ###############################################################
 95 | 
 96 |     print("Ready to talk!")
 97 |     offset = 0
 98 |     while True:
 99 |         updates = bot.get_updates(offset=offset)
100 |         for update in updates:
101 |             print("An update received.")
102 |             if "message" in update:
103 |                 chat_id = update["message"]["chat"]["id"]
104 |                 if "text" in update["message"]:
105 |                     text = update["message"]["text"]
106 |                     if is_unicode(text):
107 |                         print("Update content: {}".format(update))
108 |                         bot.send_message(chat_id, bot.get_answer(update["message"]["text"]))
109 |                     else:
110 |                         bot.send_message(chat_id, "Hmm, you are sending some weird characters to me...")
111 |             offset = max(offset, update['update_id'] + 1)
112 |         time.sleep(1)
113 | 
114 | if __name__ == "__main__":
115 |     main()
116 | 


--------------------------------------------------------------------------------
/natural-language-processing/project/utils.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import pickle
 3 | import re
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | nltk.download('stopwords')
 8 | from nltk.corpus import stopwords
 9 | 
# Relative locations of every artifact the bot loads, keyed by the names
# used in the paths[...] lookups of the dialogue manager.
RESOURCE_PATH = dict(
    INTENT_RECOGNIZER='intent_recognizer.pkl',
    TAG_CLASSIFIER='tag_classifier.pkl',
    TFIDF_VECTORIZER='tfidf_vectorizer.pkl',
    THREAD_EMBEDDINGS_FOLDER='thread_embeddings_by_tags',
    WORD_EMBEDDINGS='word_embeddings.tsv',
)
18 | 
19 | 
# Compiled once at import time instead of on every call. Raw strings keep the
# backslashes literal; the former non-raw literals relied on invalid escape
# sequences (\[, \], \|), which newer Python versions warn about.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# English stopwords, loaded lazily on the first text_prepare() call.
_STOPWORDS_SET = None


def text_prepare(text):
    """Performs tokenization and simple preprocessing.

    Steps, in order: lowercase the text, replace the characters
    / ( ) { } [ ] | @ , ; with a space, delete every character other than
    digits, lowercase ASCII letters, space, '#', '+' and '_', then drop
    English stopwords and collapse whitespace.

    Args:
        text: input string.

    Returns:
        The cleaned string with tokens separated by single spaces.
    """
    global _STOPWORDS_SET
    if _STOPWORDS_SET is None:
        # Reading the corpus is comparatively slow, so do it only once.
        _STOPWORDS_SET = frozenset(stopwords.words('english'))

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(x for x in text.split() if x not in _STOPWORDS_SET)

    return text.strip()
33 | 
34 | 
def load_embeddings(embeddings_path):
    """Loads pre-trained word embeddings from tsv file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by tabs, with no header row.

    Args:
        embeddings_path - path to the embeddings file.

    Returns:
        embeddings - dict mapping words to vectors;
        embeddings_dim - dimension of the vectors.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    table = pd.read_csv(
        embeddings_path,
        sep="\t",
        header=None,
        # Keep the word column as text: without this, tokens such as "123"
        # can be parsed as numbers and then never match a string lookup.
        dtype={0: str},
        # Words like "null", "nan" or "NA" are real vocabulary entries, not
        # missing values.
        na_filter=False,
        # A double quote is just another character of a token here.
        quoting=csv.QUOTE_NONE,
    )
    words = table.iloc[:, 0].tolist()
    vectors = table.iloc[:, 1:].values
    embeddings = dict(zip(words, vectors))
    return embeddings, vectors.shape[1]
54 | 
55 |        
def question_to_vec(question, embeddings, dim):
    """Embed a string as the mean of the embeddings of its known words.

    The question is split on whitespace; words missing from `embeddings`
    are ignored. A zero vector of length `dim` is returned when the question
    is empty or none of its words has an embedding.
    """
    if question == "":
        return np.zeros(dim)

    known_vectors = [
        embeddings[word]
        for word in question.split()
        if word in embeddings.keys()
    ]
    if not known_vectors:
        return np.zeros(dim)

    return np.array(known_vectors).mean(axis=0)
69 | 
70 | 
def unpickle_file(filename):
    """Open `filename` in binary mode and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files from a trusted source.
    with open(filename, 'rb') as stream:
        payload = pickle.load(stream)
    return payload
75 | 


--------------------------------------------------------------------------------
/natural-language-processing/week1/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | class Grader(object):
 7 |     def __init__(self):
 8 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 9 |         self.assignment_key = 'MSsYBMLgEeesWhJPHRLG5g'
10 |         self.parts = OrderedDict([('f5nXa', 'TextPrepare'), 
11 |                                   ('hTrz8', 'WordsTagsCount'), 
12 |                                   ('0kUjR', 'BagOfWords'), 
13 |                                   ('tLJV1', 'MultilabelClassification')])
14 |         self.answers = {key: None for key in self.parts}
15 | 
16 |     @staticmethod
17 |     def ravel_output(output):
18 |         '''
19 |            If student accidentally submitted np.array with one
20 |            element instead of number, this function will submit
21 |            this number instead
22 |         '''
23 |         if isinstance(output, np.ndarray) and output.size == 1:
24 |             output = output.item(0)
25 |         return output
26 | 
27 |     def submit(self, email, token):
28 |         submission = {
29 |                     "assignmentKey": self.assignment_key, 
30 |                     "submitterEmail": email, 
31 |                     "secret": token, 
32 |                     "parts": {}
33 |                   }
34 |         for part, output in self.answers.items():
35 |             if output is not None:
36 |                 submission["parts"][part] = {"output": output}
37 |             else:
38 |                 submission["parts"][part] = dict()
39 |         request = requests.post(self.submission_page, data=json.dumps(submission))
40 |         response = request.json()
41 |         if request.status_code == 201:
42 |             print('Submitted to Coursera platform. See results on assignment page!')
43 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
44 |             print(response[u'details'][u'learnerMessage'])
45 |         else:
46 |             print("Unknown response from Coursera: {}".format(request.status_code))
47 |             print(response)
48 | 
49 |     def status(self):
50 |         print("You want to submit these parts:")
51 |         for part_id, part_name in self.parts.items():
52 |             answer = self.answers[part_id]
53 |             if answer is None:
54 |                 answer = '-'*10
55 |             print("Task {}:\n {}".format(part_name, answer[:100] + '...'))
56 |                
57 |     def submit_part(self, part, output):
58 |         self.answers[part] = output
59 |         print("Current answer for task {} is:\n {}".format(self.parts[part], output[:100] + '...'))
60 | 
61 |     def submit_tag(self, tag, output):
62 |         part_id = [k for k, v in self.parts.items() if v == tag]
63 |         if len(part_id) != 1:
64 |             raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
65 |         part_id = part_id[0]
66 |         self.submit_part(part_id, str(self.ravel_output(output)))
67 | 


--------------------------------------------------------------------------------
/natural-language-processing/week1/metrics.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | from sklearn.metrics import roc_curve, auc
 4 | from scipy import interp
 5 | from itertools import cycle
 6 | 
 7 | def roc_auc(y_test, y_score, n_classes):  
 8 |     """Plots ROC curve for micro and macro averaging."""
 9 |     
10 |     # Compute ROC curve and ROC area for each class
11 |     fpr = {}
12 |     tpr = {}
13 |     roc_auc = {}
14 |     for i in range(n_classes):
15 |         fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
16 |         roc_auc[i] = auc(fpr[i], tpr[i])
17 |     
18 |     # Compute micro-average ROC curve and ROC area
19 |     fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
20 |     roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
21 |     
22 |     # Compute macro-average ROC curve and ROC area     
23 |     all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
24 |     mean_tpr = np.zeros_like(all_fpr)
25 |     for i in range(n_classes):
26 |         mean_tpr += interp(all_fpr, fpr[i], tpr[i])
27 |     mean_tpr /= n_classes 
28 |     fpr["macro"] = all_fpr
29 |     tpr["macro"] = mean_tpr
30 |     roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
31 |     
32 |     # Plot all ROC curves
33 |     plt.figure()
34 |     plt.plot(fpr["micro"], tpr["micro"], 
35 |              label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
36 |              color='deeppink', linestyle=':', linewidth=4)
37 |     
38 |     plt.plot(fpr["macro"], tpr["macro"], 
39 |              label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
40 |              color='navy', linestyle=':', linewidth=4)
41 |     
42 |     colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
43 |     for i, color in zip(range(0,3), colors):
44 |         plt.plot(fpr[i], tpr[i], color=color, lw=2, 
45 |                  label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
46 |     
47 |     plt.plot([0, 1], [0, 1], 'k--', lw=2)
48 |     plt.xlim([0.0, 1.0])
49 |     plt.ylim([0.0, 1.05])
50 |     plt.xlabel('False Positive Rate')
51 |     plt.ylabel('True Positive Rate')
52 |     plt.title('Some extension of ROC to multi-class')
53 |     plt.legend(loc="lower right")
54 |     plt.show()


--------------------------------------------------------------------------------
/natural-language-processing/week2/evaluation.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | 
  3 | def _update_chunk(candidate, prev, current_tag, current_chunk, current_pos, prediction=False):
  4 |     if candidate == 'B-' + current_tag:
  5 |         if len(current_chunk) > 0 and len(current_chunk[-1]) == 1:
  6 |                 current_chunk[-1].append(current_pos - 1)
  7 |         current_chunk.append([current_pos])
  8 |     elif candidate == 'I-' + current_tag:
  9 |         if prediction and (current_pos == 0 or current_pos > 0 and prev.split('-', 1)[-1] != current_tag):
 10 |             current_chunk.append([current_pos])
 11 |         if not prediction and (current_pos == 0 or current_pos > 0 and prev == 'O'):
 12 |             current_chunk.append([current_pos])
 13 |     elif current_pos > 0 and prev.split('-', 1)[-1] == current_tag:
 14 |         if len(current_chunk) > 0:
 15 |             current_chunk[-1].append(current_pos - 1)
 16 | 
 17 | def _update_last_chunk(current_chunk, current_pos):
 18 |     if len(current_chunk) > 0 and len(current_chunk[-1]) == 1:
 19 |         current_chunk[-1].append(current_pos - 1)
 20 | 
 21 | def _tag_precision_recall_f1(tp, fp, fn):
 22 |     precision, recall, f1 = 0, 0, 0
 23 |     if tp + fp > 0:
 24 |         precision = tp / (tp + fp) * 100
 25 |     if tp + fn > 0:
 26 |         recall = tp / (tp + fn) * 100
 27 |     if precision + recall > 0:
 28 |         f1 = 2 * precision * recall / (precision + recall)
 29 |     return precision, recall, f1
 30 | 
 31 | def _aggregate_metrics(results, total_correct):
 32 |     total_true_entities = 0
 33 |     total_predicted_entities = 0
 34 |     total_precision = 0
 35 |     total_recall = 0
 36 |     total_f1 = 0
 37 |     for tag, tag_metrics in results.items():
 38 |         n_pred = tag_metrics['n_predicted_entities']
 39 |         n_true = tag_metrics['n_true_entities']
 40 |         total_true_entities += n_true
 41 |         total_predicted_entities += n_pred
 42 |         total_precision += tag_metrics['precision'] * n_pred
 43 |         total_recall += tag_metrics['recall'] * n_true
 44 |     
 45 |     accuracy = total_correct / total_true_entities * 100
 46 |     if total_predicted_entities > 0:
 47 |         total_precision = total_precision / total_predicted_entities
 48 |     total_recall = total_recall / total_true_entities
 49 |     if total_precision + total_recall > 0:
 50 |         total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall)
 51 |     return total_true_entities, total_predicted_entities, \
 52 |            total_precision, total_recall, total_f1, accuracy
 53 | 
 54 | def _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct):
 55 |     print('processed {len} tokens ' \
 56 |           'with {tot_true} phrases; ' \
 57 |           'found: {tot_pred} phrases; ' \
 58 |           'correct: {tot_cor}.\n'.format(len=n_tokens,
 59 |                                          tot_true=total_true_entities,
 60 |                                          tot_pred=total_predicted_entities,
 61 |                                          tot_cor=total_correct))
 62 | 
 63 | def _print_metrics(accuracy, total_precision, total_recall, total_f1):
 64 |     print('precision:  {tot_prec:.2f}%; ' \
 65 |           'recall:  {tot_recall:.2f}%; ' \
 66 |           'F1:  {tot_f1:.2f}\n'.format(acc=accuracy,
 67 |                                            tot_prec=total_precision,
 68 |                                            tot_recall=total_recall,
 69 |                                            tot_f1=total_f1))
 70 | 
 71 | def _print_tag_metrics(tag, tag_results):
 72 |     print(('\t%12s' % tag) + ': precision:  {tot_prec:6.2f}%; ' \
 73 |                                'recall:  {tot_recall:6.2f}%; ' \
 74 |                                'F1:  {tot_f1:6.2f}; ' \
 75 |                                'predicted:  {tot_predicted:4d}\n'.format(tot_prec=tag_results['precision'],
 76 |                                                                          tot_recall=tag_results['recall'],
 77 |                                                                          tot_f1=tag_results['f1'],
 78 |                                                                          tot_predicted=tag_results['n_predicted_entities']))
 79 | 
 80 | def precision_recall_f1(y_true, y_pred, print_results=True, short_report=False):
 81 |     # Find all tags
 82 |     tags = sorted(set(tag[2:] for tag in y_true + y_pred if tag != 'O'))
 83 | 
 84 |     results = OrderedDict((tag, OrderedDict()) for tag in tags)
 85 |     n_tokens = len(y_true)
 86 |     total_correct = 0
 87 | 
 88 |     # For eval_conll_try we find all chunks in the ground truth and prediction
 89 |     # For each chunk we store starting and ending indices
 90 |     for tag in tags:
 91 |         true_chunk = list()
 92 |         predicted_chunk = list()
 93 |         for position in range(n_tokens):
 94 |             _update_chunk(y_true[position], y_true[position - 1], tag, true_chunk, position)
 95 |             _update_chunk(y_pred[position], y_pred[position - 1], tag, predicted_chunk, position, True)
 96 | 
 97 |         _update_last_chunk(true_chunk, position)
 98 |         _update_last_chunk(predicted_chunk, position)
 99 | 
100 |         # Then we find all correctly classified intervals
101 |         # True positive results
102 |         tp = sum(chunk in predicted_chunk for chunk in true_chunk)
103 |         total_correct += tp
104 | 
105 |         # And then just calculate errors of the first and second kind
106 |         # False negative
107 |         fn = len(true_chunk) - tp
108 |         # False positive
109 |         fp = len(predicted_chunk) - tp
110 |         precision, recall, f1 = _tag_precision_recall_f1(tp, fp, fn)
111 | 
112 |         results[tag]['precision'] = precision
113 |         results[tag]['recall'] = recall
114 |         results[tag]['f1'] = f1
115 |         results[tag]['n_predicted_entities'] = len(predicted_chunk)
116 |         results[tag]['n_true_entities'] = len(true_chunk)
117 | 
118 |     total_true_entities, total_predicted_entities, \
119 |            total_precision, total_recall, total_f1, accuracy = _aggregate_metrics(results, total_correct)
120 | 
121 |     if print_results:
122 |         _print_info(n_tokens, total_true_entities, total_predicted_entities, total_correct)
123 |         _print_metrics(accuracy, total_precision, total_recall, total_f1)
124 | 
125 |         if not short_report:
126 |             for tag, tag_results in results.items():
127 |                 _print_tag_metrics(tag, tag_results)
128 |     return results
129 | 


--------------------------------------------------------------------------------
/natural-language-processing/week3/grader.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import numpy as np
 4 | from collections import OrderedDict
 5 | 
 6 | class Grader(object):
 7 |     def __init__(self):
 8 |         self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1'
 9 |         self.assignment_key = '7DdYfMQFEeevjw7-W7Fr0A'
10 |         self.parts = OrderedDict([('98mDT', 'Question2Vec'), 
11 |                                   ('nc7RP', 'HitsCount'), 
12 |                                   ('bNp90', 'DCGScore'), 
13 |                                   ('3gRlQ', 'W2VTokenizedRanks'), 
14 |                                   ('mX6wS', 'StarSpaceRanks')])
15 |         self.answers = {key: None for key in self.parts}
16 | 
17 |     @staticmethod
18 |     def ravel_output(output):
19 |         '''
20 |            If student accidentally submitted np.array with one
21 |            element instead of number, this function will submit
22 |            this number instead
23 |         '''
24 |         if isinstance(output, np.ndarray) and output.size == 1:
25 |             output = output.item(0)
26 |         return output
27 | 
28 |     def submit(self, email, token):
29 |         submission = {
30 |                     "assignmentKey": self.assignment_key, 
31 |                     "submitterEmail": email, 
32 |                     "secret": token, 
33 |                     "parts": {}
34 |                   }
35 |         for part, output in self.answers.items():
36 |             if output is not None:
37 |                 submission["parts"][part] = {"output": output}
38 |             else:
39 |                 submission["parts"][part] = dict()
40 |         request = requests.post(self.submission_page, data=json.dumps(submission))
41 |         response = request.json()
42 |         if request.status_code == 201:
43 |             print('Submitted to Coursera platform. See results on assignment page!')
44 |         elif u'details' in response and u'learnerMessage' in response[u'details']:
45 |             print(response[u'details'][u'learnerMessage'])
46 |         else:
47 |             print("Unknown response from Coursera: {}".format(request.status_code))
48 |             print(response)
49 | 
50 |     def status(self):
51 |         print("You want to submit these parts:")
52 |         for part_id, part_name in self.parts.items():
53 |             answer = self.answers[part_id]
54 |             if answer is None:
55 |                 answer = '-'*10
56 |             print("Task {}: {}".format(part_name, answer[:100] + '...'))
57 |                
58 |     def submit_part(self, part, output):
59 |         self.answers[part] = output
60 |         print("Current answer for task {} is: {}".format(self.parts[part], output[:100] + '...'))
61 | 
62 |     def submit_tag(self, tag, output):
63 |         part_id = [k for k, v in self.parts.items() if v == tag]
64 |         if len(part_id) != 1:
65 |             raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id)))
66 |         part_id = part_id[0]
67 |         self.submit_part(part_id, str(self.ravel_output(output)))
68 | 


--------------------------------------------------------------------------------
/natural-language-processing/week3/util.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from nltk.corpus import stopwords
 3 | 
 4 | REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
 5 | GOOD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
 6 | STOPWORDS = set(stopwords.words('english'))
 7 | def text_prepare(text):
 8 |     text = text.lower()
 9 |     text = REPLACE_BY_SPACE_RE.sub(' ', text)
10 |     text = GOOD_SYMBOLS_RE.sub('', text)
11 |     text = ' '.join([x for x in text.split() if x and x not in STOPWORDS])
12 |     return text.strip()
13 | 
def array_to_string(arr):
    """Render the items of `arr` with str(), one per line."""
    return '\n'.join(map(str, arr))
16 | 
def matrix_to_string(matrix):
    """Render `matrix` as text: tab-separated items, one row per line."""
    rendered_rows = ['\t'.join(map(str, row)) for row in matrix]
    return '\n'.join(rendered_rows)


--------------------------------------------------------------------------------
/natural-language-processing/week4/encoder-decoder-pic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jlricon/advanced-deep-learning/113182100688b6db51435ea8adb1def1ce3bc0b1/natural-language-processing/week4/encoder-decoder-pic.png


--------------------------------------------------------------------------------
/reinforcement-learning/.gitignore:
--------------------------------------------------------------------------------
1 | submit.py
2 | grading.py
3 | 


--------------------------------------------------------------------------------
/reinforcement-learning/atari_util.py:
--------------------------------------------------------------------------------
 1 | """Auxilary files for those who wanted to solve breakout with CEM or policy gradient"""
 2 | import numpy as np
 3 | import gym
 4 | from scipy.misc import imresize
 5 | from gym.core import Wrapper
 6 | from gym.spaces.box import Box
 7 | 
 8 | class PreprocessAtari(Wrapper):
 9 |     def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 
10 |                  n_frames=4, dim_order='theano', reward_scale=1,):
11 |         """A gym wrapper that reshapes, crops and scales image into the desired shapes"""
12 |         super(PreprocessAtari, self).__init__(env)
13 |         assert dim_order in ('theano', 'tensorflow')
14 |         self.img_size = (height, width)
15 |         self.crop=crop
16 |         self.color=color
17 |         self.dim_order = dim_order
18 |         self.reward_scale = reward_scale
19 |         
20 |         n_channels = (3 * n_frames) if color else n_frames
21 |         obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels]
22 |         self.observation_space = Box(0.0, 1.0, obs_shape)
23 |         self.framebuffer = np.zeros(obs_shape, 'float32')
24 |         
25 |     def reset(self):
26 |         """resets breakout, returns initial frames"""
27 |         self.framebuffer = np.zeros_like(self.framebuffer)
28 |         self.update_buffer(self.env.reset())
29 |         return self.framebuffer
30 |     
31 |     def step(self,action):
32 |         """plays breakout for 1 step, returns frame buffer"""
33 |         new_img, reward, done, info = self.env.step(action)
34 |         self.update_buffer(new_img)
35 |         return self.framebuffer, reward * self.reward_scale, done, info
36 |     
37 |     ### image processing ###
38 |     
39 |     def update_buffer(self,img):
40 |         img = self.preproc_image(img)
41 |         offset = 3 if self.color else 1
42 |         if self.dim_order == 'theano':
43 |             axis = 0
44 |             cropped_framebuffer = self.framebuffer[:-offset]
45 |         else:
46 |             axis = -1
47 |             cropped_framebuffer = self.framebuffer[:,:,:-offset]
48 |         self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis)
49 | 
50 |     def preproc_image(self, img):
51 |         """what happens to the observation"""
52 |         img = self.crop(img)
53 |         img = imresize(img, self.img_size)
54 |         if not self.color:
55 |             img = img.mean(-1, keepdims=True)
56 |         if self.dim_order == 'theano':
57 |             img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w]
58 |         img = img.astype('float32') / 255.
59 |         return img
60 | 


--------------------------------------------------------------------------------
/reinforcement-learning/framebuffer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from gym.spaces.box import Box
 3 | from gym.core import Wrapper
 4 | class FrameBuffer(Wrapper):
 5 |     def __init__(self, env, n_frames=4, dim_order='tensorflow'):
 6 |         """A gym wrapper that reshapes, crops and scales image into the desired shapes"""
 7 |         super(FrameBuffer, self).__init__(env)
 8 |         self.dim_order = dim_order
 9 |         if dim_order == 'tensorflow':
10 |             height, width, n_channels = env.observation_space.shape
11 |             obs_shape = [height, width, n_channels * n_frames]
12 |         elif dim_order == 'pytorch':
13 |             n_channels, height, width = env.observation_space.shape
14 |             obs_shape = [n_channels * n_frames, height, width]
15 |         else:
16 |             raise ValueError('dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order))
17 |         self.observation_space = Box(0.0, 1.0, obs_shape)
18 |         self.framebuffer = np.zeros(obs_shape, 'float32')
19 |         
20 |     def reset(self):
21 |         """resets breakout, returns initial frames"""
22 |         self.framebuffer = np.zeros_like(self.framebuffer)
23 |         self.update_buffer(self.env.reset())
24 |         return self.framebuffer
25 |     
26 |     def step(self, action):
27 |         """plays breakout for 1 step, returns frame buffer"""
28 |         new_img, reward, done, info = self.env.step(action)
29 |         self.update_buffer(new_img)
30 |         return self.framebuffer, reward, done, info
31 |     
32 |     def update_buffer(self, img):
33 |         if self.dim_order == 'tensorflow':
34 |             offset = self.env.observation_space.shape[-1]
35 |             axis = -1
36 |             cropped_framebuffer = self.framebuffer[:,:,:-offset]
37 |         elif self.dim_order == 'pytorch':
38 |             offset = self.env.observation_space.shape[0]
39 |             axis = 0
40 |             cropped_framebuffer = self.framebuffer[:-offset]
41 |         self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis)
42 | 


--------------------------------------------------------------------------------
/reinforcement-learning/mdp.py:
--------------------------------------------------------------------------------
  1 | # most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/
  2 | # all credit goes to https://github.com/abhishekunique (if i got the author right)
  3 | import sys
  4 | import random
  5 | import numpy as np
  6 | def weighted_choice(v, p):
  7 |    total = sum(p)
  8 |    r = random.uniform(0, total)
  9 |    upto = 0
 10 |    for c, w in zip(v,p):
 11 |       if upto + w >= r:
 12 |          return c
 13 |       upto += w
 14 |    assert False, "Shouldn't get here"
 15 | 
 16 | class MDP:
 17 |     def __init__(self, transition_probs, rewards, initial_state=None):
 18 |         """
 19 |         Defines an MDP. Compatible with gym Env.
 20 |         :param transition_probs: transition_probs[s][a][s_next] = P(s_next | s, a)
 21 |             A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> prob]
 22 |             For each state and action, probabilities of next states should sum to 1
 23 |             If a state has no actions available, it is considered terminal
 24 |         :param rewards: rewards[s][a][s_next] = r(s,a,s')
 25 |             A dict[state -> dict] of dicts[action -> dict] of dicts[next_state -> reward]
 26 |             The reward for anything not mentioned here is zero.
 27 |         :param get_initial_state: a state where agent starts or a callable() -> state
 28 |             By default, picks initial state at random.
 29 | 
 30 |         States and actions can be anything you can use as dict keys, but we recommend that you use strings or integers
 31 | 
 32 |         Here's an example from MDP depicted on http://bit.ly/2jrNHNr
 33 |         transition_probs = {
 34 |               's0':{
 35 |                 'a0': {'s0': 0.5, 's2': 0.5},
 36 |                 'a1': {'s2': 1}
 37 |               },
 38 |               's1':{
 39 |                 'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
 40 |                 'a1': {'s1': 0.95, 's2': 0.05}
 41 |               },
 42 |               's2':{
 43 |                 'a0': {'s0': 0.4, 's1': 0.6},
 44 |                 'a1': {'s0': 0.3, 's1': 0.3, 's2':0.4}
 45 |               }
 46 |             }
 47 |         rewards = {
 48 |             's1': {'a0': {'s0': +5}},
 49 |             's2': {'a1': {'s0': -1}}
 50 |         }
 51 |         """
 52 |         self._check_param_consistency(transition_probs, rewards)
 53 |         self._transition_probs = transition_probs
 54 |         self._rewards = rewards
 55 |         self._initial_state = initial_state
 56 |         self.n_states = len(transition_probs)
 57 |         self.reset()
 58 | 
 59 |     def get_all_states(self):
 60 |         """ return a tuple of all possiblestates """
 61 |         return tuple(self._transition_probs.keys())
 62 | 
 63 |     def get_possible_actions(self, state):
 64 |         """ return a tuple of possible actions in a given state """
 65 |         return tuple(self._transition_probs.get(state, {}).keys())
 66 | 
 67 |     def is_terminal(self, state):
 68 |         """ return True if state is terminal or False if it isn't """
 69 |         return len(self.get_possible_actions(state)) == 0
 70 | 
 71 |     def get_next_states(self, state, action):
 72 |         """ return a dictionary of {next_state1 : P(next_state1 | state, action), next_state2: ...} """
 73 |         assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state)
 74 |         return self._transition_probs[state][action]
 75 | 
 76 |     def get_transition_prob(self, state, action, next_state):
 77 |         """ return P(next_state | state, action) """
 78 |         return self.get_next_states(state, action).get(next_state, 0.0)
 79 | 
 80 |     def get_reward(self, state, action, next_state):
 81 |         """ return the reward you get for taking action in state and landing on next_state"""
 82 |         assert action in self.get_possible_actions(state), "cannot do action %s from state %s" % (action, state)
 83 |         return self._rewards.get(state, {}).get(action, {}).get(next_state, 0.0)
 84 | 
 85 |     def reset(self):
 86 |         """ reset the game, return the initial state"""
 87 |         if self._initial_state is None:
 88 |             self._current_state = random.choice(tuple(self._transition_probs.keys()))
 89 |         elif self._initial_state in self._transition_probs:
 90 |             self._current_state = self._initial_state
 91 |         elif callable(self._initial_state):
 92 |             self._current_state = self._initial_state()
 93 |         else:
 94 |             raise ValueError("initial state %s should be either a state or a function() -> state" % self._initial_state)
 95 |         return self._current_state
 96 | 
 97 |     def step(self, action):
 98 |         """ take action, return next_state, reward, is_done, empty_info """
 99 |         possible_states, probs = zip(*self.get_next_states(self._current_state, action).items())
100 |         next_state = weighted_choice(possible_states, p=probs)
101 |         reward = self.get_reward(self._current_state, action, next_state)
102 |         is_done = self.is_terminal(next_state)
103 |         self._current_state = next_state
104 |         return next_state, reward, is_done, {}
105 | 
106 |     def render(self):
107 |         print("Currently at %s" % self._current_state)
108 | 
109 |     def _check_param_consistency(self, transition_probs, rewards):
110 |         for state in transition_probs:
111 |             assert isinstance(transition_probs[state], dict), "transition_probs for %s should be a dictionary " \
112 |                                                               "but is instead %s" % (
113 |                                                               state, type(transition_probs[state]))
114 |             for action in transition_probs[state]:
115 |                 assert isinstance(transition_probs[state][action], dict), "transition_probs for %s, %s should be a " \
116 |                                                                           "a dictionary but is instead %s" % (
117 |                                                                               state, action,
118 |                                                                               type(transition_probs[state, action]))
119 |                 next_state_probs = transition_probs[state][action]
120 |                 assert len(next_state_probs) != 0, "from state %s action %s leads to no next states" % (state, action)
121 |                 sum_probs = sum(next_state_probs.values())
122 |                 assert abs(sum_probs - 1) <= 1e-10, "next state probabilities for state %s action %s " \
123 |                                                     "add up to %f (should be 1)" % (state, action, sum_probs)
124 |         for state in rewards:
125 |             assert isinstance(rewards[state], dict), "rewards for %s should be a dictionary " \
126 |                                                      "but is instead %s" % (state, type(transition_probs[state]))
127 |             for action in rewards[state]:
128 |                 assert isinstance(rewards[state][action], dict), "rewards for %s, %s should be a " \
129 |                                                                  "a dictionary but is instead %s" % (
130 |                                                                  state, action, type(transition_probs[state, action]))
131 |         msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \
132 |               " you will be sent at the first sign of defiance. "
133 |         assert None not in transition_probs, "please do not use None as a state identifier. " + msg
134 |         assert None not in rewards, "please do not use None as an action identifier. " + msg
135 | 
class FrozenLakeEnv(MDP):
    """
    Winter is here. You and your friends were tossing around a frisbee at the park
    when you made a wild throw that left the frisbee out in the middle of the lake.
    The water is mostly frozen, but there are a few holes where the ice has melted.
    If you step into one of those holes, you'll fall into the freezing water.
    At this time, there's an international frisbee shortage, so it's absolutely imperative that
    you navigate across the lake and retrieve the disc.
    However, the ice is slippery, so you won't always move in the direction you intend.
    The surface is described using a grid like the following

        SFFF
        FHFH
        FFFH
        HFFG

    S : starting point, safe
    F : frozen surface, safe
    H : hole, fall to your doom
    G : goal, where the frisbee is located

    The episode ends when you reach the goal or fall in a hole.
    You receive a reward of 1 if you reach the goal, and zero otherwise.

    """

    MAPS = {
        "4x4": [
            "SFFF",
            "FHFH",
            "FFFH",
            "HFFG"
        ],
        "8x8": [
            "SFFFFFFF",
            "FFFFFFFF",
            "FFFHFFFF",
            "FFFFFHFF",
            "FFFHFFFF",
            "FHHFFFHF",
            "FHFFHFHF",
            "FFFHFFFG"
        ],
    }

    def __init__(self, desc=None, map_name="4x4", slip_chance=0.2):
        """
        Build the grid-world MDP.

        :param desc: list of equal-length strings made of the characters S, F, H and G;
            when None, the map is looked up in MAPS by map_name
        :param map_name: key into MAPS, used only when desc is None
        :param slip_chance: total probability of sliding sideways; it is split evenly
            between the two actions adjacent to the intended one in the action list
        :raises ValueError: if both desc and map_name are None
        """
        if desc is None and map_name is None:
            raise ValueError('Must provide either desc or map_name')
        elif desc is None:
            desc = self.MAPS[map_name]
        assert ''.join(desc).count('S') == 1, "this implementation supports having exactly one initial state"
        assert all(c in "SFHG" for c in ''.join(desc)), "all cells must be either of S, F, H or G"

        self.desc = desc = np.asarray(list(map(list, desc)), dtype='str')
        self.lastaction = None

        nrow, ncol = desc.shape
        states = [(i, j) for i in range(nrow) for j in range(ncol)]
        actions = ["left", "down", "right", "up"]
        n_actions = len(actions)

        # desc holds str cells, so compare against a str; comparing with b'S' never
        # matches and made the start fall back to states[0] whatever the map said
        initial_state = states[np.array(desc == 'S').ravel().argmax()]

        def move(row, col, movement):
            """Return the cell reached from (row, col); moves off the grid stay in place."""
            if movement == 'left':
                col = max(col - 1, 0)
            elif movement == 'down':
                row = min(row + 1, nrow - 1)
            elif movement == 'right':
                col = min(col + 1, ncol - 1)
            elif movement == 'up':
                row = max(row - 1, 0)
            else:
                # raising a bare string is itself a TypeError; raise a real exception
                raise ValueError("invalid action")
            return (row, col)

        transition_probs = {s: {} for s in states}
        rewards = {s: {} for s in states}
        for (row, col) in states:
            # goal and hole cells get no actions, which makes them terminal
            if desc[row, col] in "GH":
                continue
            for action_i in range(n_actions):
                action = actions[action_i]
                transition_probs[(row, col)][action] = {}
                rewards[(row, col)][action] = {}
                # the intended move plus its two neighbours in the action list
                for movement_i in [(action_i - 1) % n_actions, action_i, (action_i + 1) % n_actions]:
                    movement = actions[movement_i]
                    newrow, newcol = move(row, col, movement)
                    prob = (1. - slip_chance) if movement == action else (slip_chance / 2.)
                    if prob == 0:
                        continue
                    # several movements can end in the same cell (e.g. at a wall): accumulate
                    if (newrow, newcol) not in transition_probs[row, col][action]:
                        transition_probs[row, col][action][newrow, newcol] = prob
                    else:
                        transition_probs[row, col][action][newrow, newcol] += prob
                    if desc[newrow, newcol] == 'G':
                        rewards[row, col][action][newrow, newcol] = 1.0

        MDP.__init__(self, transition_probs, rewards, initial_state)

    def render(self):
        """Print the map with the current position marked by '*'."""
        desc_copy = np.copy(self.desc)
        desc_copy[self._current_state] = '*'
        print('\n'.join(map(''.join, desc_copy)), end='\n\n')
238 | 
239 | 
240 | 


--------------------------------------------------------------------------------
/reinforcement-learning/qlearning.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | import random, math
  3 | import numpy as np
  4 | 
  5 | class QLearningAgent:
  6 |     def __init__(self, alpha, epsilon, discount, get_legal_actions):
  7 |         """
  8 |         Q-Learning Agent
  9 |         based on http://inst.eecs.berkeley.edu/~cs188/sp09/pacman.html
 10 |         Instance variables you have access to
 11 |           - self.epsilon (exploration prob)
 12 |           - self.alpha (learning rate)
 13 |           - self.discount (discount rate aka gamma)
 14 | 
 15 |         Functions you should use
 16 |           - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
 17 |             which returns legal actions for a state
 18 |           - self.get_qvalue(state,action)
 19 |             which returns Q(state,action)
 20 |           - self.set_qvalue(state,action,value)
 21 |             which sets Q(state,action) := value
 22 | 
 23 |         !!!Important!!!
 24 |         Note: please avoid using self._qValues directly. 
 25 |             There's a special self.get_qvalue/set_qvalue for that.
 26 |         """
 27 | 
 28 |         self.get_legal_actions = get_legal_actions
 29 |         self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
 30 |         self.alpha = alpha
 31 |         self.epsilon = epsilon
 32 |         self.discount = discount
 33 | 
 34 |     def get_qvalue(self, state, action):
 35 |         """ Returns Q(state,action) """
 36 |         return self._qvalues[state][action]
 37 | 
 38 |     def set_qvalue(self,state,action,value):
 39 |         """ Sets the Qvalue for [state,action] to the given value """
 40 |         self._qvalues[state][action] = value
 41 | 
 42 |     #---------------------START OF YOUR CODE---------------------#
 43 | 
 44 |     def get_value(self, state):
 45 |         """
 46 |         Compute your agent's estimate of V(s) using current q-values
 47 |         V(s) = max_over_action Q(state,action) over possible actions.
 48 |         Note: please take into account that q-values can be negative.
 49 |         """
 50 |         possible_actions = self.get_legal_actions(state)
 51 |         
 52 | 
 53 |         #If there are no legal actions, return 0.0
 54 |         if len(possible_actions) == 0:
 55 |             return 0.0
 56 | 
 57 |         value=max([self.get_qvalue(state,action) for action in possible_actions])
 58 | 
 59 |         return value
 60 | 
 61 |     def update(self, state, action, reward, next_state):
 62 |         """
 63 |         You should do your Q-Value update here:
 64 |            Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
 65 |         """
 66 | 
 67 |         #agent parameters
 68 |         gamma = self.discount
 69 |         learning_rate = self.alpha
 70 | 
 71 |         new_q = ((1-learning_rate)*self.get_qvalue(state,action)+
 72 |                  learning_rate*(reward+gamma*self.get_value(next_state))
 73 |                 )
 74 |         
 75 |         self.set_qvalue(state, action, new_q)
 76 | 
 77 |     
 78 |     def get_best_action(self, state):
 79 |         """
 80 |         Compute the best action to take in a state (using current q-values). 
 81 |         """
 82 |         possible_actions = self.get_legal_actions(state)
 83 | 
 84 |         #If there are no legal actions, return None
 85 |         if len(possible_actions) == 0:
 86 |             return None
 87 | 
 88 |         best_action = np.argmax([self.get_qvalue(state, action) for action in possible_actions])
 89 | 
 90 |         return best_action
 91 | 
 92 |     def get_action(self, state):
 93 |         """
 94 |         Compute the action to take in the current state, including exploration.  
 95 |         With probability self.epsilon, we should take a random action.
 96 |             otherwise - the best policy action (self.getPolicy).
 97 |         
 98 |         Note: To pick randomly from a list, use random.choice(list). 
 99 |               To pick True or False with a given probablity, generate uniform number in [0, 1]
100 |               and compare it with your probability
101 |         """
102 | 
103 |         # Pick Action
104 |         possible_actions = self.get_legal_actions(state)
105 |         action = None
106 | 
107 |         #If there are no legal actions, return None
108 |         if len(possible_actions) == 0:
109 |             return None
110 | 
111 |         #agent parameters:
112 |         epsilon = self.epsilon
113 | 
114 |         explore_random = np.random.choice([True,False],p=[epsilon,1-epsilon])
115 |         if explore_random:
116 |             chosen_action = random.choice(possible_actions)
117 |         else:
118 |             chosen_action = self.get_best_action(state)
119 |         
120 |         return chosen_action


--------------------------------------------------------------------------------
/reinforcement-learning/replay_buffer.py:
--------------------------------------------------------------------------------
 1 | # This code is shamelessly stolen from https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
 2 | import numpy as np
 3 | import random
 4 | 
 5 | class ReplayBuffer(object):
 6 |     def __init__(self, size):
 7 |         """Create Replay buffer.
 8 |         Parameters
 9 |         ----------
10 |         size: int
11 |             Max number of transitions to store in the buffer. When the buffer
12 |             overflows the old memories are dropped.
13 |         """
14 |         self._storage = []
15 |         self._maxsize = size
16 |         self._next_idx = 0
17 | 
18 |     def __len__(self):
19 |         return len(self._storage)
20 | 
21 |     def add(self, obs_t, action, reward, obs_tp1, done):
22 |         data = (obs_t, action, reward, obs_tp1, done)
23 | 
24 |         if self._next_idx >= len(self._storage):
25 |             self._storage.append(data)
26 |         else:
27 |             self._storage[self._next_idx] = data
28 |         self._next_idx = (self._next_idx + 1) % self._maxsize
29 | 
30 |     def _encode_sample(self, idxes):
31 |         obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
32 |         for i in idxes:
33 |             data = self._storage[i]
34 |             obs_t, action, reward, obs_tp1, done = data
35 |             obses_t.append(np.array(obs_t, copy=False))
36 |             actions.append(np.array(action, copy=False))
37 |             rewards.append(reward)
38 |             obses_tp1.append(np.array(obs_tp1, copy=False))
39 |             dones.append(done)
40 |         return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones)
41 | 
42 |     def sample(self, batch_size):
43 |         """Sample a batch of experiences.
44 |         Parameters
45 |         ----------
46 |         batch_size: int
47 |             How many transitions to sample.
48 |         Returns
49 |         -------
50 |         obs_batch: np.array
51 |             batch of observations
52 |         act_batch: np.array
53 |             batch of actions executed given obs_batch
54 |         rew_batch: np.array
55 |             rewards received as results of executing act_batch
56 |         next_obs_batch: np.array
57 |             next set of observations seen after executing act_batch
58 |         done_mask: np.array
59 |             done_mask[i] = 1 if executing act_batch[i] resulted in
60 |             the end of an episode and 0 otherwise.
61 |         """
62 |         idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
63 |         return self._encode_sample(idxes)
64 | 


--------------------------------------------------------------------------------
/reinforcement-learning/week5/atari_util.py:
--------------------------------------------------------------------------------
 1 | """Auxiliary files for those who wanted to solve breakout with CEM or policy gradient"""
 2 | import numpy as np
 3 | import gym
 4 | from scipy.misc import imresize
 5 | from gym.core import Wrapper
 6 | from gym.spaces.box import Box
 7 | 
 8 | class PreprocessAtari(Wrapper):
 9 |     def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, 
10 |                  n_frames=4, dim_order='theano', reward_scale=1,):
11 |         """A gym wrapper that reshapes, crops and scales image into the desired shapes"""
12 |         super(PreprocessAtari, self).__init__(env)
13 |         assert dim_order in ('theano', 'tensorflow')
14 |         self.img_size = (height, width)
15 |         self.crop=crop
16 |         self.color=color
17 |         self.dim_order = dim_order
18 |         self.reward_scale = reward_scale
19 |         
20 |         n_channels = (3 * n_frames) if color else n_frames
21 |         obs_shape = [n_channels,height,width] if dim_order == 'theano' else [height,width,n_channels]
22 |         self.observation_space = Box(0.0, 1.0, obs_shape)
23 |         self.framebuffer = np.zeros(obs_shape, 'float32')
24 |         
25 |     def reset(self):
26 |         """resets breakout, returns initial frames"""
27 |         self.framebuffer = np.zeros_like(self.framebuffer)
28 |         self.update_buffer(self.env.reset())
29 |         return self.framebuffer
30 |     
31 |     def step(self,action):
32 |         """plays breakout for 1 step, returns frame buffer"""
33 |         new_img, reward, done, info = self.env.step(action)
34 |         self.update_buffer(new_img)
35 |         return self.framebuffer, reward * self.reward_scale, done, info
36 |     
37 |     ### image processing ###
38 |     
39 |     def update_buffer(self,img):
40 |         img = self.preproc_image(img)
41 |         offset = 3 if self.color else 1
42 |         if self.dim_order == 'theano':
43 |             axis = 0
44 |             cropped_framebuffer = self.framebuffer[:-offset]
45 |         else:
46 |             axis = -1
47 |             cropped_framebuffer = self.framebuffer[:,:,:-offset]
48 |         self.framebuffer = np.concatenate([img, cropped_framebuffer], axis = axis)
49 | 
50 |     def preproc_image(self, img):
51 |         """what happens to the observation"""
52 |         img = self.crop(img)
53 |         img = imresize(img, self.img_size)
54 |         if not self.color:
55 |             img = img.mean(-1, keepdims=True)
56 |         if self.dim_order == 'theano':
57 |             img = img.transpose([2,0,1]) # [h, w, c] to [c, h, w]
58 |         img = img.astype('float32') / 255.
59 |         return img
60 | 


--------------------------------------------------------------------------------
/reinforcement-learning/week5/practice_reinforce.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # # REINFORCE in TensorFlow
  5 | # 
  6 | # This notebook implements a basic reinforce algorithm a.k.a. policy gradient for CartPole env.
  7 | # 
  8 | # It has been deliberately written to be as simple and human-readable as possible.
  9 | # 
 10 | 
 11 | # In[3]:
 12 | 
 13 | from IPython.display import FileLink, FileLinks
 14 | FileLinks('.') #lists all downloadable files on server
 15 | 
 16 | 
 17 | # In[1]:
 18 | 
 19 | get_ipython().system('ls')
 20 | 
 21 | 
 22 | # The notebook assumes that you have [openai gym](https://github.com/openai/gym) installed.
 23 | # 
 24 | # In case you're running on a server, [use xvfb](https://github.com/openai/gym#rendering-on-a-server)
 25 | 
 26 | # In[1]:
 27 | 
 28 | import gym
 29 | import numpy as np, pandas as pd
 30 | import matplotlib.pyplot as plt
 31 | get_ipython().magic('matplotlib inline')
 32 | 
 33 | env = gym.make("CartPole-v0")
 34 | 
 35 | #gym compatibility: unwrap TimeLimit
 36 | if hasattr(env,'env'):
 37 |     env=env.env
 38 | 
 39 | env.reset()
 40 | n_actions = env.action_space.n
 41 | state_dim = env.observation_space.shape
 42 | 
 43 | plt.imshow(env.render("rgb_array"))
 44 | 
 45 | 
 46 | # # Building the policy network
 47 | 
 48 | # For REINFORCE algorithm, we'll need a model that predicts action probabilities given states.
 49 | # 
 50 | # For numerical stability, please __do not include the softmax layer into your network architecture__. 
 51 | # 
 52 | # We'll use softmax or log-softmax where appropriate.
 53 | 
 54 | # In[2]:
 55 | 
 56 | import tensorflow as tf
 57 | 
 58 | #create input variables. We only need <s,a,R> for REINFORCE
 59 | states = tf.placeholder('float32',(None,)+state_dim,name="states")
 60 | actions = tf.placeholder('int32',name="action_ids")
 61 | cumulative_rewards = tf.placeholder('float32', name="cumulative_returns")
 62 | 
 63 | 
 64 | # In[3]:
 65 | 
 66 | 
 67 | <define network graph using raw tf or any deep learning library>
 68 | 
 69 | logits = <linear outputs (symbolic) of your network>
 70 | 
 71 | policy = tf.nn.softmax(logits)
 72 | log_policy = tf.nn.log_softmax(logits)
 73 | 
 74 | 
 75 | # In[4]:
 76 | 
 77 | #utility function to pick action in one given state
 78 | get_action_proba = lambda s: policy.eval({states:[s]})[0] 
 79 | 
 80 | 
 81 | # #### Loss function and updates
 82 | # 
 83 | # We now need to define objective and update over policy gradient.
 84 | # 
 85 | # Our objective function is
 86 | # 
 87 | # $$ J \approx  { 1 \over N } \sum  _{s_i,a_i} \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$
 88 | # 
 89 | # 
 90 | # Following the REINFORCE algorithm, we can define our objective as follows: 
 91 | # 
 92 | # $$ \hat J \approx { 1 \over N } \sum  _{s_i,a_i} \log \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$
 93 | # 
 94 | # When you compute gradient of that function over network weights $ \theta $, it will become exactly the policy gradient.
 95 | # 
 96 | 
 97 | # In[ ]:
 98 | 
 99 | #get log-probabilities for the particular actions that were taken
100 | indices = tf.stack([tf.range(tf.shape(log_policy)[0]),actions],axis=-1)
101 | log_policy_for_actions = tf.gather_nd(log_policy,indices)
102 | 
103 | 
104 | # In[ ]:
105 | 
106 | # policy objective as in the last formula. please use mean, not sum.
107 | # note: you need to use log_policy_for_actions to get log probabilities for actions taken
108 | 
109 | J = <YOUR CODE
110 | 
111 | 
112 | # In[6]:
113 | 
114 | #regularize with entropy
115 | entropy = <compute entropy. Don't forget the sign!>
116 | 
117 | 
118 | # In[7]:
119 | 
120 | #all network weights
121 | all_weights = <a list of all trainable weights in your network>
122 | 
123 | #weight updates. maximizing J is same as minimizing -J. Adding negative entropy.
124 | loss = -J -0.1 * entropy
125 | 
126 | update = tf.train.AdamOptimizer().minimize(loss,var_list=all_weights)
127 | 
128 | 
129 | # ### Computing cumulative rewards
130 | 
131 | # In[8]:
132 | 
def get_cumulative_rewards(rewards, #rewards at each step
                           gamma = 0.99 #discount for reward
                           ):
    """
    take a list of immediate rewards r(s,a) for the whole session 
    compute cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16)
    R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
    
    The simple way to compute cumulative rewards is to iterate from last to first time tick
    and compute R_t = r_t + gamma*R_{t+1} recurrently
    
    You must return an array/list of cumulative rewards with as many elements as in the initial rewards.

    :param rewards: sequence of immediate rewards, one per time step of a single session
    :param gamma: discount factor applied to each later reward
    :returns: sequence of discounted cumulative rewards, same length as rewards
    """
    
    # exercise placeholder: to be replaced with the backward recurrence described above
    <your code here>
        
    return <array of cumulative rewards>
150 |     
151 |     
152 | 
153 | 
154 | # In[9]:
155 | 
156 | assert len(get_cumulative_rewards(range(100))) == 100
157 | assert np.allclose(get_cumulative_rewards([0,0,1,0,0,1,0],gamma=0.9),[1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])
158 | assert np.allclose(get_cumulative_rewards([0,0,1,-2,3,-4,0],gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])
159 | assert np.allclose(get_cumulative_rewards([0,0,1,2,3,4,0],gamma=0), [0, 0, 1, 2, 3, 4, 0])
160 | print("looks good!")
161 | 
162 | 
163 | # In[10]:
164 | 
def train_step(_states,_actions,_rewards):
    """Run one policy-gradient update on a single complete session.

    The immediate rewards are turned into discounted returns, then the module-level
    `update` op is run with the session data fed into the graph placeholders.
    """
    returns = get_cumulative_rewards(_rewards)
    feed = {
        states: _states,
        actions: _actions,
        cumulative_rewards: returns,
    }
    update.run(feed)
169 | 
170 | 
171 | # ### Playing the game
172 | 
173 | # In[11]:
174 | 
def generate_session(t_max=1000):
    """play env with REINFORCE agent and train at the session end

    :param t_max: maximum number of environment steps before the session is cut off
    :returns: sum of the rewards collected during the session
    """
    
    #arrays to record session
    # note: these local lists shadow the module-level `states`/`actions` placeholders
    # inside this function only; train_step still feeds the module-level ones
    states,actions,rewards = [],[],[]
    
    s = env.reset()
    
    for t in range(t_max):
        
        #action probabilities array aka pi(a|s)
        action_probas = get_action_proba(s)
        
        # exercise placeholder: sample an action index according to action_probas
        a = <pick random action using action_probas>
        
        new_s,r,done,info = env.step(a)
        
        #record session history to train later
        states.append(s)
        actions.append(a)
        rewards.append(r)
        
        s = new_s
        if done: break
            
    # a single training update is made per session, after it has finished
    train_step(states,actions,rewards)
            
    return sum(rewards)
203 |         
204 | 
205 | 
206 | # In[12]:
207 | 
208 | s = tf.InteractiveSession()
209 | s.run(tf.global_variables_initializer())
210 | 
211 | for i in range(100):
212 |     
213 |     rewards = [generate_session() for _ in range(100)] #generate new sessions
214 |     
215 |     print ("mean reward:%.3f"%(np.mean(rewards)))
216 | 
217 |     if np.mean(rewards) > 300:
218 |         print ("You Win!")
219 |         break
220 |         
221 | 
222 | 
223 | # ### Results & video
224 | 
225 | # In[13]:
226 | 
#record sessions
import gym.wrappers
# wrap a fresh CartPole env in a Monitor writing to ./videos (force=True is
# passed so an existing directory does not block recording -- see gym docs)
env = gym.wrappers.Monitor(gym.make("CartPole-v0"),directory="videos",force=True)
try:
    # note: generate_session also trains the agent at the end of each session
    sessions = [generate_session() for _ in range(100)]
finally:
    # close the monitor even if a session raises, so the recorder is released
    env.close()
232 | 
233 | 
234 | # In[14]:
235 | 
#show video
from IPython.display import HTML
import os

# os.listdir returns entries in arbitrary order; sort so that the chosen file
# is deterministic across runs and platforms
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #last name in sorted order (presumably the latest recording). Try other indices
247 | 
248 | 
249 | # In[ ]:
250 | 
# Submit the agent by passing the session generator to submit_cartpole.
# NOTE: <EMAIL> and <TOKEN> below are placeholders, not valid Python; replace
# them with your own values before running (presumably string literals --
# confirm against submit.submit_cartpole).
from submit import submit_cartpole
submit_cartpole(generate_session, <EMAIL>, <TOKEN>)
253 | 
254 | 
255 | # In[ ]:
256 | 
257 | # That's all, thank you for your attention!
258 | # Not having enough? There's an actor-critic waiting for you in the honor section.
259 | # But make sure you've seen the videos first.
260 | 
261 | 


--------------------------------------------------------------------------------
/reinforcement-learning/week6/seq2seq/basic_model_tf.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import keras.layers as L
  3 | 
  4 | 
  5 | class BasicTranslationModel:
  6 |     def __init__(self, name, inp_voc, out_voc,
  7 |                  emb_size, hid_size,):
  8 | 
  9 |         self.name = name
 10 |         self.inp_voc = inp_voc
 11 |         self.out_voc = out_voc
 12 | 
 13 |         with tf.variable_scope(name):
 14 |             self.emb_inp = L.Embedding(len(inp_voc), emb_size)
 15 |             self.emb_out = L.Embedding(len(out_voc), emb_size)
 16 |             self.enc0 = tf.nn.rnn_cell.GRUCell(hid_size)
 17 |             self.dec_start = L.Dense(hid_size)
 18 |             self.dec0 = tf.nn.rnn_cell.GRUCell(hid_size)
 19 |             self.logits = L.Dense(len(out_voc))
 20 | 
 21 | 
 22 |             # run on dummy output to .build all layers (and therefore create weights)
 23 |             inp = tf.placeholder('int32', [None, None])
 24 |             out = tf.placeholder('int32', [None, None])
 25 |             h0 = self.encode(inp)
 26 |             h1 = self.decode(h0,out[:,0])
 27 |             # h2 = self.decode(h1,out[:,1]) etc.
 28 | 
 29 |         self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name)
 30 | 
 31 | 
 32 |     def encode(self, inp, **flags):
 33 |         """
 34 |         Takes symbolic input sequence, computes initial state
 35 |         :param inp: matrix of input tokens [batch, time]
 36 |         :return: a list of initial decoder state tensors
 37 |         """
 38 |         inp_lengths = infer_length(inp, self.inp_voc.eos_ix)
 39 |         inp_emb = self.emb_inp(inp)
 40 | 
 41 |         _, enc_last = tf.nn.dynamic_rnn(
 42 |                           self.enc0, inp_emb,
 43 |                           sequence_length=inp_lengths,
 44 |                           dtype = inp_emb.dtype)
 45 | 
 46 |         dec_start = self.dec_start(enc_last)
 47 |         return [dec_start]
 48 | 
 49 |     def decode(self, prev_state, prev_tokens, **flags):
 50 |         """
 51 |         Takes previous decoder state and tokens, returns new state and logits
 52 |         :param prev_state: a list of previous decoder state tensors
 53 |         :param prev_tokens: previous output tokens, an int vector of [batch_size]
 54 |         :return: a list of next decoder state tensors, a tensor of logits [batch,n_tokens]
 55 |         """
 56 | 
 57 |         [prev_dec] = prev_state
 58 | 
 59 |         prev_emb = self.emb_out(prev_tokens[:,None])[:,0]
 60 | 
 61 |         new_dec_out,new_dec_state = self.dec0(prev_emb, prev_dec)
 62 | 
 63 |         output_logits = self.logits(new_dec_out)
 64 | 
 65 |         return [new_dec_state], output_logits
 66 | 
 67 |     def symbolic_score(self, inp, out, eps=1e-30, **flags):
 68 |         """
 69 |         Takes symbolic int32 matrices of hebrew words and their english translations.
 70 |         Computes the log-probabilities of all possible english characters given english prefices and hebrew word.
 71 |         :param inp: input sequence, int32 matrix of shape [batch,time]
 72 |         :param out: output sequence, int32 matrix of shape [batch,time]
 73 |         :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens]
 74 | 
 75 |         NOTE: log-probabilities time axis  is synchronized with out
 76 |         In other words, logp are probabilities of __current__ output at each tick, not the next one
 77 |         therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens)
 78 |         """
 79 |         first_state = self.encode(inp,**flags)
 80 | 
 81 |         batch_size = tf.shape(inp)[0]
 82 |         bos = tf.fill([batch_size],self.out_voc.bos_ix)
 83 |         first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps)
 84 | 
 85 |         def step(blob, y_prev):
 86 |             h_prev = blob[:-1]
 87 |             h_new, logits = self.decode(h_prev, y_prev, **flags)
 88 |             return list(h_new) + [logits]
 89 | 
 90 |         results = tf.scan(step,initializer=list(first_state)+[first_logits],
 91 |                           elems=tf.transpose(out))
 92 | 
 93 |         # gather state and logits, each of shape [time,batch,...]
 94 |         states_seq, logits_seq = results[:-1], results[-1]
 95 | 
 96 |         # add initial state and logits
 97 |         logits_seq = tf.concat((first_logits[None], logits_seq),axis=0)
 98 |         states_seq = [tf.concat((init[None], states), axis=0)
 99 |                       for init, states in zip(first_state, states_seq)]
100 | 
101 |         #convert from [time,batch,...] to [batch,time,...]
102 |         logits_seq = tf.transpose(logits_seq, [1, 0, 2])
103 |         states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims)))
104 |                       for states in states_seq]
105 | 
106 |         return tf.nn.log_softmax(logits_seq)
107 | 
108 |     def symbolic_translate(self, inp, greedy=False, max_len = None, eps = 1e-30, **flags):
109 |         """
110 |         takes symbolic int32 matrix of hebrew words, produces output tokens sampled
111 |         from the model and output log-probabilities for all possible tokens at each tick.
112 |         :param inp: input sequence, int32 matrix of shape [batch,time]
113 |         :param greedy: if greedy, takes token with highest probablity at each tick.
114 |             Otherwise samples proportionally to probability.
115 |         :param max_len: max length of output, defaults to 2 * input length
116 |         :return: output tokens int32[batch,time] and
117 |                  log-probabilities of all tokens at each tick, [batch,time,n_tokens]
118 |         """
119 |         first_state = self.encode(inp, **flags)
120 | 
121 |         batch_size = tf.shape(inp)[0]
122 |         bos = tf.fill([batch_size],self.out_voc.bos_ix)
123 |         first_logits = tf.log(tf.one_hot(bos, len(self.out_voc)) + eps)
124 |         max_len = tf.reduce_max(tf.shape(inp)[1])*2
125 | 
126 |         def step(blob,t):
127 |             h_prev, y_prev = blob[:-2], blob[-1]
128 |             h_new, logits = self.decode(h_prev, y_prev, **flags)
129 |             y_new = tf.argmax(logits,axis=-1) if greedy else tf.multinomial(logits,1)[:,0]
130 |             return list(h_new) + [logits, tf.cast(y_new,y_prev.dtype)]
131 | 
132 |         results = tf.scan(step, initializer=list(first_state) + [first_logits, bos],
133 |                           elems=[tf.range(max_len)])
134 | 
135 |         # gather state, logits and outs, each of shape [time,batch,...]
136 |         states_seq, logits_seq, out_seq = results[:-2], results[-2], results[-1]
137 | 
138 |         # add initial state, logits and out
139 |         logits_seq = tf.concat((first_logits[None],logits_seq),axis=0)
140 |         out_seq = tf.concat((bos[None], out_seq), axis=0)
141 |         states_seq = [tf.concat((init[None], states), axis=0)
142 |                       for init, states in zip(first_state, states_seq)]
143 | 
144 |         #convert from [time,batch,...] to [batch,time,...]
145 |         logits_seq = tf.transpose(logits_seq, [1, 0, 2])
146 |         out_seq = tf.transpose(out_seq)
147 |         states_seq = [tf.transpose(states, [1, 0] + list(range(2, states.shape.ndims)))
148 |                       for states in states_seq]
149 | 
150 |         return out_seq, tf.nn.log_softmax(logits_seq)
151 | 
152 | 
153 | 
154 | ### Utility functions ###
155 | 
def initialize_uninitialized(sess = None):
    """
    Run initializers only for global variables that are not initialized yet;
    variables that already hold a value are left untouched.
    :param sess: session to use. Defaults to tf.get_default_session()
    """
    if not sess:
        sess = tf.get_default_session()

    variables = tf.global_variables()
    # one boolean per variable: True if it is already initialized
    init_flags = sess.run([tf.is_variable_initialized(v) for v in variables])
    pending = [v for v, ready in zip(variables, init_flags) if not ready]

    if pending:
        sess.run(tf.variables_initializer(pending))
168 | 
def infer_length(seq, eos_ix, time_major=False, dtype=tf.int32):
    """
    Compute sequence lengths from token indices and the EOS code.
    A sequence's length counts every tick up to and including its first EOS;
    a sequence with no EOS gets the full time dimension.
    :param seq: tf matrix [time,batch] if time_major else [batch,time]
    :param eos_ix: integer index of end-of-sentence token
    :param dtype: dtype used for counting and for the result
    :returns: lengths, vector of `dtype` with shape [batch]
    """
    time_axis = 0 if time_major else 1
    eos_hits = tf.cast(tf.equal(seq, eos_ix), dtype)
    # exclusive cumsum: number of EOS tokens strictly before each tick
    eos_before = tf.cumsum(eos_hits, axis=time_axis, exclusive=True)
    not_past_eos = tf.cast(tf.equal(eos_before, 0), dtype)
    return tf.reduce_sum(not_past_eos, axis=time_axis)
181 | 
def infer_mask(seq, eos_ix, time_major=False, dtype=tf.float32):
    """
    Build a mask that is 1 for ticks up to and including the first EOS of each
    sequence and 0 afterwards.
    :param seq: tf matrix [time,batch] if time_major else [batch,time]
    :param eos_ix: integer index of end-of-sentence token
    :param dtype: dtype of the returned mask
    :returns: mask of `dtype` with '0's and '1's, same shape as seq
    """
    time_axis = 0 if time_major else 1
    seq_lengths = infer_length(seq, eos_ix, time_major=time_major)
    n_ticks = tf.shape(seq)[time_axis]
    # sequence_mask yields [batch,time]; flip it back for time-major input
    batch_major_mask = tf.sequence_mask(seq_lengths, maxlen=n_ticks, dtype=dtype)
    if time_major:
        return tf.transpose(batch_major_mask)
    return batch_major_mask
194 | 
195 | 
def select_values_over_last_axis(values, indices):
    """
    Pick, for every (batch, tick) position, the entry of `values` along the
    last axis that is addressed by `indices`.
    :param values: logits for all actions: float32[batch,tick,action]
    :param indices: action ids int32[batch,tick]
    :returns: values selected for the given actions: float[batch,tick]
    """
    assert values.shape.ndims == 3 and indices.shape.ndims == 2
    idx_shape = tf.shape(indices)
    n_rows, n_ticks = idx_shape[0], idx_shape[1]

    # coordinate grids holding each position's own batch / tick index
    row_ids = tf.tile(tf.range(0, n_rows)[:, None], [1, n_ticks])
    tick_ids = tf.tile(tf.range(0, n_ticks)[None, :], [n_rows, 1])

    # full (batch, tick, action) coordinates for gather_nd
    coords = tf.stack([row_ids, tick_ids, indices], axis=-1)
    return tf.gather_nd(values, coords)
210 | 
211 | 
212 | 
213 | 


--------------------------------------------------------------------------------
/reinforcement-learning/week6/seq2seq/voc.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class Vocab:
 4 |     def __init__(self, tokens, bos="__BOS__", eos="__EOS__", sep=''):
 5 |         """
 6 |         A special class that handles tokenizing and detokenizing
 7 |         """
 8 |         assert bos in tokens, eos in tokens
 9 |         self.tokens = tokens
10 |         self.token_to_ix = {t:i for i,t in enumerate(tokens)}
11 | 
12 |         self.bos = bos
13 |         self.bos_ix = self.token_to_ix[bos]
14 |         self.eos = eos
15 |         self.eos_ix = self.token_to_ix[eos]
16 |         self.sep = sep
17 | 
18 |     def __len__(self):
19 |         return len(self.tokens)
20 | 
21 |     @staticmethod
22 |     def from_lines(lines, bos="__BOS__", eos="__EOS__", sep=''):
23 |         flat_lines = sep.join(list(lines))
24 |         flat_lines = list(flat_lines.split(sep)) if sep != '' else list(flat_lines)
25 |         tokens = list(set(sep.join(flat_lines)))
26 |         tokens = [t for t in tokens if t not in (bos,eos) and len(t) != 0]
27 |         tokens = [bos,eos] + tokens
28 |         return Vocab(tokens,bos,eos,sep)
29 | 
30 |     def tokenize(self,string):
31 |         """converts string to a list of tokens"""
32 |         tokens = list(filter(len,string.split(self.sep))) \
33 |                     if self.sep != '' else list(string)
34 |         return [self.bos] + tokens + [self.eos]
35 | 
36 |     def to_matrix(self, lines, max_len=None):
37 |         """
38 |         convert variable length token sequences into  fixed size matrix
39 |         example usage:
40 |         >>>print( as_matrix(words[:3],source_to_ix))
41 |         [[15 22 21 28 27 13 -1 -1 -1 -1 -1]
42 |          [30 21 15 15 21 14 28 27 13 -1 -1]
43 |          [25 37 31 34 21 20 37 21 28 19 13]]
44 |         """
45 |         max_len = max_len or max(map(len, lines)) + 2 # 2 for bos and eos
46 | 
47 |         matrix = np.zeros((len(lines), max_len), dtype='int32') + self.eos_ix
48 |         for i, seq in enumerate(lines):
49 |             tokens = self.tokenize(seq)
50 |             row_ix = list(map(self.token_to_ix.get, tokens))[:max_len]
51 |             matrix[i, :len(row_ix)] = row_ix
52 | 
53 |         return matrix
54 | 
55 |     def to_lines(self, matrix, crop=True):
56 |         """
57 |         Convert matrix of token ids into strings
58 |         :param matrix: matrix of tokens of int32, shape=[batch,time]
59 |         :param crop: if True, crops BOS and EOS from line
60 |         :return:
61 |         """
62 |         lines = []
63 |         for line_ix in map(list,matrix):
64 |             if crop:
65 |                 if line_ix[0] == self.bos_ix:
66 |                     line_ix = line_ix[1:]
67 |                 if self.eos_ix in line_ix:
68 |                     line_ix = line_ix[:line_ix.index(self.eos_ix)]
69 |             line = self.sep.join(self.tokens[i] for i in line_ix)
70 |             lines.append(line)
71 |         return lines
72 | 


--------------------------------------------------------------------------------