├── .ipynb_checkpoints ├── Code_Report-checkpoint.ipynb └── Data_Preprocessing-checkpoint.ipynb ├── Data_Preprocessing.ipynb ├── DeepAE_Rec.ipynb ├── README.md ├── Report.md ├── Report.pdf └── img ├── 512_256_512_001_0.png ├── 512_256_512_001_0_rmse.png ├── 512_256_512_001_loss.png ├── 512_256_512_avg_001.png ├── 6.png ├── AutoEncoder.png ├── AutoRec.png ├── Screen Shot 2019-04-09 at 11.15.51 PM.png ├── Screen Shot 2019-04-09 at 11.27.53 PM.png ├── Screen Shot 2019-04-09 at 11.29.47 PM.png ├── age_gender.png ├── average_elu_elu_0002.png ├── average_elu_elu_1000.png ├── average_elu_elu_500.png ├── denoise.png ├── download.png └── dropout noise.png /.ipynb_checkpoints/Code_Report-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Data_Preprocessing-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Define constants\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "BASE_DIR = 'data' \n", 34 | "MOVIELENS_DIR = BASE_DIR + '/ml-1m/'\n", 35 | "USER_DATA_FILE = 'users.dat'\n", 36 | "MOVIE_DATA_FILE = 'movies.dat'\n", 37 | "RATING_DATA_FILE = 'ratings.dat'\n", 38 | "AGES = { 1: \"Under 18\", 18: \"18-24\", 25: \"25-34\", 35: \"35-44\", 45: \"45-49\", 50: \"50-55\", 56: \"56+\" }\n", 39 | "OCCUPATIONS = { 0: \"other or not specified\", 1: \"academic/educator\", 2: \"artist\", 3: \"clerical/admin\",\n", 40 | " 4: \"college/grad student\", 5: \"customer service\", 6: \"doctor/health care\",\n", 41 | " 7: \"executive/managerial\", 8: \"farmer\", 9: \"homemaker\", 10: \"K-12 student\", 11: \"lawyer\",\n", 42 | " 12: \"programmer\", 13: \"retired\", 14: \"sales/marketing\", 15: \"scientist\", 16: \"self-employed\",\n", 43 | " 17: \"technician/engineer\", 18: \"tradesman/craftsman\", 19: \"unemployed\", 20: \"writer\" }\n", 44 | "RATINGS_CSV_FILE = 'ml1m_ratings.csv'\n", 45 | "USERS_CSV_FILE = 'ml1m_users.csv'\n", 46 | "MOVIES_CSV_FILE = 'ml1m_movies.csv'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Download MovieLens 1M data\n", 54 | "\n", 55 | "The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "1000209 ratings loaded\n", 68 | "Saved to ml1m_ratings.csv\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), \n", 74 | " sep='::', \n", 75 | " engine='python', \n", 76 | " encoding='latin-1',\n", 77 | " names=['userid', 'movieid', 'rating', 'timestamp'])\n", 78 | "\n", 79 | "max_userid = ratings['userid'].drop_duplicates().max()\n", 80 | "max_movieid = ratings['movieid'].drop_duplicates().max()\n", 81 | "ratings['user_emb_id'] = ratings['userid'] - 1\n", 82 | "ratings['movie_emb_id'] = ratings['movieid'] - 1\n", 83 | "print(len(ratings), 'ratings loaded')\n", 84 | "ratings.to_csv(RATINGS_CSV_FILE, \n", 85 | " sep='\\t', \n", 86 | " header=True, \n", 87 | " encoding='latin-1', \n", 88 | " columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])\n", 89 | "print('Saved to', RATINGS_CSV_FILE)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "6040 descriptions of 6040 users loaded.\n", 102 | "Saved to ml1m_users.csv\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE), \n", 108 | " sep='::', \n", 109 | " engine='python', \n", 110 | " encoding='latin-1',\n", 111 | " names=['userid', 'gender', 'age', 'occupation', 'zipcode'])\n", 112 | "users['age_desc'] = users['age'].apply(lambda x: AGES[x])\n", 113 | "users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])\n", 114 | "print(len(users), 'descriptions of', max_userid, 'users loaded.')\n", 115 | "users['user_emb_id'] = users['userid'] - 1\n", 116 | "users.to_csv(USERS_CSV_FILE, \n", 117 | " sep='\\t', \n", 118 | " header=True, \n", 119 | " encoding='latin-1',\n", 120 | " columns=['user_emb_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])\n", 121 | "print('Saved to', USERS_CSV_FILE)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 9, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "3883 descriptions of 3952 movies loaded.\n", 134 | "Saved to ml1m_movies.csv\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), \n", 140 | " sep='::', \n", 141 | " engine='python', \n", 142 | " encoding='latin-1',\n", 143 | " names=['movieid', 'title', 'genre'])\n", 144 | "print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')\n", 145 | "movies['movie_emb_id'] = movies['movieid'] - 1\n", 146 | "movies.to_csv(MOVIES_CSV_FILE, \n", 147 | " sep='\\t', \n", 148 | " header=True, \n", 149 | " columns=['movie_emb_id', 'title', 'genre'])\n", 150 | "print('Saved to', MOVIES_CSV_FILE)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 10, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "6040 of the 6040 users rate at least one movie.\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "print(len(ratings['userid'].drop_duplicates()), 'of the', max_userid, 'users rate at least one movie.')" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 11, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "3706 of the 3952 movies are rated.\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print(len(ratings['movieid'].drop_duplicates()), 'of the', max_movieid, 'movies are rated.')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Autorec_Keraa3_1516", 198 | "language": "python", 199 | "name": "autorec_kera" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.6.8" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 1 216 | } 217 | -------------------------------------------------------------------------------- /Data_Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Define constants\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "BASE_DIR = 'data' \n", 34 | "MOVIELENS_DIR = BASE_DIR + '/ml-1m/'\n", 35 | "USER_DATA_FILE = 'users.dat'\n", 36 | "MOVIE_DATA_FILE = 'movies.dat'\n", 37 | "RATING_DATA_FILE = 'ratings.dat'\n", 38 | "\n", 39 | "#from http://files.grouplens.org/datasets/movielens/ml-1m-README.txt\n", 40 | "\n", 41 | "AGES = { 1: \"Under 18\", 18: \"18-24\", 25: \"25-34\", 35: \"35-44\", 45: \"45-49\", 50: \"50-55\", 56: \"56+\" }\n", 42 | "OCCUPATIONS = { 0: \"other or not specified\", 1: \"academic/educator\", 2: \"artist\", 3: \"clerical/admin\",\n", 43 | " 4: \"college/grad student\", 5: \"customer service\", 6: \"doctor/health care\",\n", 44 | " 7: \"executive/managerial\", 8: \"farmer\", 9: \"homemaker\", 10: \"K-12 student\", 11: \"lawyer\",\n", 45 | " 12: \"programmer\", 13: \"retired\", 14: \"sales/marketing\", 15: \"scientist\", 16: \"self-employed\",\n", 46 | " 17: \"technician/engineer\", 18: \"tradesman/craftsman\", 19: \"unemployed\", 20: \"writer\" }\n", 47 | "RATINGS_CSV_FILE = 'ml1m_ratings.csv'\n", 48 | "USERS_CSV_FILE = 'ml1m_users.csv'\n", 49 | "MOVIES_CSV_FILE = 'ml1m_movies.csv'" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Download MovieLens 1M data\n", 57 | "\n", 58 | "The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "1000209 ratings loaded\n", 71 | "Saved to ml1m_ratings.csv\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), \n", 77 | " sep='::', \n", 78 | " engine='python', \n", 79 | " encoding='latin-1',\n", 80 | " names=['userid', 'movieid', 'rating', 'timestamp'])\n", 81 | "\n", 82 | "max_userid = ratings['userid'].drop_duplicates().max()\n", 83 | "max_movieid = ratings['movieid'].drop_duplicates().max()\n", 84 | "ratings['user_emb_id'] = ratings['userid'] - 1\n", 85 | "ratings['movie_emb_id'] = ratings['movieid'] - 1\n", 86 | "print(len(ratings), 'ratings loaded')\n", 87 | "ratings.to_csv(RATINGS_CSV_FILE, \n", 88 | " sep='\\t', \n", 89 | " header=True, \n", 90 | " encoding='latin-1', \n", 91 | " columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])\n", 92 | "print('Saved to', RATINGS_CSV_FILE)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "6040 descriptions of 6040 users loaded.\n", 105 | "Saved to ml1m_users.csv\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE), \n", 111 | " sep='::', \n", 112 | " engine='python', \n", 113 | " encoding='latin-1',\n", 114 | " names=['userid', 'gender', 'age', 'occupation', 'zipcode'])\n", 115 | "users['age_desc'] = users['age'].apply(lambda x: AGES[x])\n", 116 | "users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])\n", 117 | "print(len(users), 'descriptions of', max_userid, 'users loaded.')\n", 118 | "users['user_emb_id'] = users['userid'] - 1\n", 119 | "users.to_csv(USERS_CSV_FILE, \n", 120 | " sep='\\t', \n", 121 | " header=True, \n", 122 | " encoding='latin-1',\n", 123 | " columns=['user_emb_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])\n", 124 | "print('Saved to', USERS_CSV_FILE)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 9, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "3883 descriptions of 3952 movies loaded.\n", 137 | "Saved to ml1m_movies.csv\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), \n", 143 | " sep='::', \n", 144 | " engine='python', \n", 145 | " encoding='latin-1',\n", 146 | " names=['movieid', 'title', 'genre'])\n", 147 | "print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')\n", 148 | "movies['movie_emb_id'] = movies['movieid'] - 1\n", 149 | "movies.to_csv(MOVIES_CSV_FILE, \n", 150 | " sep='\\t', \n", 151 | " header=True, \n", 152 | " columns=['movie_emb_id', 'title', 'genre'])\n", 153 | "print('Saved to', MOVIES_CSV_FILE)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "6040 of the 6040 users rate at least one movie.\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "print(len(ratings['userid'].drop_duplicates()), 'of the', max_userid, 'users rate at least one movie.')" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 11, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "3706 of the 3952 movies are rated.\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "print(len(ratings['movieid'].drop_duplicates()), 'of the', max_movieid, 'movies are rated.')" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": "Autorec_Keraa3_1516", 201 | "language": "python", 202 | "name": "autorec_kera" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.6.8" 215 | } 216 | }, 217 | "nbformat": 4, 218 | "nbformat_minor": 1 219 | } 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-AutoEncoder-Recommendation 2 | 3 | Autoencoder has been widely adopted into Collaborative Filtering (CF) for recommendation system. A classic CF problem is inferring the missing rating in an MxN matrix R where R(i, j) is the ratings given by the ith user to the jth item. This project is a Keras implementation of AutoRec [1] and Deep AutoRec [2] with additional experiments such as the impact of default rating of users or ratings. 4 | 5 | The Dataset I used for this project is MovieLens 1M Dataset and can be downloaded from [here](). 6 | 7 | The preprocessing of the dataset can be found in this [Jupyter Notebook]() 8 | 9 | The implementation of models in Keras can be found in this [Jupyter Notebook]() 10 | 11 | ## Reference 12 | 13 | [1] Sedhain, Suvash, et al. "Autorec: Autoencoders meet collaborative filtering." *Proceedings of the 24th International Conference on World Wide Web*. ACM, 2015 14 | 15 | [2] Kuchaiev, Oleksii, and Boris Ginsburg. "Training deep autoencoders for collaborative filtering." *arXiv preprint arXiv:1708.01715* (2017). 16 | 17 | [3]Wu, Yao, et al. "Collaborative denoising auto-encoders for top-n recommender systems." *Proceedings of the Ninth ACM International Conference on Web Search and Data Mining*. ACM, 2016. 18 | 19 | [4]Strub, Florian, Jérémie Mary, and Romaric Gaudel. "Hybrid collaborative filtering with autoencoders." *arXiv preprint arXiv:1603.00806* (2016). 20 | 21 | 22 | 23 | ## Github Reference 24 | 25 | https://github.com/NVIDIA/DeepRecommender 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /Report.md: -------------------------------------------------------------------------------- 1 | # Deep Autoencoder Recommendation 2 | 3 | ## Table of Contents 4 | 5 | 1. Introduction 6 | 2. Data Preprocessing 7 | 3. AutoRec and Experiments 8 | 4. Deep AutoRec and Experiments 9 | 5. Denoising Experiments 10 | 6. Hybrid Experiments 11 | 7. Other Experiments 12 | 8. Conclusion & Future Work 13 | 9. Reference 14 | 15 | The Jupyter Note book of this project can be found [here](https://github.com/RaptorMai/Deep-AutoEncoder-Recommendation) 16 | 17 | ## 1. Introduction 18 | 19 | Autoencoder has been widely adopted into Collaborative Filtering (CF) for recommendation system. A classic CF problem is inferring the missing rating in an MxN matrix R where R(i, j) is the ratings given by the ith user to the jth item. This project is a Keras implementation of AutoRec [1] and Deep AutoRec [2] and additional experiments will be run. 20 | 21 | The data I used is MovieLens 1M Dataset. 22 | 23 | ## 2. Data Preprocessing 24 | 25 | ####Raw data preprocessing 26 | 27 | The raw data file is separated by ```::``` without headers. This part is transforming the raw data file into a CSV with headers, which can be easily imported using Pandas in the following parts. All the user and movie id will be subtracted by 1 for zero-based index. The snippet shows the preprocessing for rating data and similar preprocessing is applied to users data and movies data. 28 | 29 | ```python 30 | ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), 31 | sep='::', 32 | engine='python', 33 | encoding='latin-1', 34 | names=['userid', 'movieid', 'rating', 'timestamp']) 35 | 36 | ratings['user_emb_id'] = ratings['userid'] - 1 37 | ratings['movie_emb_id'] = ratings['movieid'] - 1 38 | print(len(ratings), 'ratings loaded') 39 | ratings.to_csv(RATINGS_CSV_FILE, 40 | sep='\t', 41 | header=True, 42 | encoding='latin-1', 43 | columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp']) 44 | print('Saved to', RATINGS_CSV_FILE) 45 | ``` 46 | 47 | 48 | 49 | #### Train, Validation and Test Split 50 | 51 | ```python 52 | df = pd.read_csv('ml1m_ratings.csv',sep='\t', encoding='latin-1', 53 | usecols=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp']) 54 | num_users = df['user_emb_id'].unique().max() + 1 55 | num_movies = df['movie_emb_id'].unique().max() + 1 56 | 57 | train_df, test_df = train_test_split(df, 58 | stratify=df['user_emb_id'], 59 | test_size=0.1, 60 | random_state=999613182) 61 | train_df, validate_df = train_test_split(train_df, 62 | stratify=train_df['user_emb_id'], 63 | test_size=0.1, 64 | random_state=999613182) 65 | ``` 66 | 67 | I split the data into random 90%–10% train-test sets, and hold out 10% of the training set for validation. 68 | 69 | I used a fixed random_state = 999613182 for reproduction. When I use ```train_test_split``` from ```sklearn``` , I use **Stratify with user_id** . This setting is critical, without this setting, it's possible that reviews of one user are all split into one the training or test set and cause bias. For example if all the reviews of user A are put into the training set, then during test time, there is no test data for this user. The test RMSE will be 0 for this user. On the other hand, if all reviews are put into test set, then there is no review for this user during training time and cause the RMSE higher for this user. 70 | 71 | 72 | 73 | #### Transform DataFrame to matrix 74 | 75 | ![Screen Shot 2019-04-09 at 11.15.51 PM](/Users/zhedamai/Desktop/MIE1516/project/Deep-AutoEncoder-Recommendation/img/Screen Shot 2019-04-09 at 11.15.51 PM.png) 76 | 77 | ​ Figure 1: Dataframe containing ratings data 78 | 79 | ```python 80 | def dataPreprocessor(rating_df, num_users, num_items, init_value=0, average=False): 81 | """ 82 | INPUT: 83 | data: pandas DataFrame. columns=['index', 'userID', 'itemID', 'rating' ...] 84 | num_row: int. number of users 85 | num_col: int. number of items 86 | 87 | OUTPUT: 88 | matrix: 2D numpy array. 89 | """ 90 | if average: 91 | matrix = np.full((num_users, num_items), 0.0) 92 | for (_, userID, itemID, rating, timestamp) in rating_df.itertuples(): 93 | matrix[userID, itemID] = rating 94 | avergae = np.true_divide(matrix.sum(1), np.maximum((matrix!=0).sum(1), 1)) 95 | inds = np.where(matrix == 0) 96 | matrix[inds] = np.take(avergae, inds[0]) 97 | 98 | else: 99 | matrix = np.full((num_users, num_items), init_value) 100 | for (_, userID, itemID, rating, timestamp) in rating_df.itertuples(): 101 | matrix[userID, itemID] = rating 102 | 103 | return matrix 104 | ``` 105 | 106 | In order to apply AutoRec on the dataset, the dataset should be transformed to a MxN matrix where R(i, j) is the ratings given by the ith user to the jth item. 107 | 108 | The function ```dataPreprocessor``` is used for this transformation. The init_value is the default rating for unobserved ratings. If ```average ``` is set to ```True```, the unobvserved rating will be set as the average rating of the user. 109 | 110 | 111 | 112 | ## 3. AutoRec and Experiment 113 | 114 | ### AutoRec 115 | 116 | drawing 117 | 118 | ​ Figure2: Item-based AutoRec Model shown in the the AutoRec Paper[1] 119 | 120 | 121 | 122 | The model I am going to implement is a user-based AutoRec, which take the partially observed ratings vector of a user, project it into a low dimensional latent space and then reconstruct back to the output space to predict the missing rating. 123 | 124 | 125 | 126 | #### Loss Function 127 | 128 | Since it does not make sense to predict zero in the user rating vector, I follow the AutoRec paper to minimize the Masked Mean Squared Error(MMSE). 129 | 130 | drawing 131 | 132 | where ri is the actual rating and yi is the reconstructed rating. mi is a mask function where mi =1 where ri is non-zero else mi=0. 133 | 134 | Since Masked Mean Squared Error is not provided in Keras, so I need to customize the error function. 135 | 136 | ```python 137 | def masked_mse(y_true, y_pred): 138 | # masked function 139 | mask_true = K.cast(K.not_equal(y_true, 0), K.floatx()) 140 | # masked squared error 141 | masked_squared_error = K.square(mask_true * (y_true - y_pred)) 142 | # average the error 143 | masked_mse = K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, 144 | axis=-1), 1) 145 | return masked_mse 146 | ``` 147 | 148 | 149 | 150 | #### Metric 151 | 152 | The performance of the model is measured by the Masked Root Mean Squared Error (MRMSE). Similar to MMSE, we only take into consideration the error where the rating is not zero in the test set. Also, I clip the predicted rating with 1 as minimum and 5 as maximum. 153 | 154 | ```python 155 | def masked_rmse_clip(y_true, y_pred): 156 | # masked function 157 | mask_true = K.cast(K.not_equal(y_true, 0), K.floatx()) 158 | # clipping the rating 159 | y_pred = K.clip(y_pred, 1, 5) 160 | # masked squared error 161 | masked_squared_error = K.square(mask_true * (y_true - y_pred)) 162 | # square root 163 | masked_mse = K.sqrt(K.sum(masked_squared_error, axis=-1) / 164 | K.maximum(K.sum(mask_true, axis=-1), 1)) 165 | return masked_mse 166 | ``` 167 | 168 | 169 | 170 | #### Model 171 | 172 | ```python 173 | def AutoRec(X, reg, first_activation, last_activation): 174 | ''' 175 | AutoRec 176 | ''' 177 | input_layer = x = Input(shape=(X.shape[1],), name='UserRating') 178 | 179 | x = Dense(500, activation=first_activation, name='LatentSpace', 180 | kernel_regularizer=regularizers.l2(reg))(x) 181 | 182 | output_layer = Dense(X.shape[1], activation=last_activation, name='UserScorePred', 183 | kernel_regularizer=regularizers.l2(reg))(x) 184 | 185 | model = Model(input_layer, output_layer) 186 | 187 | return model 188 | 189 | _________________________________________________________________ 190 | Layer (type) Output Shape Param # 191 | ================================================================= 192 | UserRating (InputLayer) (None, 3952) 0 193 | _________________________________________________________________ 194 | LatentSpace (Dense) (None, 500) 1976500 195 | _________________________________________________________________ 196 | UserScorePred (Dense) (None, 3952) 1979952 197 | ================================================================= 198 | Total params: 3,956,452 199 | Trainable params: 3,956,452 200 | Non-trainable params: 0 201 | _________________________________________________________________ 202 | ``` 203 | 204 | 205 | 206 | ​ 207 | 208 | #### Baseline Settings 209 | 210 | ```python 211 | # Build model 212 | AutoRec = AutoRec(users_items_matrix_train_zero) 213 | AutoRec.compile(optimizer = Adam(lr=0.0001), loss=masked_mse, metrics=[masked_rmse_clip]) 214 | hist_Autorec = AutoRec.fit(x=users_items_matrix_train_zero, y=users_items_matrix_train_zero, 215 | epochs=500, 216 | batch_size=256, 217 | verbose = 2, 218 | validation_data=[users_items_matrix_train_zero, 219 | users_items_matrix_validate]) 220 | ``` 221 | 222 | 223 | 224 | | L2 Regularization | Optimizer | Learning Rate | Epochs | Batch Size | Activations | Default Rating | 225 | | :---------------: | :-------: | :-----------: | ------ | :--------: | ---------------- | :------------: | 226 | | 0.001 | Adam | 0.0001 | 500 | 256 | Sigmoid + Linear | 0 | 227 | 228 | ​ Table1: Baseline settings of AutoRec 229 | 230 | ###Experiments 231 | 232 | #### Activations 233 | 234 | In the AutoRec paper, it only tested sigmoid and linear. Since the Deep_AE_CF paper [2] found that 235 | activations with **non-zero negative part** and **unbounded positive part** perform better for deep autoencoders for CF. So I tried ELU, SELU and LRELU on AutoRec. 236 | 237 | | Activation | **Modification** | **Test RMSE** | **Train RMSE** | 238 | | :-----------: | :-----------------------------------------------------: | :-----------: | :------------: | 239 | | Baseline | Keep L2 $$\lambda$$ 0.001 | ***0.916*** | ***0.914*** | 240 | | ELU, ELU | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.931 | 0.944 | 241 | | ELU, Linear | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.9338 | 0.943 | 242 | | SELU, SELU | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.9431 | 0.9543 | 243 | | SELU, Linear | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.957 | 0.964 | 244 | | LRelu, LRelu | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.9336 | 0.945 | 245 | | LRelu, Linear | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit | 0.9386 | 0.9486 | 246 | | Sigmoid, ELU | Keep L2 $$\lambda$$ 0.001 | ***0.9137*** | ***0.9156*** | 247 | | Tanh, ELU | Keep L2 $$\lambda$$ 0.001 | 0.9454 | 0.963 | 248 | 249 | ​ Table2: Comparing Activation Functions on AutoRec 250 | 251 | All the hyper-parameters including L2 $$\lambda$$ , learning rate and epochs are not fine-tuned, so the result is not as good as the AutoRec paper. But we can find out activations that perform well in Deep Autoencoder CF do not outperform the Sigmoid+Linear baseline. Also, we found that when changing the activation from Sigmod to other activations with unbounded positive part, the model is easier to overfit. 252 | 253 | 254 | 255 | #### Default Rating 256 | 257 | In the Deep_AE_CF paper, the default rating is 0 while in the AutoRec paper, the default rating is 3. So, I decided to tried different default ratings. 258 | 259 | | Default Rating | Modification | Test RMSE | Train RMSE | 260 | | :------------: | :----------: | :-------: | :--------: | 261 | | 0 (Baseline) | N/A | **0.916** | 0.914 | 262 | | 1 | N/A | 0.926 | 0.932 | 263 | | 2 | N/A | 0.933 | 0.944 | 264 | | 3 | N/A | 0.945 | 0.96 | 265 | | 4 | N/A | 0.9445 | 0.960 | 266 | | 5 | N/A | 0.943 | 0.958 | 267 | | Average | N/A | **0.912** | 0.923 | 268 | 269 | ​ Table3: Comparing default ratings for unobserved rating 270 | 271 | 272 | 273 | drawing 274 | 275 | ​ Figure3: Average Vs 0 as default rating 276 | 277 | As we can see, average and 0 outperform other default ratings. When we comparing the Val_RMSE of 0 and average, we found that **using the average as default rating converge much faster than using 0 but with more noise.** 278 | 279 | 280 | 281 | #### Unexpected finding 282 | 283 | Before I moved to the next experiment, I tried ELU + ELU as activations with the avergae as default rating, it turns out to work pretty well with the default L2 $$\lambda$$ 0.001. For Autorect, it seems this is the best model we have so far. The gap between validation loss and training loss is pretty small and increasing the regularization did not help, so I think we should move deeper to increase the model complexity. 284 | 285 | | Default Rating | Modification | Test RMSE | Train RMSE | 286 | | :------------: | :----------: | :-------: | :--------: | 287 | | 0 (Baseline) | | 0.916 | 0.914 | 288 | | Average | ELU+ELU | **0.877** | 0.806 | 289 | | Average | SELU+SELU | **0.878** | 0.748 | 290 | 291 | ​ Table4: Use Avergae as default rating works good using ELU+ELU on AutoRec 292 | 293 | 294 | 295 |

296 | 297 | 298 |

299 | 300 | 301 | 302 | ​ Figure4: Training Vs Validation Masked RMSE and Loss (Average, ELU+ELU) 303 | 304 | drawing 305 | 306 | ​ Figure5: Test RMSE as the regularization parameters varies 307 | 308 | 309 | 310 | ## 4. Deep AutoRec and Experiments 311 | 312 | ### Deep Autoencoder Collaborative Filtering 313 | 314 | 315 | 316 | drawing 317 | 318 | This paper continued the AutoRec idea to deepen Autoencoder. It uses MMSE as loss function, same as AutoRec. It proposes activation functions with non-zero negative part and unbounded positive part works better. Also, it uses dropout layers after the latent layer to avoid overfitting. It also shows that using large dropout rate after the latent layer allows it to learn robust representations. 319 | 320 | 321 | 322 | ```python 323 | def Deep_AE_model(X, layers, activation, last_activation, dropout, regularizer_encode, regularizer_decode): 324 | ''' 325 | Build Deep AE for CF 326 | INPUT: 327 | X: #_user X #_item matrix 328 | layers: List, each element is the number of neuron for a layer 329 | reg: L2 regularization parameter 330 | activation: activation function for all dense layer except the last 331 | last_activation: activation function for the last dense layer 332 | dropout: dropout rate 333 | regularizer_encode: regularizer for encoder 334 | regularizer_decode: regularizer for decoder 335 | OUTPUT: 336 | Keras model 337 | ''' 338 | 339 | # Input 340 | input_layer = x = Input(shape=(X.shape[1],), name='UserRating') 341 | 342 | # Encoder 343 | # ----------------------------- 344 | k = int(len(layers)/2) 345 | i = 0 346 | for l in layers[:k]: 347 | x = Dense(l, activation=activation, 348 | name='EncLayer{}'.format(i), 349 | kernel_regularizer=regularizers.l2(regularizer_encode))(x) 350 | i = i+1 351 | 352 | 353 | # Latent Space 354 | # ----------------------------- 355 | x = Dense(layers[k], activation=activation, 356 | name='LatentSpace', kernel_regularizer=regularizers.l2(regularizer_encode))(x) 357 | 358 | # Dropout 359 | x = Dropout(rate = dropout)(x) 360 | 361 | # Decoder 362 | # ----------------------------- 363 | for l in layers[k+1:]: 364 | i = i-1 365 | x = Dense(l, activation=activation, 366 | name='DecLayer{}'.format(i), 367 | kernel_regularizer=regularizers.l2(regularizer_decode))(x) 368 | 369 | # Output 370 | output_layer = Dense(X.shape[1], activation=last_activation, name='UserScorePred', 371 | kernel_regularizer=regularizers.l2(regularizer_decode))(x) 372 | 373 | # this model maps an input to its reconstruction 374 | model = Model(input_layer, output_layer) 375 | 376 | return model 377 | 378 | Example Summary 379 | _________________________________________________________________ 380 | Layer (type) Output Shape Param # 381 | ================================================================= 382 | UserRating (InputLayer) (None, 3952) 0 383 | _________________________________________________________________ 384 | EncLayer0 (Dense) (None, 128) 505984 385 | _________________________________________________________________ 386 | EncLayer1 (Dense) (None, 256) 33024 387 | _________________________________________________________________ 388 | LatentSpace (Dense) (None, 512) 131584 389 | _________________________________________________________________ 390 | dropout_1 (Dropout) (None, 512) 0 391 | _________________________________________________________________ 392 | DecLayer1 (Dense) (None, 256) 131328 393 | _________________________________________________________________ 394 | DecLayer0 (Dense) (None, 128) 32896 395 | _________________________________________________________________ 396 | UserScorePred (Dense) (None, 3952) 509808 397 | ================================================================= 398 | Total params: 1,344,624 399 | Trainable params: 1,344,624 400 | Non-trainable params: 0 401 | ``` 402 | 403 | Baseline setting 404 | 405 | | L2 Regularization | Optimizer | Learning Rate | Epochs | Batch Size | Activations | Default Rating | Dropout | 406 | | :---------------: | :-------: | :-----------: | ------ | :--------: | ----------- | :------------: | :-----: | 407 | | 0.001 | Adam | 0.0001 | 500 | 256 | SELU+SELU | 0 | 0.8 | 408 | 409 | 410 | 411 | ####Architecture 412 | 413 | In the paper, it proposes different architectures for different Netflix datasets. But I found that all architectures have the largest layer in the middle and smaller layers at the begining and the end. I called this structure Small-Big-Small(SBS). But conventional autoencoders have the Big-Small-Big (BSB) structure. I decided to try both structures on our dataset. 414 | 415 | | Architecture | Shape (Depth) | Number of parameters | Default Rating | Modification | Test RMSE | Train RMSE | 416 | | :-----------------------: | :-------------: | :------------------: | :--------------: | :----------------------------: | :-------: | :--------: | 417 | | [512, 256, 512] | BSB (3) | 4.3M | Avergae | | 0.8792 | 0.840 | 418 | | [512, 256, 512] | BSB(3) | 4.3M | Zero | | **0.856** | 0.764 | 419 | | [512, 256, 128, 256, 512] | BSB(5) | 4.4M | Average | | 0.895 | 0.87 | 420 | | [512, 256, 128, 256, 512] | BSB(5) | 4.4M | Zero | L2 $$\lambda$$ 0.001 to 0.0015 | 0.869 | 0.827 | 421 | | [256, 512, 256] | SBS(3) | 2.3M | Avergage | | 0.878 | 0.85 | 422 | | [256, 512, 256] | SBS(3) | 2.3M | Zero | | **0.857** | 0.760 | 423 | | [128, 256, 512, 256, 128] | SBS(5) | 1.3M | Average | | 0.881 | 0.87 | 424 | | [128, 256, 512, 256, 128] | SBS(5) | 1.3M | Zero | L2 $$\lambda$$ 0.001 to 0.0015 | 0.868 | 0.84 | 425 | 426 | ​ Table5: Comparison of differnt acrchitetures 427 | 428 | 429 | 430 | ####Comparison between average Vs zero as default rating 431 | 432 | I took [512, 256, 512] as an example. Other architectures have similar phenomena. 433 | 434 |

435 |

436 | 437 | ​ Figure6: [512, 256, 512] with average as default rating 438 | 439 |

440 |

441 | 442 | ​ Figure7: [512, 256, 512] with zero as default rating 443 | 444 | When we compared the average and zero as default rating in AutoRec, we found that average converged faster but with noise. But when the model goes deeper, **the zero default rating converged faster and with less noise.** However, when we take a look at the loss, the gap between training and validation is larger in zero default setting. This means when we use zero as default rating, the model is easier to overfit. 445 | 446 | Also, as we can see in table5, adding more layers does not help for both BSB and SBS shape. As we go deeper, it’s easier to get overfitted and increasing the regularization parameters will bring the test performance down. So, in our project, using three hidden layers is the best option. Moreover, [512, 256, 512] and [256, 512, 256] have similar performance but [256, 512, 256] has half the number of parameters. So I will use [256, 512, 256] in further experiments, as fewer parameters not only allows us to train model with less data but also can mitigate overfitting. 447 | 448 | ## 5. Denoising and Hybrid Experiment 449 | 450 | Common corruption choices are the additive Gaussian noise and multiplicative dropout noise. In the Denoising paper[3], it only used multiplicative dropout noise and I am going to test both. 451 | 452 | #### Gaussian Noise 453 | 454 | Since [256, 512, 256]+zero has the best performance, we test the denoising on this setting. 455 | 456 | ```python 457 | noise_factor = 0.1 458 | users_items_matrix_train_average_noisy = users_items_matrix_train_average + noise_factor * np.random.normal(size=users_items_matrix_train_zero.shape) 459 | ``` 460 | 461 | drawing 462 | 463 | ​ Figure8: Test RMSE on different Gaussian Noise constant 464 | 465 | According to Figure8, adding Gaussian Noise did not improve the model. As default rating has an impact on the performance, adding noise is changing the default rating and this may be one potential reason. Deep AutoRec has the similar graph as AutoRec 466 | 467 | 468 | 469 | ####Dropout Noise 470 | 471 | In the denoising paper[3], it masked out non-zero elements randomly in each batch and use the masked input. However, using Keras to implement this feature will be the same as using pure TensorFlow. Due to the time limit of this project, I will leave this as future work and I made a compromise by adding a dropout layer between input and first dense layer. This dropout will mask out all elements randomly with a dropout rate. As we can see in Figure8, when the dropout rate increase for the noise, the RMSE started increasing. When the rate was 0.1, the performance actually was better than the baseline but since it’s only 0.002 difference, it may still be in the range of error. It needs cross-validation for further verification. 472 | 473 | ```python 474 | # Input 475 | input_layer = x = Input(shape=(X.shape[1],), name='UserRating') 476 | 477 | # Dropout Noise 478 | x = Dropout(rate = noise)(x) 479 | 480 | # Encoder 481 | # ----------------------------- 482 | k = int(len(layers)/2) 483 | i = 0 484 | for l in layers[:k]: 485 | x = Dense(l, activation=activation, 486 | name='EncLayer{}'.format(i), 487 | kernel_regularizer=regularizers.l2(regularizer_encode))(x) 488 | i = i+1 489 | ``` 490 | 491 | drawing 492 | 493 | ​ Figure8: Test RMSE on different Dropout Noise 494 | 495 | ## 6. Hybrid Experiments 496 | 497 | Since we have the information about each user, I want to try adding the side-information in this model. 498 | 499 | For each user, we have gender, age and occupation and after transforming to one hot encoding format, each user has totally 30 features. 500 | 501 | ```python 502 | user_df = pd.read_csv('ml1m_users.csv',sep='\t', encoding='latin-1', 503 | usecols=['user_emb_id', 'gender', 'age', 'occupation']) 504 | user_df['age'] = preprocessing.LabelEncoder(). 505 | fit(user_df['age']).transform(user_df['age']) 506 | 507 | user_df['gender']=preprocessing.LabelEncoder().fit(user_df['gender']). 508 | transform(user_df['gender']) 509 | 510 | onehot_df = preprocessing.OneHotEncoder(handle_unknown='ignore',sparse=False). 511 | fit(user_df[['gender', 'age','occupation']]). 512 | transform(user_df[['gender', 'age', 'occupation']]) 513 | ``` 514 | 515 | ### Concatenate side-information to rating 516 | 517 | For this method, I concatenated the side information to the rating matrix, so the shape of the matrix will be changed from 6040x3952 to 6040x3982. We still want to reconstruct only the rating matrix, so the output shape is 6040x3952. The only change in the code is I add a new argument called side_infor_size in Deep_AE_model and change the output size back to 6040x3982 518 | 519 | ```python 520 | #6040x3982 521 | user_items_user_info = np.concatenate((users_items_matrix_train_zero, onehot_df), axis=1) 522 | ``` 523 | 524 | 525 | 526 | ```python 527 | def Deep_AE_model(X, layers, activation, last_activation, dropout, regularizer_encode, regularizer_decode, side_infor_size=0): 528 | ''' 529 | Build Deep AE for CF 530 | INPUT: 531 | X: #_user X #_item matrix 532 | layers: List, each element is the number of neuron for a layer 533 | reg: L2 regularization parameter 534 | activation: activation function for all dense layer except the last 535 | last_activation: activation function for the last dense layer 536 | dropout: dropout rate 537 | regularizer_encode: regularizer for encoder 538 | regularizer_decode: regularizer for decoder 539 | side_infor_size: size of the one hot encoding side information 540 | OUTPUT: 541 | Keras model 542 | ''' 543 | 544 | # Input 545 | input_layer = x = Input(shape=(X.shape[1],), name='UserRating') 546 | 547 | # Encoder 548 | # ----------------------------- 549 | k = int(len(layers)/2) 550 | i = 0 551 | for l in layers[:k]: 552 | x = Dense(l, activation=activation, 553 | name='EncLayer{}'.format(i), 554 | kernel_regularizer=regularizers.l2(regularizer_encode))(x) 555 | i = i+1 556 | 557 | 558 | # Latent Space 559 | # ----------------------------- 560 | x = Dense(layers[k], activation=activation, 561 | name='LatentSpace', 562 | kernel_regularizer=regularizers.l2(regularizer_encode))(x) 563 | 564 | # Dropout 565 | x = Dropout(rate = dropout)(x) 566 | 567 | # Decoder 568 | # ----------------------------- 569 | for l in layers[k+1:]: 570 | i = i-1 571 | x = Dense(l, activation=activation, 572 | name='DecLayer{}'.format(i), 573 | kernel_regularizer=regularizers.l2(regularizer_decode))(x) 574 | 575 | # change the output size 576 | output_layer = Dense(X.shape[1]-side_infor_size, activation=last_activation, 577 | name='UserScorePred', 578 | kernel_regularizer= 579 | regularizers.l2(regularizer_decode))(x) 580 | 581 | # this model maps an input to its reconstruction 582 | model = Model(input_layer, output_layer) 583 | 584 | return model 585 | ``` 586 | 587 | I tested this model on the setting of [256, 512, 256]+zero. Adding the side information does not have a limited impact on the result. The error graph, Val RMSE graph and test RMSE are similar to the model without side information. As the repartition of known entries in the dataset is not uniform, the estimates are biased towards users with a lot of rating. For these users, the dataset already has a lot of information and comparing with 3952 rating features, 30 side information feature will have limited effect. But according to [4], when the users have fewer ratings, the side information will have more effect. 588 | 589 | 590 | 591 | ## 7. Other Experiments 592 | 593 | In papers mentioned above, every user(item) is treated equally to update the weights. I thought the assumption under this is that all the ratings from a user are generated from the same distribution. But different people should have different distributions. We can not have one autoencoder for every user but what if we can have one autoencoder for every group of users. We assume users in each group rate movie similarly. 594 | 595 | Based on this, my first idea is we can generate the userXuser similarity matrix and cluster them into different groups. We train an autoencoder for each group. But due to the time limit of this project, I did a small experiment and leave above as future work. 596 | 597 | drawing 598 | 599 | 600 | 601 | I took a look at the age and gender distribution and selected a group with most people, age_group_2 + gender_group_1. This group has 1538 users and train an autoencoder for this group. The test RMSE was only 0.89. But this result may cause by the limited number of users in the training set, as we have 3952 features but only 1538 samples. 602 | 603 | 604 | 605 | ## 8. Conclusion & Future work 606 | 607 | In this project, I implemented AutoRec and Deep AutoRec using Keras and ran some experiments. Below is the summary of experiments I ran and some findings. 608 | 609 | 1. Keras provides very user-friendly, high-level interfaces and it’s very useful and convenient when our model is standard. But when we want to customize some lower level features, Keras is not convenient as PyTorch. For example, in the proposal, I said I want to experiment on the dense re-feeding module, but if I implemented this experiment in Keras, it’s basically the same as writing in TensorFlow. 610 | 2. When I tried some activations with non-zero negative part and unbounded positive part in the original AutoRec, their performances were not as good as sigmoid+linear and the model became easier to overfit. 611 | 3. I compared different default rating and found that using the average as default rating had similar performance as using 0 but converge much faster than using 0 with more noise. 612 | 613 | 4. Using average as default rating and ELU+ELU in AutoRec gave the best performance and improved the baseline by 4.3% 614 | 5. When the model went deeper, using 0 as default rating converged faster and with less noise. For our dataset, [256, 512, 256] and [512, 256, 512] performed similarly but the former architecture has half parameter as the latter one. 615 | 6. Adding Gaussian Noise to the dataset did not help improve the model. As default rating has an impact on the performance, adding noise is changing the default rating and this may be one potential reason 616 | 7. Adding side information to user-based AutoRec has limited impact. As the repartition of known entries in the dataset is not uniform, the estimates are biased towards users with a lot of rating. For these users, the dataset already has a lot of information and comparing with 3952 rating features, 30 side information feature will have limited effect. 617 | 618 | ###Future Work 619 | 620 | 1. I did not have the chance to test MLflow as I did not have GPU and need to run all the experiment on Google Colab. I will try to use MLflow to manage ML life cycle when I can rum experiment locally. 621 | 2. Dense re-feeding and dropout noise are not fully implemented. 622 | 3. I will try to implement the idea I described in Section7. Generating the userXuser similarity matrix and cluster them into different groups. Then train an autoencoder for each group. 623 | 4. Implement cross validation for more accurate result and better hyper-parameters tuning. 624 | 625 | 626 | 627 | ## Reference 628 | 629 | [1] Sedhain, Suvash, et al. "Autorec: Autoencoders meet collaborative filtering." *Proceedings of the 24th International Conference on World Wide Web*. ACM, 2015 630 | 631 | [2] Kuchaiev, Oleksii, and Boris Ginsburg. "Training deep autoencoders for collaborative filtering." *arXiv preprint arXiv:1708.01715* (2017). 632 | 633 | [3]Wu, Yao, et al. "Collaborative denoising auto-encoders for top-n recommender systems." *Proceedings of the Ninth ACM International Conference on Web Search and Data Mining*. ACM, 2016. 634 | 635 | [4]Strub, Florian, Jérémie Mary, and Romaric Gaudel. "Hybrid collaborative filtering with autoencoders." *arXiv preprint arXiv:1603.00806* (2016). 636 | 637 | -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/Report.pdf -------------------------------------------------------------------------------- /img/512_256_512_001_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_0.png -------------------------------------------------------------------------------- /img/512_256_512_001_0_rmse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_0_rmse.png -------------------------------------------------------------------------------- /img/512_256_512_001_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_loss.png -------------------------------------------------------------------------------- /img/512_256_512_avg_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_avg_001.png -------------------------------------------------------------------------------- /img/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/6.png -------------------------------------------------------------------------------- /img/AutoEncoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/AutoEncoder.png -------------------------------------------------------------------------------- /img/AutoRec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/AutoRec.png -------------------------------------------------------------------------------- /img/Screen Shot 2019-04-09 at 11.15.51 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.15.51 PM.png -------------------------------------------------------------------------------- /img/Screen Shot 2019-04-09 at 11.27.53 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.27.53 PM.png -------------------------------------------------------------------------------- /img/Screen Shot 2019-04-09 at 11.29.47 PM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.29.47 PM.png -------------------------------------------------------------------------------- /img/age_gender.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/age_gender.png -------------------------------------------------------------------------------- /img/average_elu_elu_0002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_0002.png -------------------------------------------------------------------------------- /img/average_elu_elu_1000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_1000.png -------------------------------------------------------------------------------- /img/average_elu_elu_500.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_500.png -------------------------------------------------------------------------------- /img/denoise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/denoise.png -------------------------------------------------------------------------------- /img/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/download.png -------------------------------------------------------------------------------- /img/dropout noise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/dropout noise.png --------------------------------------------------------------------------------