├── .ipynb_checkpoints
    ├── Code_Report-checkpoint.ipynb
    └── Data_Preprocessing-checkpoint.ipynb
├── Data_Preprocessing.ipynb
├── DeepAE_Rec.ipynb
├── README.md
├── Report.md
├── Report.pdf
└── img
    ├── 512_256_512_001_0.png
    ├── 512_256_512_001_0_rmse.png
    ├── 512_256_512_001_loss.png
    ├── 512_256_512_avg_001.png
    ├── 6.png
    ├── AutoEncoder.png
    ├── AutoRec.png
    ├── Screen Shot 2019-04-09 at 11.15.51 PM.png
    ├── Screen Shot 2019-04-09 at 11.27.53 PM.png
    ├── Screen Shot 2019-04-09 at 11.29.47 PM.png
    ├── age_gender.png
    ├── average_elu_elu_0002.png
    ├── average_elu_elu_1000.png
    ├── average_elu_elu_500.png
    ├── denoise.png
    ├── download.png
    └── dropout noise.png


/.ipynb_checkpoints/Code_Report-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/Data_Preprocessing-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Import packages"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import os\n",
 17 |     "import pandas as pd"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Define constants\n"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 4,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "BASE_DIR = 'data' \n",
 34 |     "MOVIELENS_DIR = BASE_DIR + '/ml-1m/'\n",
 35 |     "USER_DATA_FILE = 'users.dat'\n",
 36 |     "MOVIE_DATA_FILE = 'movies.dat'\n",
 37 |     "RATING_DATA_FILE = 'ratings.dat'\n",
 38 |     "AGES = { 1: \"Under 18\", 18: \"18-24\", 25: \"25-34\", 35: \"35-44\", 45: \"45-49\", 50: \"50-55\", 56: \"56+\" }\n",
 39 |     "OCCUPATIONS = { 0: \"other or not specified\", 1: \"academic/educator\", 2: \"artist\", 3: \"clerical/admin\",\n",
 40 |     "                4: \"college/grad student\", 5: \"customer service\", 6: \"doctor/health care\",\n",
 41 |     "                7: \"executive/managerial\", 8: \"farmer\", 9: \"homemaker\", 10: \"K-12 student\", 11: \"lawyer\",\n",
 42 |     "                12: \"programmer\", 13: \"retired\", 14: \"sales/marketing\", 15: \"scientist\", 16: \"self-employed\",\n",
 43 |     "                17: \"technician/engineer\", 18: \"tradesman/craftsman\", 19: \"unemployed\", 20: \"writer\" }\n",
 44 |     "RATINGS_CSV_FILE = 'ml1m_ratings.csv'\n",
 45 |     "USERS_CSV_FILE = 'ml1m_users.csv'\n",
 46 |     "MOVIES_CSV_FILE = 'ml1m_movies.csv'"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "## Download MovieLens 1M data\n",
 54 |     "\n",
 55 |     "The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip."
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 6,
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "name": "stdout",
 65 |      "output_type": "stream",
 66 |      "text": [
 67 |       "1000209 ratings loaded\n",
 68 |       "Saved to ml1m_ratings.csv\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), \n",
 74 |     "                    sep='::', \n",
 75 |     "                    engine='python', \n",
 76 |     "                    encoding='latin-1',\n",
 77 |     "                    names=['userid', 'movieid', 'rating', 'timestamp'])\n",
 78 |     "\n",
 79 |     "max_userid = ratings['userid'].drop_duplicates().max()\n",
 80 |     "max_movieid = ratings['movieid'].drop_duplicates().max()\n",
 81 |     "ratings['user_emb_id'] = ratings['userid'] - 1\n",
 82 |     "ratings['movie_emb_id'] = ratings['movieid'] - 1\n",
 83 |     "print(len(ratings), 'ratings loaded')\n",
 84 |     "ratings.to_csv(RATINGS_CSV_FILE, \n",
 85 |     "               sep='\\t', \n",
 86 |     "               header=True, \n",
 87 |     "               encoding='latin-1', \n",
 88 |     "               columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])\n",
 89 |     "print('Saved to', RATINGS_CSV_FILE)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 7,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "6040 descriptions of 6040 users loaded.\n",
102 |       "Saved to ml1m_users.csv\n"
103 |      ]
104 |     }
105 |    ],
106 |    "source": [
107 |     "users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE), \n",
108 |     "                    sep='::', \n",
109 |     "                    engine='python', \n",
110 |     "                    encoding='latin-1',\n",
111 |     "                    names=['userid', 'gender', 'age', 'occupation', 'zipcode'])\n",
112 |     "users['age_desc'] = users['age'].apply(lambda x: AGES[x])\n",
113 |     "users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])\n",
114 |     "print(len(users), 'descriptions of', max_userid, 'users loaded.')\n",
115 |     "users['user_emb_id'] = users['userid'] - 1\n",
116 |     "users.to_csv(USERS_CSV_FILE, \n",
117 |     "             sep='\\t', \n",
118 |     "             header=True, \n",
119 |     "             encoding='latin-1',\n",
120 |     "             columns=['user_emb_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])\n",
121 |     "print('Saved to', USERS_CSV_FILE)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 9,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "3883 descriptions of 3952 movies loaded.\n",
134 |       "Saved to ml1m_movies.csv\n"
135 |      ]
136 |     }
137 |    ],
138 |    "source": [
139 |     "movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), \n",
140 |     "                    sep='::', \n",
141 |     "                    engine='python', \n",
142 |     "                    encoding='latin-1',\n",
143 |     "                    names=['movieid', 'title', 'genre'])\n",
144 |     "print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')\n",
145 |     "movies['movie_emb_id'] = movies['movieid'] - 1\n",
146 |     "movies.to_csv(MOVIES_CSV_FILE, \n",
147 |     "              sep='\\t', \n",
148 |     "              header=True, \n",
149 |     "              columns=['movie_emb_id', 'title', 'genre'])\n",
150 |     "print('Saved to', MOVIES_CSV_FILE)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 10,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "name": "stdout",
160 |      "output_type": "stream",
161 |      "text": [
162 |       "6040 of the 6040 users rate at least one movie.\n"
163 |      ]
164 |     }
165 |    ],
166 |    "source": [
167 |     "print(len(ratings['userid'].drop_duplicates()), 'of the', max_userid, 'users rate at least one movie.')"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 11,
173 |    "metadata": {},
174 |    "outputs": [
175 |     {
176 |      "name": "stdout",
177 |      "output_type": "stream",
178 |      "text": [
179 |       "3706 of the 3952 movies are rated.\n"
180 |      ]
181 |     }
182 |    ],
183 |    "source": [
184 |     "print(len(ratings['movieid'].drop_duplicates()), 'of the', max_movieid, 'movies are rated.')"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": []
193 |   }
194 |  ],
195 |  "metadata": {
196 |   "kernelspec": {
197 |    "display_name": "Autorec_Keraa3_1516",
198 |    "language": "python",
199 |    "name": "autorec_kera"
200 |   },
201 |   "language_info": {
202 |    "codemirror_mode": {
203 |     "name": "ipython",
204 |     "version": 3
205 |    },
206 |    "file_extension": ".py",
207 |    "mimetype": "text/x-python",
208 |    "name": "python",
209 |    "nbconvert_exporter": "python",
210 |    "pygments_lexer": "ipython3",
211 |    "version": "3.6.8"
212 |   }
213 |  },
214 |  "nbformat": 4,
215 |  "nbformat_minor": 1
216 | }
217 | 


--------------------------------------------------------------------------------
/Data_Preprocessing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Import packages"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import os\n",
 17 |     "import pandas as pd"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "## Define constants\n"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 4,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "BASE_DIR = 'data' \n",
 34 |     "MOVIELENS_DIR = BASE_DIR + '/ml-1m/'\n",
 35 |     "USER_DATA_FILE = 'users.dat'\n",
 36 |     "MOVIE_DATA_FILE = 'movies.dat'\n",
 37 |     "RATING_DATA_FILE = 'ratings.dat'\n",
 38 |     "\n",
 39 |     "#from http://files.grouplens.org/datasets/movielens/ml-1m-README.txt\n",
 40 |     "\n",
 41 |     "AGES = { 1: \"Under 18\", 18: \"18-24\", 25: \"25-34\", 35: \"35-44\", 45: \"45-49\", 50: \"50-55\", 56: \"56+\" }\n",
 42 |     "OCCUPATIONS = { 0: \"other or not specified\", 1: \"academic/educator\", 2: \"artist\", 3: \"clerical/admin\",\n",
 43 |     "                4: \"college/grad student\", 5: \"customer service\", 6: \"doctor/health care\",\n",
 44 |     "                7: \"executive/managerial\", 8: \"farmer\", 9: \"homemaker\", 10: \"K-12 student\", 11: \"lawyer\",\n",
 45 |     "                12: \"programmer\", 13: \"retired\", 14: \"sales/marketing\", 15: \"scientist\", 16: \"self-employed\",\n",
 46 |     "                17: \"technician/engineer\", 18: \"tradesman/craftsman\", 19: \"unemployed\", 20: \"writer\" }\n",
 47 |     "RATINGS_CSV_FILE = 'ml1m_ratings.csv'\n",
 48 |     "USERS_CSV_FILE = 'ml1m_users.csv'\n",
 49 |     "MOVIES_CSV_FILE = 'ml1m_movies.csv'"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "## Download MovieLens 1M data\n",
 57 |     "\n",
 58 |     "The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "1000209 ratings loaded\n",
 71 |       "Saved to ml1m_ratings.csv\n"
 72 |      ]
 73 |     }
 74 |    ],
 75 |    "source": [
 76 |     "ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), \n",
 77 |     "                    sep='::', \n",
 78 |     "                    engine='python', \n",
 79 |     "                    encoding='latin-1',\n",
 80 |     "                    names=['userid', 'movieid', 'rating', 'timestamp'])\n",
 81 |     "\n",
 82 |     "max_userid = ratings['userid'].drop_duplicates().max()\n",
 83 |     "max_movieid = ratings['movieid'].drop_duplicates().max()\n",
 84 |     "ratings['user_emb_id'] = ratings['userid'] - 1\n",
 85 |     "ratings['movie_emb_id'] = ratings['movieid'] - 1\n",
 86 |     "print(len(ratings), 'ratings loaded')\n",
 87 |     "ratings.to_csv(RATINGS_CSV_FILE, \n",
 88 |     "               sep='\\t', \n",
 89 |     "               header=True, \n",
 90 |     "               encoding='latin-1', \n",
 91 |     "               columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])\n",
 92 |     "print('Saved to', RATINGS_CSV_FILE)"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 7,
 98 |    "metadata": {},
 99 |    "outputs": [
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "6040 descriptions of 6040 users loaded.\n",
105 |       "Saved to ml1m_users.csv\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE), \n",
111 |     "                    sep='::', \n",
112 |     "                    engine='python', \n",
113 |     "                    encoding='latin-1',\n",
114 |     "                    names=['userid', 'gender', 'age', 'occupation', 'zipcode'])\n",
115 |     "users['age_desc'] = users['age'].apply(lambda x: AGES[x])\n",
116 |     "users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])\n",
117 |     "print(len(users), 'descriptions of', max_userid, 'users loaded.')\n",
118 |     "users['user_emb_id'] = users['userid'] - 1\n",
119 |     "users.to_csv(USERS_CSV_FILE, \n",
120 |     "             sep='\\t', \n",
121 |     "             header=True, \n",
122 |     "             encoding='latin-1',\n",
123 |     "             columns=['user_emb_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])\n",
124 |     "print('Saved to', USERS_CSV_FILE)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 9,
130 |    "metadata": {},
131 |    "outputs": [
132 |     {
133 |      "name": "stdout",
134 |      "output_type": "stream",
135 |      "text": [
136 |       "3883 descriptions of 3952 movies loaded.\n",
137 |       "Saved to ml1m_movies.csv\n"
138 |      ]
139 |     }
140 |    ],
141 |    "source": [
142 |     "movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), \n",
143 |     "                    sep='::', \n",
144 |     "                    engine='python', \n",
145 |     "                    encoding='latin-1',\n",
146 |     "                    names=['movieid', 'title', 'genre'])\n",
147 |     "print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')\n",
148 |     "movies['movie_emb_id'] = movies['movieid'] - 1\n",
149 |     "movies.to_csv(MOVIES_CSV_FILE, \n",
150 |     "              sep='\\t', \n",
151 |     "              header=True, \n",
152 |     "              columns=['movie_emb_id', 'title', 'genre'])\n",
153 |     "print('Saved to', MOVIES_CSV_FILE)"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 10,
159 |    "metadata": {},
160 |    "outputs": [
161 |     {
162 |      "name": "stdout",
163 |      "output_type": "stream",
164 |      "text": [
165 |       "6040 of the 6040 users rate at least one movie.\n"
166 |      ]
167 |     }
168 |    ],
169 |    "source": [
170 |     "print(len(ratings['userid'].drop_duplicates()), 'of the', max_userid, 'users rate at least one movie.')"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 11,
176 |    "metadata": {},
177 |    "outputs": [
178 |     {
179 |      "name": "stdout",
180 |      "output_type": "stream",
181 |      "text": [
182 |       "3706 of the 3952 movies are rated.\n"
183 |      ]
184 |     }
185 |    ],
186 |    "source": [
187 |     "print(len(ratings['movieid'].drop_duplicates()), 'of the', max_movieid, 'movies are rated.')"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {},
194 |    "outputs": [],
195 |    "source": []
196 |   }
197 |  ],
198 |  "metadata": {
199 |   "kernelspec": {
200 |    "display_name": "Autorec_Keraa3_1516",
201 |    "language": "python",
202 |    "name": "autorec_kera"
203 |   },
204 |   "language_info": {
205 |    "codemirror_mode": {
206 |     "name": "ipython",
207 |     "version": 3
208 |    },
209 |    "file_extension": ".py",
210 |    "mimetype": "text/x-python",
211 |    "name": "python",
212 |    "nbconvert_exporter": "python",
213 |    "pygments_lexer": "ipython3",
214 |    "version": "3.6.8"
215 |   }
216 |  },
217 |  "nbformat": 4,
218 |  "nbformat_minor": 1
219 | }
220 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deep-AutoEncoder-Recommendation
 2 | 
 3 | Autoencoder has been widely adopted into Collaborative Filtering (CF) for recommendation system. A classic CF problem is inferring the missing rating in an MxN matrix R where R(i, j) is the ratings given by the i<sup>th</sup> user to the j<sup>th</sup> item. This project is a Keras implementation of  AutoRec [1] and Deep AutoRec [2] with additional experiments such as the impact of default rating of users or ratings. 
 4 | 
 5 | The Dataset I used for this project is MovieLens 1M Dataset and can be downloaded from [here](<https://grouplens.org/datasets/movielens/1m/>). 
 6 | 
 7 | The preprocessing of the dataset can be found in this [Jupyter Notebook](<https://github.com/RaptorMai/Deep-AutoEncoder-Recommendation/blob/master/Data_Preprocessing.ipynb>)
 8 | 
 9 | The implementation of models in Keras can be found in this [Jupyter Notebook](<https://github.com/RaptorMai/Deep-AutoEncoder-Recommendation/blob/master/DeepAE_Rec.ipynb>)
10 | 
11 | ## Reference
12 | 
13 | [1] Sedhain, Suvash, et al. "Autorec: Autoencoders meet collaborative filtering." *Proceedings of the 24th International Conference on World Wide Web*. ACM, 2015
14 | 
15 | [2] Kuchaiev, Oleksii, and Boris Ginsburg. "Training deep autoencoders for collaborative filtering." *arXiv preprint arXiv:1708.01715* (2017).
16 | 
17 | [3]Wu, Yao, et al. "Collaborative denoising auto-encoders for top-n recommender systems." *Proceedings of the Ninth ACM International Conference on Web Search and Data Mining*. ACM, 2016.
18 | 
19 | [4]Strub, Florian, Jérémie Mary, and Romaric Gaudel. "Hybrid collaborative filtering with autoencoders." *arXiv preprint arXiv:1603.00806* (2016).
20 | 
21 | 
22 | 
23 | ## Github Reference
24 | 
25 | https://github.com/NVIDIA/DeepRecommender
26 | 
27 | <https://github.com/gtshs2/Autorec>
28 | 
29 | <https://github.com/henry0312/CDAE>
30 | 
31 | <https://github.com/cheungdaven/DeepRec>
32 | 


--------------------------------------------------------------------------------
/Report.md:
--------------------------------------------------------------------------------
  1 | # Deep Autoencoder Recommendation 
  2 | 
  3 | ## Table of Contents
  4 | 
  5 | 1. Introduction 
  6 | 2. Data Preprocessing
  7 | 3. AutoRec and Experiments
  8 | 4. Deep AutoRec and Experiments
  9 | 5. Denoising Experiments
 10 | 6. Hybrid Experiments
 11 | 7. Other Experiments
 12 | 8. Conclusion & Future Work
 13 | 9. Reference
 14 | 
 15 | The Jupyter Note book of this project can be found [here](https://github.com/RaptorMai/Deep-AutoEncoder-Recommendation)
 16 | 
 17 | ## 1. Introduction
 18 | 
 19 | Autoencoder has been widely adopted into Collaborative Filtering (CF) for recommendation system. A classic CF problem is inferring the missing rating in an MxN matrix R where R(i, j) is the ratings given by the i<sup>th</sup> user to the j<sup>th</sup> item. This project is a Keras implementation of  AutoRec [1] and Deep AutoRec [2] and additional experiments will be run. 
 20 | 
 21 | The data I used is MovieLens 1M Dataset.
 22 | 
 23 | ## 2. Data Preprocessing
 24 | 
 25 | ####Raw data preprocessing
 26 | 
 27 | The raw data file is separated by ```::``` without headers. This part is transforming the raw data file into a CSV with headers, which can be easily imported using Pandas in the following parts. All the user and movie id will be subtracted by 1 for zero-based index. The snippet shows the preprocessing for rating data and similar preprocessing is applied to users data and movies data.
 28 | 
 29 | ```python
 30 | ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE), 
 31 |                     sep='::', 
 32 |                     engine='python', 
 33 |                     encoding='latin-1',
 34 |                     names=['userid', 'movieid', 'rating', 'timestamp'])
 35 | 
 36 | ratings['user_emb_id'] = ratings['userid'] - 1
 37 | ratings['movie_emb_id'] = ratings['movieid'] - 1
 38 | print(len(ratings), 'ratings loaded')
 39 | ratings.to_csv(RATINGS_CSV_FILE, 
 40 |                sep='\t', 
 41 |                header=True, 
 42 |                encoding='latin-1', 
 43 |                columns=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])
 44 | print('Saved to', RATINGS_CSV_FILE)
 45 | ```
 46 | 
 47 | 
 48 | 
 49 | #### Train, Validation and Test Split
 50 | 
 51 | ```python
 52 | df = pd.read_csv('ml1m_ratings.csv',sep='\t', encoding='latin-1', 
 53 |                       usecols=['user_emb_id', 'movie_emb_id', 'rating', 'timestamp'])
 54 | num_users = df['user_emb_id'].unique().max() + 1
 55 | num_movies = df['movie_emb_id'].unique().max() + 1
 56 | 
 57 | train_df, test_df = train_test_split(df,
 58 |                                      stratify=df['user_emb_id'],
 59 |                                      test_size=0.1,
 60 |                                      random_state=999613182)   
 61 | train_df, validate_df = train_test_split(train_df,
 62 |                                  stratify=train_df['user_emb_id'],
 63 |                                  test_size=0.1,
 64 |                                  random_state=999613182)  
 65 | ```
 66 | 
 67 | I split the data into random 90%–10% train-test sets, and hold out 10% of the training set for validation.
 68 | 
 69 | I used a fixed random_state = 999613182 for reproduction.  When I use ```train_test_split``` from ```sklearn``` , I use **Stratify with user_id** . This setting is critical, without this setting,  it's possible that reviews of one user are all split into one the training or test set and cause bias. For example if all the reviews of user A are put into the training set, then during test time, there is no test data for this user. The test RMSE will be 0 for this user. On the other hand, if all reviews are put into test set, then there is no review for this user during training time and cause the RMSE higher for this user.  
 70 | 
 71 | 
 72 | 
 73 | #### Transform DataFrame to matrix
 74 | 
 75 | ![Screen Shot 2019-04-09 at 11.15.51 PM](/Users/zhedamai/Desktop/MIE1516/project/Deep-AutoEncoder-Recommendation/img/Screen Shot 2019-04-09 at 11.15.51 PM.png)
 76 | 
 77 | ​							Figure 1: Dataframe containing ratings data
 78 | 
 79 | ```python
 80 | def dataPreprocessor(rating_df, num_users, num_items, init_value=0, average=False):
 81 |     """
 82 |         INPUT: 
 83 |             data: pandas DataFrame. columns=['index', 'userID', 'itemID', 'rating' ...]
 84 |             num_row: int. number of users
 85 |             num_col: int. number of items
 86 |             
 87 |         OUTPUT:
 88 |             matrix: 2D numpy array. 
 89 |     """
 90 |     if average:
 91 |       matrix = np.full((num_users, num_items), 0.0)
 92 |       for (_, userID, itemID, rating, timestamp) in rating_df.itertuples():
 93 |         matrix[userID, itemID] = rating
 94 |       avergae = np.true_divide(matrix.sum(1), np.maximum((matrix!=0).sum(1), 1))
 95 |       inds = np.where(matrix == 0)
 96 |       matrix[inds] = np.take(avergae, inds[0])
 97 |       
 98 |     else:
 99 |       matrix = np.full((num_users, num_items), init_value)
100 |       for (_, userID, itemID, rating, timestamp) in rating_df.itertuples():
101 |         matrix[userID, itemID] = rating
102 | 
103 |     return matrix
104 | ```
105 | 
106 | In order to apply AutoRec on the dataset, the dataset should be transformed to a MxN matrix where R(i, j) is the ratings given by the i<sup>th</sup> user to the j<sup>th</sup> item. 
107 | 
108 | The function ```dataPreprocessor``` is used for this transformation. The init_value is the default rating for unobserved ratings. If ```average ``` is set to ```True```, the unobvserved rating will be set as the average rating of the user.
109 | 
110 | 
111 | 
112 | ## 3. AutoRec and Experiment
113 | 
114 | ### AutoRec
115 | 
116 | <img src="/Users/zhedamai/Desktop/MIE1516/project/Deep-AutoEncoder-Recommendation/img/Screen Shot 2019-04-09 at 11.27.53 PM.png" alt="drawing" width="300"/>
117 | 
118 | ​			Figure2: Item-based AutoRec Model shown in the the AutoRec Paper[1]
119 | 
120 | 
121 | 
122 | The model I am going to implement is a user-based AutoRec, which take the partially observed ratings vector of a user, project it into a low dimensional latent space and then reconstruct back to the output space to predict the missing rating. 
123 | 
124 | 
125 | 
126 | #### Loss Function
127 | 
128 | Since it does not make sense to predict zero in the user rating vector, I follow the AutoRec paper to minimize the Masked Mean Squared Error(MMSE). 
129 | 
130 | <img src="/Users/zhedamai/Desktop/MIE1516/project/Deep-AutoEncoder-Recommendation/img/Screen Shot 2019-04-09 at 11.29.47 PM.png" alt="drawing" width="200"/>
131 | 
132 | where r<sub>i</sub> is the actual rating and y<sub>i</sub> is the reconstructed rating. m<sub>i</sub> is a mask function where m<sub>i</sub> =1 where  r<sub>i</sub> is non-zero else m<sub>i</sub>=0. 
133 | 
134 | Since Masked Mean Squared Error is not provided in Keras, so I need to customize the error function.
135 | 
136 | ```python
137 | def masked_mse(y_true, y_pred):
138 |   			# masked function
139 |         mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
140 |       	# masked squared error
141 |         masked_squared_error = K.square(mask_true * (y_true - y_pred))
142 | 				# average the error
143 |         masked_mse = K.sum(masked_squared_error, axis=-1) / K.maximum(K.sum(mask_true, 
144 |                                                                             axis=-1), 1)
145 |         return masked_mse
146 | ```
147 | 
148 | 
149 | 
150 | #### Metric
151 | 
152 | The performance of the model is measured by the Masked Root Mean Squared Error (MRMSE). Similar to MMSE, we only take into consideration the error where the rating is not zero in the test set. Also, I clip the predicted rating with 1 as minimum and 5 as maximum. 
153 | 
154 | ```python
155 | def masked_rmse_clip(y_true, y_pred):
156 |   			# masked function
157 |         mask_true = K.cast(K.not_equal(y_true, 0), K.floatx())
158 |       	# clipping the rating
159 |         y_pred = K.clip(y_pred, 1, 5)
160 |         # masked squared error
161 |         masked_squared_error = K.square(mask_true * (y_true - y_pred))
162 |         # square root
163 |         masked_mse = K.sqrt(K.sum(masked_squared_error, axis=-1) / 
164 |                             K.maximum(K.sum(mask_true, axis=-1), 1))
165 |         return masked_mse
166 | ```
167 | 
168 | 
169 | 
170 | #### Model
171 | 
172 | ```python
173 | def AutoRec(X, reg, first_activation, last_activation):
174 |     '''
175 |     AutoRec
176 |     '''
177 |     input_layer = x = Input(shape=(X.shape[1],), name='UserRating')
178 |     
179 |     x = Dense(500, activation=first_activation, name='LatentSpace', 
180 |               kernel_regularizer=regularizers.l2(reg))(x)
181 |     
182 |     output_layer = Dense(X.shape[1], activation=last_activation, name='UserScorePred', 
183 |                          kernel_regularizer=regularizers.l2(reg))(x)
184 |     
185 |     model = Model(input_layer, output_layer)
186 | 
187 |     return model
188 |   
189 | _________________________________________________________________
190 | Layer (type)                 Output Shape              Param #   
191 | =================================================================
192 | UserRating (InputLayer)      (None, 3952)              0         
193 | _________________________________________________________________
194 | LatentSpace (Dense)          (None, 500)               1976500   
195 | _________________________________________________________________
196 | UserScorePred (Dense)        (None, 3952)              1979952   
197 | =================================================================
198 | Total params: 3,956,452
199 | Trainable params: 3,956,452
200 | Non-trainable params: 0
201 | _________________________________________________________________
202 | ```
203 | 
204 | 
205 | 
206 | ​			
207 | 
208 | #### Baseline Settings
209 | 
210 | ```python
211 | # Build model
212 | AutoRec = AutoRec(users_items_matrix_train_zero)
213 | AutoRec.compile(optimizer = Adam(lr=0.0001), loss=masked_mse, metrics=[masked_rmse_clip])
214 | hist_Autorec = AutoRec.fit(x=users_items_matrix_train_zero, y=users_items_matrix_train_zero,
215 |                   epochs=500,
216 |                   batch_size=256,
217 |                   verbose = 2, 
218 |                   validation_data=[users_items_matrix_train_zero,
219 |                                    users_items_matrix_validate])
220 | ```
221 | 
222 | 
223 | 
224 | | L2 Regularization | Optimizer | Learning Rate | Epochs | Batch Size | Activations      | Default Rating |
225 | | :---------------: | :-------: | :-----------: | ------ | :--------: | ---------------- | :------------: |
226 | |       0.001       |   Adam    |    0.0001     | 500    |    256     | Sigmoid + Linear |       0        |
227 | 
228 | ​							Table1: Baseline settings of AutoRec
229 | 
230 | ###Experiments
231 | 
232 | #### Activations
233 | 
234 | In the AutoRec paper, it only tested sigmoid and linear. Since the Deep_AE_CF paper [2] found that
235 | activations with **non-zero negative part** and **unbounded positive part** perform better for deep autoencoders for CF. So I tried ELU, SELU and LRELU on AutoRec.
236 | 
237 | |  Activation   |                    **Modification**                     | **Test RMSE** | **Train RMSE** |
238 | | :-----------: | :-----------------------------------------------------: | :-----------: | :------------: |
239 | |   Baseline    |                Keep L2 $$\lambda$$ 0.001                |  ***0.916***  |  ***0.914***   |
240 | |   ELU, ELU    | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |     0.931     |     0.944      |
241 | |  ELU, Linear  | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |    0.9338     |     0.943      |
242 | |  SELU, SELU   | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |    0.9431     |     0.9543     |
243 | | SELU, Linear  | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |     0.957     |     0.964      |
244 | | LRelu, LRelu  | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |    0.9336     |     0.945      |
245 | | LRelu, Linear | change L2 $$\lambda$$ from 0.001 to 0.01 due to overfit |    0.9386     |     0.9486     |
246 | | Sigmoid, ELU  |                Keep L2 $$\lambda$$ 0.001                | ***0.9137***  |  ***0.9156***  |
247 | |   Tanh, ELU   |                Keep L2 $$\lambda$$ 0.001                |    0.9454     |     0.963      |
248 | 
249 | ​						Table2: Comparing Activation Functions on AutoRec
250 | 
251 | All the hyper-parameters including L2 $$\lambda$$ , learning rate and epochs are not fine-tuned, so the result is  not as good as the AutoRec paper. But we can find out activations that perform well in Deep Autoencoder CF do not outperform the Sigmoid+Linear baseline. Also, we found that when changing the activation from Sigmod to other activations with unbounded positive part, the model is easier to overfit. 
252 | 
253 | 
254 | 
255 | #### Default Rating
256 | 
257 | In the Deep_AE_CF paper, the default rating is 0 while in the AutoRec paper, the default rating is 3. So, I decided to tried different default ratings.
258 | 
259 | | Default Rating | Modification | Test RMSE | Train RMSE |
260 | | :------------: | :----------: | :-------: | :--------: |
261 | |  0 (Baseline)  |     N/A      | **0.916** |   0.914    |
262 | |       1        |     N/A      |   0.926   |   0.932    |
263 | |       2        |     N/A      |   0.933   |   0.944    |
264 | |       3        |     N/A      |   0.945   |    0.96    |
265 | |       4        |     N/A      |  0.9445   |   0.960    |
266 | |       5        |     N/A      |   0.943   |   0.958    |
267 | |    Average     |     N/A      | **0.912** |   0.923    |
268 | 
269 | ​						Table3: Comparing default ratings for unobserved rating
270 | 
271 | 
272 | 
273 | <img src="img/download.png" alt="drawing" width="400"/>
274 | 
275 | ​						                Figure3: Average Vs 0 as default rating
276 | 
277 | As we can see, average and 0 outperform other default ratings. When we comparing the Val_RMSE of 0 and average, we found that **using the average as default rating converge much faster than using 0 but with more noise.** 
278 | 
279 | 
280 | 
281 | #### Unexpected finding
282 | 
283 | Before I moved to the next experiment, I tried ELU + ELU as activations with the avergae as default rating, it turns out to work pretty well with the default L2 $$\lambda$$ 0.001. For Autorect, it seems this is the best model we have so far. The gap between validation loss and training loss is pretty small and increasing the regularization did not help, so I think we should move deeper to increase the model complexity.
284 | 
285 | | Default Rating | Modification | Test RMSE | Train RMSE |
286 | | :------------: | :----------: | :-------: | :--------: |
287 | |  0 (Baseline)  |              |   0.916   |   0.914    |
288 | |    Average     |   ELU+ELU    | **0.877** |   0.806    |
289 | |    Average     |  SELU+SELU   | **0.878** |   0.748    |
290 | 
291 | ​			Table4: Use Avergae as default rating works good using ELU+ELU on AutoRec
292 | 
293 | 
294 | 
295 | <p>
296 |   <img src="img/average_elu_elu_1000.png" width="300" />
297 |   <img src="average_elu_elu_0002.png" width="300" /> 
298 | </p>
299 | 
300 | 
301 | 
302 | ​					Figure4: Training Vs Validation Masked RMSE and Loss (Average, ELU+ELU)
303 | 
304 | <img src="img/6.png" alt="drawing" width="300"/>
305 | 
306 | ​						Figure5: Test RMSE as the regularization parameters varies
307 | 
308 | 
309 | 
310 | ## 4. Deep AutoRec and Experiments
311 | 
312 | ### Deep Autoencoder Collaborative Filtering
313 | 
314 | 
315 | 
316 | <img src="img/AutoEncoder.png" alt="drawing" width="400"/>
317 | 
318 | This paper continued the AutoRec idea to deepen Autoencoder. It uses MMSE as loss function, same as AutoRec. It proposes activation functions with non-zero negative part and unbounded positive part works better. Also, it uses dropout layers after the latent layer to avoid overfitting. It also shows that using large dropout rate after the latent layer allows it to learn robust representations. 
319 | 
320 | 
321 | 
322 | ```python
323 |   def Deep_AE_model(X, layers, activation, last_activation, dropout, regularizer_encode, regularizer_decode):
324 |     '''
325 |         Build Deep AE for CF
326 |         INPUT: 
327 |             X: #_user X #_item matrix
328 |             layers: List, each element is the number of neuron for a layer
329 |             reg: L2 regularization parameter
330 |             activation: activation function for all dense layer except the last
331 |             last_activation: activation function for the last dense layer
332 |             dropout: dropout rate
333 |             regularizer_encode: regularizer for encoder
334 |             regularizer_decode: regularizer for decoder
335 |         OUTPUT:
336 |             Keras model
337 |     '''
338 | 
339 |     # Input
340 |     input_layer = x = Input(shape=(X.shape[1],), name='UserRating')
341 |     
342 |     # Encoder
343 |     # -----------------------------
344 |     k = int(len(layers)/2)
345 |     i = 0
346 |     for l in layers[:k]:
347 |       x = Dense(l, activation=activation,     
348 |                 name='EncLayer{}'.format(i), 
349 |                 kernel_regularizer=regularizers.l2(regularizer_encode))(x)
350 |       i = i+1
351 |       
352 |       
353 |     # Latent Space
354 |     # -----------------------------
355 |     x = Dense(layers[k], activation=activation, 
356 |               name='LatentSpace', kernel_regularizer=regularizers.l2(regularizer_encode))(x)
357 |     
358 |     # Dropout
359 |     x = Dropout(rate = dropout)(x)
360 |     
361 |     # Decoder
362 |     # -----------------------------
363 |     for l in layers[k+1:]:
364 |       i = i-1
365 |       x = Dense(l, activation=activation, 
366 |                 name='DecLayer{}'.format(i),
367 |                 kernel_regularizer=regularizers.l2(regularizer_decode))(x)
368 |     
369 |     # Output
370 |     output_layer = Dense(X.shape[1], activation=last_activation, name='UserScorePred', 
371 |                          kernel_regularizer=regularizers.l2(regularizer_decode))(x)
372 | 
373 |     # this model maps an input to its reconstruction
374 |     model = Model(input_layer, output_layer)
375 | 
376 |     return model
377 |   
378 | Example Summary
379 | _________________________________________________________________
380 | Layer (type)                 Output Shape              Param #   
381 | =================================================================
382 | UserRating (InputLayer)      (None, 3952)              0         
383 | _________________________________________________________________
384 | EncLayer0 (Dense)            (None, 128)               505984    
385 | _________________________________________________________________
386 | EncLayer1 (Dense)            (None, 256)               33024     
387 | _________________________________________________________________
388 | LatentSpace (Dense)          (None, 512)               131584    
389 | _________________________________________________________________
390 | dropout_1 (Dropout)          (None, 512)               0         
391 | _________________________________________________________________
392 | DecLayer1 (Dense)            (None, 256)               131328    
393 | _________________________________________________________________
394 | DecLayer0 (Dense)            (None, 128)               32896     
395 | _________________________________________________________________
396 | UserScorePred (Dense)        (None, 3952)              509808    
397 | =================================================================
398 | Total params: 1,344,624
399 | Trainable params: 1,344,624
400 | Non-trainable params: 0
401 | ```
402 | 
403 | Baseline setting
404 | 
405 | | L2 Regularization | Optimizer | Learning Rate | Epochs | Batch Size | Activations | Default Rating | Dropout |
406 | | :---------------: | :-------: | :-----------: | ------ | :--------: | ----------- | :------------: | :-----: |
407 | |       0.001       |   Adam    |    0.0001     | 500    |    256     | SELU+SELU   |       0        |   0.8   |
408 | 
409 | 
410 | 
411 | ####Architecture
412 | 
413 | In the paper, it proposes different architectures for different Netflix datasets. But I found that all architectures have the largest layer in the middle and smaller layers at the begining and the end. I called this structure Small-Big-Small(SBS). But conventional autoencoders have the Big-Small-Big (BSB) structure. I decided to try  both structures on our dataset. 
414 | 
415 | |       Architecture        | Shape   (Depth) | Number of parameters | Default   Rating |          Modification          | Test RMSE | Train RMSE |
416 | | :-----------------------: | :-------------: | :------------------: | :--------------: | :----------------------------: | :-------: | :--------: |
417 | |      [512, 256, 512]      |     BSB (3)     |         4.3M         |     Avergae      |                                |  0.8792   |   0.840    |
418 | |      [512, 256, 512]      |     BSB(3)      |         4.3M         |       Zero       |                                | **0.856** |   0.764    |
419 | | [512, 256, 128, 256, 512] |     BSB(5)      |         4.4M         |     Average      |                                |   0.895   |    0.87    |
420 | | [512, 256, 128, 256, 512] |     BSB(5)      |         4.4M         |       Zero       | L2 $$\lambda$$ 0.001 to 0.0015 |   0.869   |   0.827    |
421 | |      [256, 512, 256]      |     SBS(3)      |         2.3M         |     Avergage     |                                |   0.878   |    0.85    |
422 | |      [256, 512, 256]      |     SBS(3)      |         2.3M         |       Zero       |                                | **0.857** |   0.760    |
423 | | [128, 256, 512, 256, 128] |     SBS(5)      |         1.3M         |     Average      |                                |   0.881   |    0.87    |
424 | | [128, 256, 512, 256, 128] |     SBS(5)      |         1.3M         |       Zero       | L2 $$\lambda$$ 0.001 to 0.0015 |   0.868   |    0.84    |
425 | 
426 | ​								Table5: Comparison of differnt acrchitetures
427 | 
428 | 
429 | 
430 | ####Comparison between average Vs zero as default rating
431 | 
432 | I took [512, 256, 512] as an example. Other architectures have similar phenomena.
433 | 
434 | <p>   <img src="img/512_256_512_001_loss.png" width="300" />   
435 |   <img src="img/512_256_512_avg_001.png" width="300" />  </p>
436 | 
437 | ​						Figure6: [512, 256, 512] with average as default rating
438 | 
439 | <p>   <img src="img/512_256_512_001_0.png" width="300" />   
440 |   <img src="img/512_256_512_001_0_rmse.png" width="300" />  </p>
441 | 
442 | ​					     Figure7: [512, 256, 512] with zero as default rating
443 | 
444 | When we compared the average and zero as default rating in AutoRec, we found that average converged faster but with noise. But when the model goes deeper, **the zero default rating converged faster and with less noise.** However, when we take a look at the loss, the gap between training and validation is larger in zero default setting. This means when we use zero as default rating, the model is easier to overfit. 
445 | 
446 | Also, as we can see in table5, adding more layers does not help for both BSB and SBS shape. As we go deeper, it’s easier to get overfitted and increasing the regularization parameters will bring the test performance down. So, in our project, using three hidden layers is the best option. Moreover, [512, 256, 512] and [256, 512, 256] have similar performance but [256, 512, 256] has half the number of parameters. So I will use [256, 512, 256] in further experiments, as fewer parameters not only allows us to train model with less data but also can mitigate overfitting. 
447 | 
448 | ## 5. Denoising and Hybrid Experiment
449 | 
450 | Common corruption choices are the additive Gaussian noise and multiplicative dropout noise. In the Denoising paper[3], it only used multiplicative dropout noise and I am going to test both.
451 | 
452 | #### Gaussian Noise
453 | 
454 | Since [256, 512, 256]+zero has the best performance, we test the denoising on this setting. 
455 | 
456 | ```python
457 | noise_factor = 0.1
458 | users_items_matrix_train_average_noisy = users_items_matrix_train_average + noise_factor * np.random.normal(size=users_items_matrix_train_zero.shape) 
459 | ```
460 | 
461 | <img src="img/denoise.png" alt="drawing" width="300"/>
462 | 
463 | ​							Figure8: Test RMSE on different Gaussian Noise constant
464 | 
465 | According to Figure8, adding Gaussian Noise did not improve the model. As default rating has an impact on the performance, adding noise is changing the default rating and this may be one potential reason. Deep AutoRec has the similar graph as AutoRec
466 | 
467 | 
468 | 
469 | ####Dropout Noise
470 | 
471 | In the denoising paper[3], it masked out non-zero elements randomly in each batch and use the masked input. However, using Keras to implement this feature will be the same as using pure TensorFlow. Due to the time limit of this project, I will leave this as future work and I made a compromise by adding a dropout layer between input and first dense layer. This dropout will mask out all elements randomly with a dropout rate.  As we can see in Figure8, when the dropout rate increase for the noise, the RMSE started increasing. When the rate was 0.1, the performance actually was better than the baseline but since it’s only 0.002 difference, it may still be in the range of error. It needs cross-validation for further verification.
472 | 
473 | ```python
474 |     # Input
475 |     input_layer = x = Input(shape=(X.shape[1],), name='UserRating')
476 |     
477 |     # Dropout Noise
478 |     x = Dropout(rate = noise)(x)
479 |     
480 |     # Encoder
481 |     # -----------------------------
482 |     k = int(len(layers)/2)
483 |     i = 0
484 |     for l in layers[:k]:
485 |       x = Dense(l, activation=activation,
486 |             		name='EncLayer{}'.format(i),
487 |                 kernel_regularizer=regularizers.l2(regularizer_encode))(x)
488 |       i = i+1
489 | ```
490 | 
491 | <img src="img/dropout noise.png" alt="drawing" width="300"/>
492 | 
493 | ​							Figure8: Test RMSE on different Dropout Noise
494 | 
495 | ## 6. Hybrid Experiments
496 | 
497 | Since we have the information about each user, I want to try adding the side-information in this model.
498 | 
499 | For each user, we have gender, age and occupation and after transforming to one hot encoding format, each user has totally 30 features.
500 | 
501 | ```python
502 | user_df = pd.read_csv('ml1m_users.csv',sep='\t', encoding='latin-1', 
503 |                       usecols=['user_emb_id', 'gender', 'age', 'occupation'])
504 | user_df['age'] = preprocessing.LabelEncoder().
505 | 								 fit(user_df['age']).transform(user_df['age'])
506 | 
507 | user_df['gender']=preprocessing.LabelEncoder().fit(user_df['gender']).
508 | 									transform(user_df['gender'])
509 | 
510 | onehot_df = preprocessing.OneHotEncoder(handle_unknown='ignore',sparse=False).
511 | 						fit(user_df[['gender', 'age','occupation']]).
512 | 						transform(user_df[['gender', 'age', 'occupation']])
513 | ```
514 | 
515 | ### Concatenate side-information to rating
516 | 
517 | For this method, I concatenated the side information to the rating matrix, so the shape of the matrix will be changed from 6040x3952 to 6040x3982. We still want to reconstruct only the rating matrix, so the output shape is 6040x3952. The only change in the code is I add a new argument called side_infor_size in Deep_AE_model and change the output size back to 6040x3982
518 | 
519 | ```python
520 | #6040x3982
521 | user_items_user_info = np.concatenate((users_items_matrix_train_zero, onehot_df), axis=1)
522 | ```
523 | 
524 | 
525 | 
526 | ```python
527 |   def Deep_AE_model(X, layers, activation, last_activation, dropout, regularizer_encode, regularizer_decode, side_infor_size=0):
528 |     '''
529 |         Build Deep AE for CF
530 |         INPUT: 
531 |             X: #_user X #_item matrix
532 |             layers: List, each element is the number of neuron for a layer
533 |             reg: L2 regularization parameter
534 |             activation: activation function for all dense layer except the last
535 |             last_activation: activation function for the last dense layer
536 |             dropout: dropout rate
537 |             regularizer_encode: regularizer for encoder
538 |             regularizer_decode: regularizer for decoder
539 |             side_infor_size: size of the one hot encoding side information
540 |         OUTPUT:
541 |             Keras model
542 |     '''
543 | 
544 |     # Input
545 |     input_layer = x = Input(shape=(X.shape[1],), name='UserRating')
546 |     
547 |     # Encoder
548 |     # -----------------------------
549 |     k = int(len(layers)/2)
550 |     i = 0
551 |     for l in layers[:k]:
552 |       x = Dense(l, activation=activation,
553 |                 name='EncLayer{}'.format(i), 							
554 |                 kernel_regularizer=regularizers.l2(regularizer_encode))(x)
555 |       i = i+1
556 |       
557 |       
558 |     # Latent Space
559 |     # -----------------------------
560 |     x = Dense(layers[k], activation=activation, 
561 |               name='LatentSpace', 
562 |               kernel_regularizer=regularizers.l2(regularizer_encode))(x)
563 |     
564 |     # Dropout
565 |     x = Dropout(rate = dropout)(x)
566 |     
567 |     # Decoder
568 |     # -----------------------------
569 |     for l in layers[k+1:]:
570 |       i = i-1
571 |       x = Dense(l, activation=activation, 
572 |                 name='DecLayer{}'.format(i), 
573 |                 kernel_regularizer=regularizers.l2(regularizer_decode))(x)
574 |       
575 |     # change the output size
576 |     output_layer = Dense(X.shape[1]-side_infor_size, activation=last_activation, 
577 |                          name='UserScorePred', 
578 |                          kernel_regularizer=
579 |                          regularizers.l2(regularizer_decode))(x)
580 | 
581 |     # this model maps an input to its reconstruction
582 |     model = Model(input_layer, output_layer)
583 | 
584 |     return model
585 | ```
586 | 
587 | I tested this model on the setting of [256, 512, 256]+zero. Adding the side information does not have a limited impact on the result. The error graph, Val RMSE graph and test RMSE are similar to the model without side information. As the repartition of known entries in the dataset is not uniform, the estimates are biased towards users with a lot of rating. For these users, the dataset already has a lot of information and comparing with 3952 rating features, 30 side information feature will have limited effect. But according to [4], when the users have fewer ratings, the side information will have more effect. 
588 | 
589 | 
590 | 
591 | ## 7. Other Experiments
592 | 
593 | In papers mentioned above, every user(item) is treated equally to update the weights. I thought the assumption under this is that all the ratings from a user are generated from the same distribution. But different people should have different distributions. We can not have one autoencoder for every user but what if we can have one autoencoder for every group of users. We assume users in each group rate movie similarly. 
594 | 
595 | Based on this, my first idea is we can generate the userXuser similarity matrix and cluster them into different groups. We train an autoencoder for each group. But due to the time limit of this project, I did a small experiment and leave above as future work.
596 | 
597 | <img src="img/age_gender.png" alt="drawing" width="300"/>
598 | 
599 | 
600 | 
601 | I took a look at the age and gender distribution and selected a group with most people, age_group_2 + gender_group_1. This group has 1538 users and train an autoencoder for this group. The test RMSE was only 0.89. But this result may cause by the limited number of users in the training set, as we have 3952 features but only 1538 samples.  
602 | 
603 | 
604 | 
605 | ## 8. Conclusion & Future work
606 | 
607 | In this project, I implemented AutoRec and Deep AutoRec using Keras and ran some experiments. Below is the summary of experiments I ran and some findings.
608 | 
609 | 1. Keras provides very user-friendly,  high-level interfaces and it’s very useful and convenient when our model is standard. But when we want to customize some lower level features, Keras is not convenient as PyTorch. For example, in the proposal, I said I want to experiment on the dense re-feeding module, but if I implemented this experiment in Keras, it’s basically the same as writing in TensorFlow. 
610 | 2. When I tried some activations with non-zero negative part and unbounded positive part in the original AutoRec, their performances were not as good as sigmoid+linear and the model became easier to overfit.
611 | 3. I compared different default rating and found that using the average as default rating had similar performance as using 0 but converge much faster than using 0  with more noise. 
612 | 
613 | 4. Using average as default rating and ELU+ELU in AutoRec gave the best performance and improved the baseline by 4.3%
614 | 5. When the model went deeper, using 0 as default rating converged faster and with less noise.  For our dataset, [256, 512, 256] and [512, 256, 512] performed similarly but the former architecture has half parameter as the latter one. 
615 | 6. Adding Gaussian Noise to the dataset did not help improve the model.  As default rating has an impact on the performance, adding noise is changing the default rating and this may be one potential reason
616 | 7. Adding side information to user-based AutoRec has limited impact. As the repartition of known entries in the dataset is not uniform, the estimates are biased towards users with a lot of rating. For these users, the dataset already has a lot of information and comparing with 3952 rating features, 30 side information feature will have limited effect.
617 | 
618 | ###Future Work
619 | 
620 | 1. I did not have the chance to test MLflow as I did not have GPU and need to run all the experiment on Google Colab. I will try to use MLflow to manage ML life cycle when I can rum experiment locally.
621 | 2. Dense re-feeding and dropout noise are not fully implemented. 
622 | 3. I will try to implement the idea I described in Section7. Generating the userXuser similarity matrix and cluster them into different groups. Then train an autoencoder for each group. 
623 | 4. Implement cross validation for more accurate result and better hyper-parameters tuning.
624 | 
625 | 
626 | 
627 | ## Reference
628 | 
629 | [1] Sedhain, Suvash, et al. "Autorec: Autoencoders meet collaborative filtering." *Proceedings of the 24th International Conference on World Wide Web*. ACM, 2015
630 | 
631 | [2] Kuchaiev, Oleksii, and Boris Ginsburg. "Training deep autoencoders for collaborative filtering." *arXiv preprint arXiv:1708.01715* (2017).
632 | 
633 | [3]Wu, Yao, et al. "Collaborative denoising auto-encoders for top-n recommender systems." *Proceedings of the Ninth ACM International Conference on Web Search and Data Mining*. ACM, 2016.
634 | 
635 | [4]Strub, Florian, Jérémie Mary, and Romaric Gaudel. "Hybrid collaborative filtering with autoencoders." *arXiv preprint arXiv:1603.00806* (2016).
636 | 
637 | 


--------------------------------------------------------------------------------
/Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/Report.pdf


--------------------------------------------------------------------------------
/img/512_256_512_001_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_0.png


--------------------------------------------------------------------------------
/img/512_256_512_001_0_rmse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_0_rmse.png


--------------------------------------------------------------------------------
/img/512_256_512_001_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_001_loss.png


--------------------------------------------------------------------------------
/img/512_256_512_avg_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/512_256_512_avg_001.png


--------------------------------------------------------------------------------
/img/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/6.png


--------------------------------------------------------------------------------
/img/AutoEncoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/AutoEncoder.png


--------------------------------------------------------------------------------
/img/AutoRec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/AutoRec.png


--------------------------------------------------------------------------------
/img/Screen Shot 2019-04-09 at 11.15.51 PM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.15.51 PM.png


--------------------------------------------------------------------------------
/img/Screen Shot 2019-04-09 at 11.27.53 PM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.27.53 PM.png


--------------------------------------------------------------------------------
/img/Screen Shot 2019-04-09 at 11.29.47 PM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/Screen Shot 2019-04-09 at 11.29.47 PM.png


--------------------------------------------------------------------------------
/img/age_gender.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/age_gender.png


--------------------------------------------------------------------------------
/img/average_elu_elu_0002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_0002.png


--------------------------------------------------------------------------------
/img/average_elu_elu_1000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_1000.png


--------------------------------------------------------------------------------
/img/average_elu_elu_500.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/average_elu_elu_500.png


--------------------------------------------------------------------------------
/img/denoise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/denoise.png


--------------------------------------------------------------------------------
/img/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/download.png


--------------------------------------------------------------------------------
/img/dropout noise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RaptorMai/Deep-AutoEncoder-Recommendation/dbffc89f7c2cee8b071593f7d60ff63867ba82ad/img/dropout noise.png


--------------------------------------------------------------------------------