├── PrepareData
│   ├── __init__.py
│   ├── readme.txt
│   └── Movies1MDataset.py
├── RecommenderSystem
│   ├── __init__.py
│   ├── readme.txt
│   ├── Consts.py
│   ├── Nets.py
│   └── MIDS.py
├── docs
│   ├── readme.txt
│   ├── readme_en.html
│   ├── readme_en.pdf
│   ├── readme_ru.html
│   ├── readme_ru.pdf
│   └── autoencoders.png
├── data
│   └── readme.txt
├── tests
│   └── readme.txt
├── ml-1m
│   └── readme.txt
├── readme.txt
├── Rscripts
│   ├── learning_lines.R
│   ├── readme.txt
│   ├── prepare_data.R
│   └── show_ids.R
├── LICENSE
└── .gitignore

--------------------------------------------------------------------------------
/PrepareData/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/RecommenderSystem/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/readme.txt
--------------------------------------------------------------------------------
/data/readme.txt:
--------------------------------------------------------------------------------
1 | See https://github.com/freegraphics/MIDSdata/tree/master/data for the data.
--------------------------------------------------------------------------------
/docs/readme_en.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/readme_en.html
--------------------------------------------------------------------------------
/docs/readme_en.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/readme_en.pdf
--------------------------------------------------------------------------------
/docs/readme_ru.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/readme_ru.html
--------------------------------------------------------------------------------
/docs/readme_ru.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/readme_ru.pdf
--------------------------------------------------------------------------------
/tests/readme.txt:
--------------------------------------------------------------------------------
1 | See https://github.com/freegraphics/MIDSdata/tree/master/tests for the data.
--------------------------------------------------------------------------------
/PrepareData/readme.txt:
--------------------------------------------------------------------------------
1 | Use Movies1MDataset.py to prepare the data for the system from the MovieLens 1M data set.
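
For example (a sketch; the input and output paths are hard-coded in the Consts class at the top of Movies1MDataset.py, so adjust them to your machine first):

    python Movies1MDataset.py

This runs convert_csv() and then prepare_data(): the first rewrites the pre_*.csv files into users.csv, movies.csv and ratings.csv, the second builds the *.npy arrays in the data directory.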
--------------------------------------------------------------------------------
/docs/autoencoders.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freegraphics/MIDS/HEAD/docs/autoencoders.png
--------------------------------------------------------------------------------
/ml-1m/readme.txt:
--------------------------------------------------------------------------------
1 | See https://github.com/freegraphics/MIDSdata/tree/master/ml-1m for the data.
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | Recommender system demo.
2 | 
3 | See the docs folder for a short description.
4 | 
5 | The demo uses the numpy, pandas and theano Python libraries. (It was tested with Python 2.7 and 3.4.)
6 | 
7 | 
8 | 
9 | (P.S. Looking for a job.)
--------------------------------------------------------------------------------
/Rscripts/learning_lines.R:
--------------------------------------------------------------------------------
1 | #
2 | # the script to draw the learning lines
3 | #
4 | 
5 | # [F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\trace.txt] -- the trace file in the folder where we ran the recommender system
6 | 
7 | learning <- read.table("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\trace.txt", quote="\"", comment.char="")
8 | x <- c(min(learning$V1),max(learning$V1))
9 | y <- c(0,max(learning$V2))
10 | par(col="black")
11 | plot(x,y)
12 | lines(learning$V1,learning$V2)
13 | lines(learning$V1,learning$V3)
14 | lines(learning$V1,learning$V4)
15 | lines(learning$V1,learning$V5)
16 | lines(learning$V1,learning$V6)
17 | lines(learning$V1,learning$V7)
18 | #lines(learning$V1,learning$V9)
19 | y <- c(min(learning$V2),min(learning$V2))
20 | lines(x,y)
21 | y <- c(min(learning$V3),min(learning$V3))
22 | lines(x,y)
23 | 
24 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2016 freegraphics
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Rscripts/readme.txt:
--------------------------------------------------------------------------------
1 | Scripts for preparing the MovieLens 1M data set for the recommender system and for analysing how the system works.
2 | 
3 | I used prepare_data.R to create the files in the data directory.
4 | 
5 | Use the show_ids.R and learning_lines.R scripts to watch the learning process.
6 | To see the ids:
7 | 1. In mids.py set
8 |        train_mode = False
9 |        get_best_films_for_users_mode = False
10 |    and set the index of the result folder to read the ids from. For example, set
11 |        indexes = [10]
12 |    to see the ids of the 10th result folder.
13 | 2. Run
14 |        python mids.py
15 | 3. Change the directory paths in show_ids.R:
16 |        items_ids <- read.delim("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\ids_005\\items_ids.dta", header=FALSE)
17 |        users_ids <- read.delim("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\ids_005\\users_ids.dta", header=FALSE)
18 | 
19 | To see the learning lines:
20 | 1. Change the directory path in learning_lines.R
21 |        learning <- read.table("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\trace.txt", quote="\"", comment.char="")
22 |    to the path of the trace.txt generated by running python mids.py for training.
23 | 
--------------------------------------------------------------------------------
/RecommenderSystem/readme.txt:
--------------------------------------------------------------------------------
1 | -------------------------------------------------
2 | The file Consts.py contains the Consts class with the constants for the recommender system demo.
3 | 
4 | -------------------------------------------------
5 | The file Nets.py contains simple classes for the neural net and the autoencoder net.
6 | 
7 | -------------------------------------------------
8 | The file MIDS.py contains a demo of the base functions of the recommender system.
9 | 
10 | 1. To prepare the data for the recommender system demo, use the scripts from the Rscripts and PrepareData folders.
11 | 
12 | 2. To train the recommender system demo, set
13 |        train_mode = True
14 |    in the startup if statement of the module
15 |    and run
16 |        python mids.py
17 | 
18 | 3. To get the nearest movies and convert the npy ids files to text format, set
19 |        train_mode = False
20 |        get_best_films_for_users_mode = False
21 |    and run
22 |        python mids.py
23 |    (you may need to set the movie id(s) in the nearest_movies() function at the line
24 |        movie_ids = [1251] #,1974
25 |    to see the nearest movies for specific movie ids)
26 | 
27 | 4. To get the best movies for random users, set
28 |        train_mode = False
29 |        get_best_films_for_users_mode = True
30 |    and in
31 |        UserLines.main(14)
32 |    set the id of the folder to load the model from.
33 | 
34 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/Rscripts/prepare_data.R:
--------------------------------------------------------------------------------
1 | #
2 | # the script to prepare the data for the recommender system
3 | #
4 | 
5 | # first convert the MovieLens 1M dataset files to the csv file format
6 | 
7 | MovieLens1MPath <- "d:\\works\\github\\mids\\ml-1m\\"
8 | ResultCSVPath <- "d:\\works\\github\\mids\\data\\"
9 | 
10 | MovieLens1MMovieFileName <- paste(MovieLens1MPath,"movies.dat", sep="")
11 | movies1m <- read.delim(MovieLens1MMovieFileName,header=FALSE, quote="",stringsAsFactors=FALSE)
12 | movies1mSp <- strsplit(movies1m$V1,"::",perl=TRUE)
13 | 
14 | movies_csv <- data.frame(matrix(nrow = length(movies1mSp),ncol = 3))
15 | for(i in 1:length(movies1mSp))
16 | {
17 |   movies_csv[i,1] <- as.integer(movies1mSp[[i]][1])
18 |   movies_csv[i,2] <- movies1mSp[[i]][2]
19 |   movies_csv[i,3] <- movies1mSp[[i]][3]
20 | }
21 | MoviesCSVFileName <- paste(ResultCSVPath,"pre_movies.csv",sep="")
22 | write.table(movies_csv,file = MoviesCSVFileName, row.names = FALSE, col.names=FALSE, sep=";", quote = TRUE)
23 | 
24 | MovieLens1MUsersFileName <- paste(MovieLens1MPath,"users.dat",sep="")
25 | users1m <- read.delim(MovieLens1MUsersFileName,header=FALSE, quote="",stringsAsFactors=FALSE)
26 | users1mSp <- strsplit(users1m$V1,"::",perl=TRUE)
27 | 
28 | users_csv <- data.frame(matrix(nrow = length(users1mSp),ncol = 5))
29 | for(i in 1:length(users1mSp))
30 | {
31 |   users_csv[i,1] <- as.integer(users1mSp[[i]][1])
32 |   users_csv[i,2] <- users1mSp[[i]][2]
33 |   users_csv[i,3] <- users1mSp[[i]][3]
34 |   users_csv[i,4] <- users1mSp[[i]][4]
35 |   users_csv[i,5] <- users1mSp[[i]][5]
36 | }
37 | 
38 | UsersCSVFileName <- paste(ResultCSVPath,"pre_users.csv",sep="")
39 | write.table(users_csv,file = UsersCSVFileName, row.names = FALSE, col.names=FALSE, sep=";", quote = TRUE)
40 | 
41 | MovieLens1MRatingsFileName <- paste(MovieLens1MPath,"ratings.dat",sep = "")
42 | ratings1m <- read.delim(MovieLens1MRatingsFileName,header=FALSE, quote="",stringsAsFactors=FALSE)
43 | ratings1mSp <- strsplit(ratings1m$V1,"::",perl=TRUE)
44 | 
45 | ratings_csv <- data.frame(matrix(nrow = length(ratings1mSp),ncol = 4))
46 | for(i in 1:length(ratings1mSp))
47 | {
48 |   ratings_csv[i,1] <- ratings1mSp[[i]][1]
49 |   ratings_csv[i,2] <- ratings1mSp[[i]][2]
50 |   ratings_csv[i,3] <- ratings1mSp[[i]][3]
51 |   ratings_csv[i,4] <- ratings1mSp[[i]][4]
52 | }
53 | 
54 | RatingsCSVFileName <- paste(ResultCSVPath,"pre_ratings.csv",sep="")
55 | write.table(ratings_csv,file = RatingsCSVFileName, row.names = FALSE, col.names=FALSE, sep=";", quote = TRUE)
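# Note: a sketch, not part of the original pipeline -- the per-row copy loops
# above can be replaced by binding the strsplit() results into a matrix in a
# single step, which is much faster in R. For the movies table, for example:
#
#   movies_mat <- do.call(rbind, movies1mSp)   # character matrix, one row per movie
#   movies_csv <- data.frame(as.integer(movies_mat[,1]),   # MovieID
#                            movies_mat[,2],               # Title
#                            movies_mat[,3],               # Genres
#                            stringsAsFactors = FALSE)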
56 | 
57 | 
--------------------------------------------------------------------------------
/Rscripts/show_ids.R:
--------------------------------------------------------------------------------
1 | #
2 | # the script to draw the ids of a recommender system run
3 | #
4 | 
5 | # [d:\\works\\projects\\RecommenderSystem\\tests\\] -- the folder where we ran the recommender system
6 | items_ids <- read.delim("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\ids_005\\items_ids.dta", header=FALSE)
7 | users_ids <- read.delim("F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\ids_005\\users_ids.dta", header=FALSE)
8 | 
9 | par(col="black")
10 | plot(items_ids_01$V2,items_ids_01$V3,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
11 | plot(items_ids$V2,items_ids$V3,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
12 | plot(items_ids_01$V4,items_ids_01$V5,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
13 | plot(items_ids$V4,items_ids$V5,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
14 | plot(items_ids_01$V6,items_ids_01$V7,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
15 | plot(items_ids$V6,items_ids$V7,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
16 | plot(items_ids_01$V8,items_ids_01$V9,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
17 | plot(items_ids$V8,items_ids$V9,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
18 | plot(items_ids_01$V10,items_ids_01$V11,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
19 | plot(items_ids$V10,items_ids$V11,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
20 | plot(items_ids_01$V12,items_ids_01$V13,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
21 | plot(items_ids$V12,items_ids$V13,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
22 | plot(items_ids_01$V14,items_ids_01$V15,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
23 | plot(items_ids$V14,items_ids$V15,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
24 | plot(users_ids_01$V2,users_ids_01$V3,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
25 | plot(users_ids$V2,users_ids$V3,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
26 | plot(users_ids_01$V4,users_ids_01$V5,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
27 | plot(users_ids$V4,users_ids$V5,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
28 | plot(users_ids_01$V6,users_ids_01$V7,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
29 | plot(users_ids$V6,users_ids$V7,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
30 | plot(users_ids_01$V8,users_ids_01$V9,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
31 | plot(users_ids$V8,users_ids$V9,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
32 | plot(users_ids_01$V10,users_ids_01$V11,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
33 | plot(users_ids$V10,users_ids$V11,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
34 | plot(users_ids_01$V12,users_ids_01$V13,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
35 | plot(users_ids$V12,users_ids$V13,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
36 | plot(users_ids_01$V14,users_ids_01$V15,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
37 | plot(users_ids$V14,users_ids$V15,xlim = c(-0.5,0.5),ylim = c(-0.5,0.5))
38 | 
39 | hist(items_ids$V2,breaks = 35)
40 | hist(items_ids$V3,breaks = 35)
41 | hist(items_ids$V4,breaks = 35)
42 | hist(items_ids$V5,breaks = 35)
43 | hist(items_ids$V6,breaks = 35)
44 | hist(items_ids$V7,breaks = 35)
45 | hist(items_ids$V8,breaks = 35)
46 | hist(items_ids$V9,breaks = 35)
47 | hist(items_ids$V10,breaks = 35)
48 | hist(items_ids$V11,breaks = 35)
49 | hist(items_ids$V12,breaks = 35)
50 | hist(items_ids$V13,breaks = 35)
51 | hist(items_ids$V14,breaks = 35)
52 | hist(items_ids$V15,breaks = 35)
53 | 
54 | # update the ids for the next step
55 | items_ids_01 <- items_ids
56 | users_ids_01 <- users_ids
57 | 
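# Note: a sketch, not part of the original script -- the repeated plot() calls
# above can be generated with a loop over the id column pairs, e.g.
#
#   for (k in seq(2, 14, by = 2)) {
#     plot(items_ids[[k]], items_ids[[k + 1]],
#          xlim = c(-0.5, 0.5), ylim = c(-0.5, 0.5))
#   }
#
# items_ids_01/users_ids_01 hold the ids of the previous step, so on the first
# run execute the two assignments above before drawing the *_01 plots.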
-------------------------------------------------------------------------------- /RecommenderSystem/Consts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 16.08.10 3 | 4 | @author: klizardin 5 | 6 | The MIT License (MIT) 7 | 8 | Copyright (c) 2016 klizardin 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is furnished 15 | to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | 27 | ''' 28 | 29 | import os 30 | 31 | import theano 32 | import theano.tensor as T 33 | from theano.tensor.shared_randomstreams import RandomStreams 34 | 35 | class Consts(object): 36 | ''' 37 | default constants of classes 38 | ''' 39 | 40 | def __init__(self): 41 | ''' 42 | the constructor 43 | ''' 44 | 45 | self.load_from_ids = int(0) 46 | 47 | # rates constants 48 | self.MaxRate = int(5) 49 | 50 | # paths constants 51 | self.data_path = "F:\\works\\projects\\python\\RecommenderSystem\\data" 52 | self.result_path = "F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35" 53 | self.trained_path = "F:\\works\\projects\\python\\RecommenderSystem\\tests\\16.09.24 -- 17-35\\ids_000" 54 | 55 | # source data file names 56 | self.users_cvs_file_name = os.path.join(self.data_path,"users.csv") 57 | self.movies_cvs_file_name = os.path.join(self.data_path,"movies.csv") 58 | self.ratings_cvs_file_name = os.path.join(self.data_path,"ratings.csv") 59 | self.userids_npy_file_name = os.path.join(self.data_path,"userids.npy") 60 | self.moviesids_npy_file_name = os.path.join(self.data_path,"moviesids.npy") 61 | self.ratings_by_user_npy_file_name = os.path.join(self.data_path,"ratings_by_user.npy") 62 | self.ratings_by_user_ids_npy_file_name = os.path.join(self.data_path,"ratings_by_user_ids.npy") 63 | self.ratings_by_user_idx_npy_file_name = os.path.join(self.data_path,"ratings_by_user_idx.npy") 64 | self.ratings_by_movie_npy_file_name = os.path.join(self.data_path,"ratings_by_movie.npy") 65 | self.ratings_by_movie_ids_npy_file_name = os.path.join(self.data_path,"ratings_by_movie_ids.npy") 66 | self.ratings_by_movie_idx_npy_file_name = os.path.join(self.data_path,"ratings_by_movie_idx.npy") 67 | 68 | # result data file names 69 | self.users_ids_file_name = os.path.join(self.result_path,"users_ids.npy") 70 | self.items_ids_file_name = os.path.join(self.result_path,"items_ids.npy") 71 | self.nearest_movies_file_name = os.path.join(self.result_path,"nearest_movies_%d_%03d.txt") 72 | self.knearest_movies_file_name = os.path.join(self.result_path,"knearest_movies_%03d.txt") 73 | 
self.users_ids_to_item_id_autoencoder_file_name = os.path.join(self.result_path,"users_ids_ldt_%s.npy") 74 | self.items_ids_to_user_id_autoencoder_file_name = os.path.join(self.result_path,"items_ids_ldt_%s.npy") 75 | self.user_ids_rate_to_item_ids_net_file_name = os.path.join(self.result_path,"user_ids_rate_to_item_ids_ldt_%s.npy") 76 | self.result_net_file_name = os.path.join(self.result_path,"result_ldt_%s.npy") 77 | self.users_ids_dta_file_name = os.path.join(self.result_path,"users_ids.dta") 78 | self.items_ids_dta_file_name = os.path.join(self.result_path,"items_ids.dta") 79 | self.trace_file_name = os.path.join(self.result_path,"trace.txt") 80 | self.trace_rates_file_name = os.path.join(self.result_path,"trace_rates.txt") 81 | self.user_line_file_name = os.path.join(self.result_path,"%04d_line.txt") 82 | self.best_movies_for_user_file_name = os.path.join(self.result_path,"%04d_best_movies.txt") 83 | self.user_rates_of_movies_file_name = os.path.join(self.result_path,"%04d_movies_rate.txt") 84 | self.user_movies_by_rates_file_name = os.path.join(self.result_path,"%04d_movies_by_rate.txt") 85 | 86 | self.save_cycles = int(50) 87 | 88 | # MIDS constants 89 | self.user_id_size = int(31) 90 | self.item_id_size = int(31) 91 | 92 | self.user_max_distance = float(256.0*self.user_id_size) 93 | self.item_max_distance = float(256.0*self.item_id_size) 94 | 95 | self.encode_elements_count = int(5) 96 | 97 | # encoder defaults 98 | self.encoder_batch_size = int(32) 99 | self.encoder_learning_rate = float(0.1) 100 | self.encoder_corruption_rate = float(0.0) 101 | self.encoder_hidden_layers_count = int(6) 102 | self.encoder_hidden_layers_activation = T.nnet.relu 103 | self.encoder_hidden_layer_size = int(256) 104 | self.encoder_L1_decay = float(0.0) 105 | self.encoder_L2_decay = float(1.0e-4) 106 | self.encoder_loss_k = float(1.0e-3) 107 | 108 | # result defaults 109 | self.result_batch_size = int(32) 110 | self.result_learning_rate = float(0.1) 111 | self.result_hidden_layers_count = int(6) 112 | self.result_hidden_layers_activation = T.nnet.relu 113 | self.result_hidden_layer_size = int(256) 114 | self.result_L1_decay = float(0.0) 115 | self.result_L2_decay = float(1.0e-4) 116 | self.result_loss_k = float(1.0e-3) 117 | 118 | # items ids net defaults 119 | self.itemids_batch_size = int(32) 120 | self.itemids_learning_rate = float(0.1) 121 | self.itemids_hidden_layers_count = int(6) 122 | self.itemids_hidden_layers_activation = T.nnet.relu 123 | self.itemids_hidden_layer_size = int(256) 124 | self.itemids_L1_decay = float(0.0) 125 | self.itemids_L2_decay = float(1.0e-4) 126 | self.itemids_loss_k = float(1.0e-3) 127 | 128 | self.train_rate = float(0.9) 129 | self.validate_cycles = int(5) 130 | 131 | # move ids constants 132 | self.users_ids_move_elem_count_rate = float(0.1) 133 | self.items_ids_move_elem_count_rate = float(0.2) 134 | self.users_ids_move_elem_count_rate1 = float(0.1) 135 | self.items_ids_move_elem_count_rate1 = float(0.1) 136 | self.users_ids_avg_rate = float(0.2) 137 | self.items_ids_avg_rate = float(0.2) 138 | self.new_user_cycles = int(100) 139 | self.new_item_cycles = int(100) 140 | self.ids_update_users_normilized_vs_avg_rate = float(0.99) 141 | self.ids_update_items_normilized_vs_avg_rate = float(0.99) 142 | 143 | self.avg_dx_item_small_weight = float(0.3) 144 | self.min_max_compresion_rate = float(0.95) 145 | 146 | self.ids_move_count = int(1000) 147 | self.ids_move_count_coef = float(0.5) 148 | self.dist_sqrt_coef = float(9.0) 149 | 150 | self.train_rates_rate = float(0.5) 
151 |         self.train_itemids_rate = float(0.5)
152 | 
153 |         self.update_index(self.load_from_ids*self.save_cycles)
154 |         return
155 | 
156 |     def get_file_name_by_index(self,index,file_name):
157 |         path,fname = os.path.split(file_name)
158 |         path = os.path.join(path,"ids_%03d" % (index))
159 |         if not os.path.exists(path):
160 |             os.mkdir(path)
161 |         return os.path.join(path,fname)
162 | 
163 |     def update_index(self,index):
164 |         if index<int(100):
            pass
        elif index>=int(100) and index<int(500):
            pass
        elif index>=int(500):
182 |             r = float(index-500)/float(1000.0*3.0)
183 |             self.users_ids_move_elem_count_rate = float(0.25)
184 |             self.items_ids_move_elem_count_rate = float(0.25)
185 |             self.users_ids_move_elem_count_rate1 = float(1.0)
186 |             self.items_ids_move_elem_count_rate1 = float(1.0)
187 |             self.ids_move_count = int(2000)
188 |             self.ids_move_count_coef = float(0.5/(1.0+r))
189 |             return
190 |         return
191 | 
--------------------------------------------------------------------------------
/PrepareData/Movies1MDataset.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on 23.09.2016
3 | 
4 | @author: hp
5 | '''
6 | 
7 | import os
8 | import pandas
9 | import numpy
10 | import datetime
11 | import pytz
12 | import re
13 | import csv
14 | import time
15 | import sys
16 | 
17 | 
18 | class Consts(object):
19 |     def __init__(self):
20 |         self.data_path = "F:\\works\\projects\\python\\RecommenderSystem\\data"
21 |         self.timezone_path = "F:\\works\\projects\\python\\RecommenderSystem\\zipcode.csv"
22 | 
23 |         self.movies_path = os.path.join(self.data_path,"pre_movies.csv")
24 |         self.users_path = os.path.join(self.data_path,"pre_users.csv")
25 |         self.ratings_path = os.path.join(self.data_path,"pre_ratings.csv")
26 |         self.users_csv_file_name = os.path.join(self.data_path,"users.csv")
27 |         self.movies_csv_file_name = os.path.join(self.data_path,"movies.csv")
28 |         self.ratings_csv_file_name = os.path.join(self.data_path,"ratings.csv")
29 |         self.min_year = 1900
30 | 
31 |         self.users_cvs_file_name = os.path.join(self.data_path,"users.csv")
32 |         self.movies_cvs_file_name = os.path.join(self.data_path,"movies.csv")
33 |         self.ratings_cvs_file_name = os.path.join(self.data_path,"ratings.csv")
34 |         self.userids_npy_file_name = os.path.join(self.data_path,"userids.npy")
35 |         self.moviesids_npy_file_name = os.path.join(self.data_path,"moviesids.npy")
36 |         self.ratings_by_user_npy_file_name = os.path.join(self.data_path,"ratings_by_user.npy")
37 |         self.ratings_by_user_ids_npy_file_name = os.path.join(self.data_path,"ratings_by_user_ids.npy")
38 |         self.ratings_by_user_idx_npy_file_name = os.path.join(self.data_path,"ratings_by_user_idx.npy")
39 |         self.ratings_by_movie_npy_file_name = os.path.join(self.data_path,"ratings_by_movie.npy")
40 |         self.ratings_by_movie_ids_npy_file_name = os.path.join(self.data_path,"ratings_by_movie_ids.npy")
41 |         self.ratings_by_movie_idx_npy_file_name = os.path.join(self.data_path,"ratings_by_movie_idx.npy")
42 | 
43 |         self.MaxRate = 5
44 |         return
45 |     pass
46 | 
47 | 
48 | def convert_csv(consts):
49 |     print("converting csv...")
50 |     timezones_cvs = pandas.read_csv(
51 |         consts.timezone_path
52 |         ,dtype = {
53 |             'zip':numpy.str
54 |             ,'city':numpy.str
55 |             ,'state':numpy.str
56 |             ,'latitude':numpy.float32
57 |             ,'longitude':numpy.float32
58 |             ,'timezone':numpy.int32
59 |             ,'dst':numpy.int32
60 |         }
61 |         ,index_col = False
62 |     )
63 |     print("timezone data was loaded")
64 |     movies_cvs = pandas.read_csv(
65 |         consts.movies_path
66 |         ,sep=";"
67 |         ,header=None
68 |         ,quotechar='"'
69 |         ,encoding="cp1251"
,names=("MovieID","Name","Genders") 71 | ,dtype = { 72 | 'MovieID':numpy.int32 73 | ,'Name':numpy.str 74 | ,'Genders':numpy.str 75 | } 76 | ,index_col = False 77 | ) 78 | print("movies data was loaded") 79 | users_cvs = pandas.read_csv( 80 | consts.users_path 81 | ,sep=";" 82 | ,header=None 83 | ,quotechar='"' 84 | ,encoding="cp1251" 85 | ,names=("UserID","Gender","Age","Occupation","ZipCode") 86 | ,dtype = { 87 | 'UserID':numpy.int32 88 | ,'Gender':numpy.str 89 | ,'Age':numpy.int32 90 | ,'Occupation':numpy.int32 91 | ,"ZipCode":numpy.str 92 | } 93 | ,index_col = False 94 | ) 95 | print("users data was loaded") 96 | ratings_cvs = pandas.read_csv( 97 | consts.ratings_path 98 | ,sep=";" 99 | ,header=None 100 | ,quotechar='"' 101 | ,encoding="cp1251" 102 | ,names=("UserID","MovieID","Rating","Timestamp") 103 | ,dtype = { 104 | 'UserID':numpy.int32 105 | ,'MovieID':numpy.int32 106 | ,'Rating':numpy.float32 107 | ,'Timestamp':numpy.int32 108 | } 109 | ,index_col = False 110 | ) 111 | print("ratings data was loaded") 112 | 113 | lt = time.time() 114 | prog = re.compile(pattern = "\((\d+)\)$") 115 | movies_cvs['year'] = int(consts.min_year) 116 | for i in numpy.arange(movies_cvs.shape[0]-1): 117 | name = str(movies_cvs.at[i,"Name"]) 118 | m = prog.search(name) 119 | if m: 120 | movies_cvs.at[i,'year'] = int(m.group(1)) 121 | pass 122 | t1 = time.time() 123 | if t1>lt+1: 124 | p = float(i)/float(movies_cvs.shape[0])*100.0 125 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 126 | sys.stdout.write("movies csv data process %f %%\r" % (p,)) 127 | lt = lt+1 128 | pass 129 | print("movies cvs data was prepared") 130 | 131 | users_cvs['latitude'] = float(0) 132 | users_cvs['longitude'] = float(0) 133 | users_cvs['timezone'] = int(0) 134 | users_cvs['dts'] = int(0) 135 | for i in numpy.arange(users_cvs.shape[0]-1): 136 | zipcode = users_cvs.loc[i,'ZipCode'] 137 | zc = timezones_cvs[timezones_cvs.zip.isin([zipcode])] 138 | if len(zc)==1: 139 | users_cvs.at[i,'timezone'] = int(zc['timezone']) 140 | users_cvs.at[i,'latitude'] = float(zc['latitude']) 141 | users_cvs.at[i,'longitude'] = float(zc['longitude']) 142 | users_cvs.at[i,'dts'] = int(zc['dst']) 143 | pass 144 | t1 = time.time() 145 | if t1>lt+1: 146 | p = float(i)/float(users_cvs.shape[0])*100.0 147 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 148 | sys.stdout.write("users csv data process %f %%\r" % (p,)) 149 | lt = lt+1 150 | pass 151 | print("users cvs data was prepared") 152 | 153 | ratings_cvs["wday"] = int(0) 154 | ratings_cvs["yday"] = int(0) 155 | ratings_cvs["year"] = int(consts.min_year) 156 | 157 | for i in numpy.arange(ratings_cvs.shape[0]-1): 158 | user_id = int(ratings_cvs.at[i,"UserID"]) 159 | t0 = ratings_cvs.at[i,"Timestamp"] 160 | ui = users_cvs[users_cvs.UserID.isin([user_id])] 161 | if len(ui)==1: 162 | timezone = int(ui["timezone"]) - 2 163 | tt = datetime.datetime.fromtimestamp(t0,datetime.timezone(datetime.timedelta(hours=timezone))).timetuple() 164 | ratings_cvs.at[i,"wday"] = tt.tm_wday 165 | ratings_cvs.at[i,"yday"] = tt.tm_yday 166 | ratings_cvs.at[i,"year"] = tt.tm_year 167 | pass 168 | t1 = time.time() 169 | if t1>lt+1: 170 | p = float(i)/float(ratings_cvs.shape[0])*100.0 171 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 172 | sys.stdout.write("ratings csv data process %f %%\r" % (p,)) 173 | lt = lt+1 174 | pass 175 | print("ratings cvs data was prepared") 176 | 177 | users_cvs.to_csv( 178 | path_or_buf = consts.users_csv_file_name, 179 | sep = ";" 180 | ,header = False 181 | ,index = False 182 | ,encoding = "utf-8" 183 | ,quoting = 
csv.QUOTE_ALL 184 | ,quotechar = '"' 185 | ,line_terminator = "\n" 186 | ,doublequote = True 187 | ) 188 | movies_cvs.to_csv( 189 | path_or_buf = consts.movies_csv_file_name, 190 | sep = ";" 191 | ,header = False 192 | ,index = False 193 | ,encoding = "utf-8" 194 | ,quoting = csv.QUOTE_ALL 195 | ,quotechar = '"' 196 | ,line_terminator = "\n" 197 | ,doublequote = True 198 | ) 199 | ratings_cvs.to_csv( 200 | path_or_buf = consts.ratings_csv_file_name 201 | ,sep = ";" 202 | ,header = False 203 | ,index = False 204 | ,encoding = "utf-8" 205 | ,quoting = csv.QUOTE_ALL 206 | ,quotechar = '"' 207 | ,line_terminator = "\n" 208 | ,doublequote = True 209 | ) 210 | print("converting done") 211 | return 212 | 213 | def get_aranged(value,min_value,max_value): 214 | if abs(max_value-min_value)<1e-9: 215 | return 0 216 | return (float(value)-float(min_value))/(float(max_value)-float(min_value)) - float(0.5) 217 | 218 | def prepare_data(consts = Consts()): 219 | print("loading data...") 220 | 221 | user_indice = 0 222 | movie_indice = 1 223 | 224 | 225 | # user_cvs 226 | # columns: 227 | # id -- int (key); sex -- ['M'|'F']; age -- int; 228 | # occupation -- int; latitude -- real; longitude -- real; 229 | # timezone -- int; dts -- [0|1]; 230 | # 231 | users_cvs = pandas.read_csv( 232 | consts.users_cvs_file_name 233 | ,names = ("id","sex","age","occupation","zipcode","latitude","longitude","timezone","dts") 234 | ,dtype = { 235 | 'id':numpy.int32 236 | ,'sex':numpy.str 237 | ,'age':numpy.int32 238 | ,'occupation':numpy.int32 239 | ,"zipcode":numpy.str 240 | ,'latitude':numpy.float32 241 | ,'longitude':numpy.float32 242 | ,'timezone':numpy.int32 243 | ,'dts':numpy.int32 244 | } 245 | ,sep=";" 246 | ,skipinitialspace = False 247 | ,header=None 248 | ,index_col = False 249 | ,quoting = csv.QUOTE_ALL 250 | ,quotechar='"' 251 | ,encoding="utf-8" 252 | ,na_values='' 253 | ) 254 | print("The users_cvs was loaded.") 255 | #print(users_cvs) 256 | 257 | # movies_cvs 258 | # columns: 259 | # id -- int (key); name -- string; gender -- string; year -- int; 260 | movies_cvs = pandas.read_csv( 261 | consts.movies_cvs_file_name 262 | ,sep=";" 263 | ,names = ["id","name","gender","year"] 264 | ,dtype = { 265 | 'id':numpy.int32 266 | ,'name':numpy.str 267 | ,'gender':numpy.str 268 | ,'year':numpy.int32 269 | } 270 | ,skipinitialspace = False 271 | ,header=None 272 | ,index_col = False 273 | ,quoting = csv.QUOTE_ALL 274 | ,quotechar='"' 275 | ,encoding="utf-8" 276 | ) 277 | print("The movies_cvs was loaded.") 278 | #print(movies_cvs) 279 | 280 | # ratings_cvs 281 | # columns: 282 | # userid -- int (from users_cvs id key); filmid -- int (from movies_cvs id key); 283 | # rate -- real; wday -- int; yday -- int; year -- int; 284 | ratings_cvs = pandas.read_csv( 285 | consts.ratings_cvs_file_name 286 | ,sep=";" 287 | ,names=["userid","filmid","rate",'Timestamp',"wday","yday","year"] 288 | ,dtype = { 289 | 'userid':numpy.int32 290 | ,'filmid':numpy.str 291 | ,'rate':numpy.float32 292 | ,'Timestamp':numpy.int32 293 | ,'wday':numpy.int32 294 | ,'yday':numpy.int32 295 | ,'year':numpy.int32 296 | } 297 | ,skipinitialspace = False 298 | ,header=None 299 | ,index_col = False 300 | ,quoting = csv.QUOTE_ALL 301 | ,quotechar='"' 302 | ,encoding="utf-8" 303 | ) 304 | print("The ratings_cvs was loaded.") 305 | #print(ratings_cvs) 306 | 307 | 308 | # usersids 309 | # columns: 310 | # sex -- +0.5 - 'M', -0.5 - 'F' 311 | # age -- -0.5 - min, +0.5 - max 312 | 313 | last_user_id = users_cvs["id"][len(users_cvs)-1] 314 | usersids = 
numpy.zeros(dtype=numpy.float32,shape=(last_user_id,2)) 315 | age_min = 1 316 | age_max = 56 317 | for i in numpy.arange(len(users_cvs)): 318 | if users_cvs["sex"][i]=="M": 319 | usersids[users_cvs["id"][i]-1,0] = 0.5 320 | else: 321 | usersids[users_cvs["id"][i]-1,0] = -0.5 322 | usersids[users_cvs["id"][i]-1,1] = get_aranged(value = users_cvs["age"][i], min_value = age_min, max_value = age_max) 323 | print(usersids[0:100,]) 324 | 325 | # moviesids 326 | # columns: 327 | # year -- -0.5 - min, +0.5 - max 328 | 329 | last_film_id = movies_cvs["id"][len(movies_cvs)-1] 330 | moviesids = numpy.zeros(dtype=numpy.float32,shape=(last_film_id,1)) 331 | min_year = float(movies_cvs["year"].min()) 332 | max_year = float(movies_cvs["year"].max()) 333 | d_year = max_year - min_year 334 | min_year = min_year - d_year*0.1 335 | max_year = max_year + d_year*0.1 336 | for i in numpy.arange(len(movies_cvs)): 337 | moviesids[movies_cvs["id"][i]-1,0] = get_aranged(value = movies_cvs["year"][i], min_value = min_year, max_value = max_year) 338 | print(moviesids[0:100,]) 339 | 340 | 341 | ratings_cvs["id"] = numpy.arange(len(ratings_cvs)) 342 | ratings_cvs["UserRate"] = ratings_cvs["rate"] 343 | ratings_cvs["MeanRate"] = ratings_cvs["rate"] 344 | grouped_by_user = ratings_cvs.groupby(by="userid") 345 | #mean_rate_by_user = grouped_by_user["rate"].mean() 346 | lt = time.time() 347 | i = 0 348 | for name,group in grouped_by_user: 349 | mean_rate_by_user = group["rate"].mean() 350 | ratings_cvs.loc[group["id"],"UserRate"] = ratings_cvs.loc[group["id"],"UserRate"] - mean_rate_by_user 351 | ratings_cvs.loc[group["id"],"MeanRate"] = mean_rate_by_user 352 | t1 = time.time() 353 | if t1>lt+1: 354 | p = float(i)/float(len(grouped_by_user))*100.0 355 | print("UserRates %f %%" % (p)) 356 | lt = lt+1 357 | i = i + 1 358 | ratings_cvs["UserRate"] = ratings_cvs["UserRate"]/(2*consts.MaxRate) 359 | print("The UserRates column was calculated") 360 | print(ratings_cvs.head(100)) 361 | 362 | # ratings_by_user_idx 363 | # columns: 364 | # for one user_id, ratings_by_user_ids and ratings_by_user indexes pair 365 | # 366 | # start_indice -- int 367 | # end_indice -- int 368 | 369 | # ratings_by_user_ids 370 | # columns: 371 | # user_id -- int 372 | # film_id -- int 373 | 374 | # ratings_by_user 375 | # every row for one ratings_by_user_ids row i.m. for one pair (user_id,film_id) 376 | # columns: 377 | # user_rate -- -0.5 - min .. +0.5 - max; 378 | # wday -- -0.5 - min .. 
+ 0.5 - max; 379 | 380 | ratings_by_user = numpy.zeros(dtype=numpy.float32,shape=(len(ratings_cvs),2)) 381 | ratings_by_user_ids = numpy.zeros(dtype=numpy.int64,shape=(len(ratings_cvs),2)) 382 | ratings_by_user_idx = numpy.zeros(dtype=numpy.int64,shape=(len(grouped_by_user),2)) 383 | i = 0 384 | li = 0 385 | lt = time.time() 386 | j = 0 387 | for name,group in grouped_by_user: 388 | user_id = numpy.int64(name) 389 | for row_id in group["id"]: 390 | ratings_by_user_ids[j,user_indice] = user_id 391 | ratings_by_user_ids[j,movie_indice] = numpy.int64(ratings_cvs.loc[row_id,"filmid"]) 392 | ratings_by_user[j,0] = ratings_cvs.loc[row_id,"UserRate"] 393 | ratings_by_user[j,1] = get_aranged(value = ratings_cvs.loc[row_id,"wday"], min_value = 0, max_value = 6) 394 | j = j + 1 395 | ratings_by_user_idx[i,] = [li,li+len(group)] 396 | li = li + len(group) 397 | t1 = time.time() 398 | if t1>lt+1: 399 | print("rating_by_user %f %%" % (float(i)/float(len(grouped_by_user))*100)) 400 | lt = lt+1 401 | i = i + 1 402 | print("ratings_by_user rates was calculated") 403 | 404 | # ratings_by_movie_idx 405 | # columns: 406 | # for one movie_id, ratings_by_movie_ids and ratings_by_movie indexes pair 407 | # 408 | # start_indice -- int 409 | # end_indice -- int 410 | 411 | # ratings_by_movie_ids 412 | # columns: 413 | # user_id -- int 414 | # film_id -- int 415 | 416 | # ratings_by_movie 417 | # every row for one ratings_by_movie_ids row i.m. for one pair (user_id,film_id) 418 | # columns: 419 | # user_rate -- -0.5 - min .. +0.5 - max; 420 | # wday -- -0.5 - min .. + 0.5 - max; 421 | 422 | group_by_movie = ratings_cvs.groupby(by="filmid") 423 | ratings_by_movie = numpy.zeros(dtype=numpy.float32,shape=(len(ratings_cvs),2)) 424 | ratings_by_movie_ids = numpy.zeros(dtype=numpy.int64,shape=(len(ratings_cvs),2)) 425 | ratings_by_movie_idx = numpy.zeros(dtype=numpy.int64,shape=(len(group_by_movie),2)) 426 | i = 0 427 | li = 0 428 | lt = time.time() 429 | j = 0 430 | for name,group in group_by_movie: 431 | film_id = numpy.int64(name) 432 | for row_id in group["id"]: 433 | ratings_by_movie_ids[j,user_indice] = numpy.int64(ratings_cvs.loc[row_id,"userid"]) 434 | ratings_by_movie_ids[j,movie_indice] = film_id 435 | ratings_by_movie[j,0] = ratings_cvs.loc[row_id,"UserRate"] 436 | ratings_by_movie[j,1] = get_aranged(value = ratings_cvs.loc[row_id,"wday"], min_value = 0, max_value = 6) 437 | j = j + 1 438 | ratings_by_movie_idx[i,] = [li,li+len(group)] 439 | li = li + len(group) 440 | t1 = time.time() 441 | if t1>lt+1: 442 | print("rating_by_movie %f %%" % (float(i)/float(len(group_by_movie))*100)) 443 | lt = lt+1 444 | i = i + 1 445 | print("ratings_by_movie rates was calculated") 446 | 447 | numpy.save(file=consts.userids_npy_file_name, arr=usersids) 448 | numpy.save(file=consts.moviesids_npy_file_name, arr=moviesids) 449 | numpy.save(file=consts.ratings_by_user_npy_file_name, arr=ratings_by_user) 450 | numpy.save(file=consts.ratings_by_user_ids_npy_file_name, arr=ratings_by_user_ids) 451 | numpy.save(file=consts.ratings_by_user_idx_npy_file_name, arr=ratings_by_user_idx) 452 | numpy.save(file=consts.ratings_by_movie_npy_file_name, arr=ratings_by_movie) 453 | numpy.save(file=consts.ratings_by_movie_ids_npy_file_name, arr=ratings_by_movie_ids) 454 | numpy.save(file=consts.ratings_by_movie_idx_npy_file_name, arr=ratings_by_movie_idx) 455 | print("data was prepared and was saved.") 456 | return 457 | 458 | 459 | 460 | if __name__ == '__main__': 461 | consts = Consts() 462 | convert_csv(consts) 463 | prepare_data(consts) 464 | 
pass -------------------------------------------------------------------------------- /RecommenderSystem/Nets.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 16.08.10 3 | 4 | @author: klizardin 5 | 6 | The MIT License (MIT) 7 | 8 | Copyright (c) 2016 klizardin 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is furnished 15 | to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | 27 | ''' 28 | 29 | import os 30 | import numpy 31 | 32 | import theano 33 | import theano.tensor as T 34 | from theano.tensor.shared_randomstreams import RandomStreams 35 | 36 | 37 | def save_layer(file_name,obj_name,value,consts): 38 | #file_name = consts.get_file_name_by_index(indx,file_name) 39 | file_name = file_name % (obj_name,) 40 | numpy.save(file = file_name, arr = value) 41 | return 42 | 43 | def load_layer(file_name,obj_name,consts): 44 | #file_name = consts.get_file_name_by_index(indx,file_name) 45 | file_name = file_name % (obj_name,) 46 | if not os.path.isfile(path = file_name): 47 | return None 48 | return numpy.asarray(a = numpy.load(file = file_name),dtype=theano.config.floatX) 49 | 50 | class ApproxNet(object): 51 | ''' 52 | The deep net for regression 53 | ''' 54 | def __create_layer(self, numpy_rng, batch_size, layer_size, W, b, prev_layer, i): 55 | if not W or not W[i]: 56 | delta = numpy.sqrt(6 / (float(prev_layer) + float(layer_size))) 57 | initial_W = numpy.asarray( 58 | numpy_rng.uniform( 59 | low = -delta, 60 | high = delta, 61 | size = (prev_layer, layer_size))) 62 | self.W.append(theano.shared(value = initial_W, name = 'W' + str(i))) 63 | #print("W%d size = (%d,%d)" % (i,prev_layer, layer_size)) 64 | else: 65 | self.W.append(W[i]) 66 | 67 | if not b or not b[i]: 68 | self.b.append(theano.shared(value = numpy.zeros(layer_size, dtype=theano.config.floatX),name = 'b'+str(i))) 69 | #print("b%d size = (%d,%d)" % (i,1,layer_size)) 70 | else: 71 | self.b.append(b[i]) 72 | 73 | self.Result.append(theano.shared(value = numpy.zeros((batch_size,layer_size), dtype=theano.config.floatX),name = 'Result'+str(i))) 74 | #print("Result%d size = (%d,%d)" % (i,batch_size,layer_size)) 75 | return layer_size 76 | 77 | def __create_hidden_layers(self, numpy_rng, batch_size, hidden_count, hidden_size, W, b, prev_layer,base_i): 78 | for i in numpy.arange(hidden_count): 79 | prev_layer = self.__create_layer(numpy_rng, batch_size, hidden_size, W, b, prev_layer, base_i+i) 80 | return prev_layer 81 | 82 | def __get_processed(self, input_x): 83 | """ 84 | Computes the values of the 
encoded layer 85 | """ 86 | data = input_x 87 | for idx in numpy.arange(self.hidden_count): 88 | self.Result[idx] = self.hidden_activation(T.dot(data, self.W[idx]) + self.b[idx]) 89 | data = self.Result[idx] 90 | self.Result[self.hidden_count] = T.tanh(T.dot(data, self.W[self.hidden_count]) + self.b[self.hidden_count]) 91 | return self.Result[self.hidden_count] 92 | 93 | def __get_L1(self): 94 | self.L1 = 0 95 | if len(self.W)==0: 96 | return self.L2 97 | for W in self.W: 98 | self.L1 = self.L1 + T.mean(T.abs_(W)) 99 | return self.L1/len(self.W) 100 | 101 | def __get_L2(self): 102 | self.L2 = 0 103 | if len(self.W)==0: 104 | return self.L2 105 | for W in self.W: 106 | self.L2 = self.L2 + T.mean(T.sqr(W)) 107 | return self.L2/len(self.W) 108 | 109 | def __get_cost_updates(self, target,learning_rate,L1_decay,L2_decay): 110 | """ This function computes the cost and the updates for one trainng 111 | step of the dA """ 112 | 113 | y = self.__get_processed(self.input_x) 114 | # note : we sum over the size of a datapoint; if we are using 115 | # minibatches, L will be a vector, with one entry per 116 | # example in minibatch 117 | L = T.mean(T.sqr(y-target),axis=1) 118 | # note : L is now a vector, where each element is the 119 | # cross-entropy cost of the reconstruction of the 120 | # corresponding example of the minibatch. We need to 121 | # compute the average of all these to get the cost of 122 | # the minibatch 123 | cost = T.mean(L) + self.__get_L2() * L2_decay + self.__get_L1() * L1_decay 124 | 125 | # compute the gradients of the cost of the `dA` with respect 126 | # to its parameters 127 | gparams = T.grad(cost, self.params) 128 | # generate the list of updates 129 | updates = [] 130 | updates.extend([ 131 | (param, param - learning_rate * gparam) 132 | for param, gparam in zip(self.params, gparams) 133 | ]) 134 | 135 | return (cost, updates) 136 | 137 | def __get_run(self): 138 | return self.__get_processed(self.input_x) 139 | 140 | def __init__(self 141 | ,batch_size 142 | ,input_size 143 | ,output_size 144 | ,hidden_count,hidden_size,hidden_activation 145 | ,numpy_rng 146 | ,theano_rng = None 147 | ,L1_decay = 0 148 | ,L2_decay = 0 149 | ,W = None 150 | ,b = None 151 | ,input_x = None 152 | ,target_y = None 153 | ,result_y = None 154 | ): 155 | 156 | 157 | if not theano_rng: 158 | theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) 159 | 160 | self.theano_rng = theano_rng 161 | self.input_size = input_size 162 | self.output_size = output_size 163 | self.hidden_count = hidden_count 164 | self.hiden_size = hidden_size 165 | self.hidden_activation = hidden_activation 166 | if not input_x: 167 | input_x = T.matrix(name="x",dtype=theano.config.floatX) 168 | if not target_y: 169 | target_y = T.matrix(name="target",dtype=theano.config.floatX) 170 | if not result_y: 171 | result_y = T.matrix(name="y",dtype=theano.config.floatX) 172 | 173 | self.input_x = input_x 174 | self.target_y = target_y 175 | self.result_y = result_y 176 | 177 | self.W = [] 178 | self.b = [] 179 | self.Result = [] 180 | 181 | prev_layer = input_size 182 | prev_layer = self.__create_hidden_layers(numpy_rng, batch_size, hidden_count, hidden_size, W, b, prev_layer,0) 183 | prev_layer = self.__create_layer(numpy_rng, batch_size, output_size, W, b, prev_layer, hidden_count) 184 | 185 | self.params = [] 186 | self.params.extend(self.W) 187 | self.params.extend(self.b) 188 | 189 | self.learning_rate = T.scalar(name = "learning_rate",dtype=theano.config.floatX) 190 | self.L1 = T.scalar(name = 
"L1",dtype=theano.config.floatX) 191 | self.L2 = T.scalar(name = "L2",dtype=theano.config.floatX) 192 | 193 | # create functions of deep net 194 | cost,updates = self.__get_cost_updates(target = self.target_y, learning_rate = self.learning_rate,L1_decay = L1_decay,L2_decay = L2_decay) 195 | self.train_fn = theano.function(inputs = [self.input_x,self.target_y,self.learning_rate],outputs = [cost],updates=updates) 196 | self.result_y = self.__get_run() 197 | self.run_fn = theano.function(inputs=[self.input_x],outputs=[self.result_y]) 198 | 199 | return 200 | 201 | def save_state(self,file_name,consts): 202 | i = 0; 203 | for W in self.W: 204 | save_layer(file_name,"W"+str(i),W.get_value(),consts) 205 | i=i+1 206 | i = 0 207 | for b in self.b: 208 | save_layer(file_name,"b" + str(i),b.get_value(),consts) 209 | i=i+1 210 | return 211 | 212 | def load_state(self,file_name,consts): 213 | i = 0; 214 | for W in self.W: 215 | layer = load_layer(file_name,"W"+str(i),consts) 216 | if layer is None: 217 | return False 218 | W.set_value(layer) 219 | i=i+1 220 | i = 0 221 | for b in self.b: 222 | layer = load_layer(file_name,"b" + str(i),consts) 223 | if layer is None: 224 | return False 225 | b.set_value(layer) 226 | i=i+1 227 | return True 228 | 229 | def print_state(self): 230 | i = 0; 231 | for W in self.W: 232 | print("W"+str(i)); 233 | print(W.get_value()) 234 | i=i+1 235 | i = 0 236 | for b in self.b: 237 | print("b" + str(i)) 238 | print(b.get_value()) 239 | #i = 0 240 | #for result in self.Result: 241 | # print("Result"+str(i)) 242 | # print(result.get_value()) 243 | return 244 | 245 | 246 | class AutoEncoder(object): 247 | ''' 248 | The auto encoder deep net. 249 | ''' 250 | 251 | def __create_layer(self, numpy_rng, mini_batch_size, layer_size, W, b, prev_layer, i): 252 | if not W or not W[i]: 253 | delta = numpy.sqrt(6 / (float(prev_layer) + float(layer_size))) 254 | initial_W = numpy.asarray( 255 | numpy_rng.uniform( 256 | low = -delta, 257 | high = delta, 258 | size = (prev_layer, layer_size)) 259 | ,dtype=theano.config.floatX 260 | ) 261 | self.W.append(theano.shared(value = initial_W, name = 'W' + str(i))) 262 | #print("W%d size = (%d,%d)" % (i,prev_layer, layer_size)) 263 | else: 264 | self.W.append(W[i]) 265 | if not b or not b[i]: 266 | self.b.append(theano.shared(value = numpy.zeros(layer_size, dtype=theano.config.floatX),name = 'b'+str(i))) 267 | #print("b%d size = (%d,%d)" % (i,1,layer_size)) 268 | else: 269 | self.b.append(b[i]) 270 | self.Result.append(theano.shared(value = numpy.zeros((mini_batch_size,layer_size), dtype=theano.config.floatX),name = 'Result'+str(i))) 271 | #print("Result%d size = (%d,%d)" % (i,mini_batch_size,layer_size)) 272 | return layer_size 273 | 274 | def __create_hidden_layers(self, numpy_rng, mini_batch_size, hidden_count, hidden_size, W, b, prev_layer,base_i): 275 | for i in numpy.arange(hidden_count): 276 | prev_layer = self.__create_layer(numpy_rng, mini_batch_size, hidden_size, W, b, prev_layer, base_i+i) 277 | return prev_layer 278 | 279 | def __get_corrupted_input(self, input_x, corruption_level): 280 | """This function keeps ``1-corruption_level`` entries of the inputs the 281 | same and zero-out randomly selected subset of size ``coruption_level`` 282 | Note : first argument of theano.rng.binomial is the shape(size) of 283 | random numbers that it should produce 284 | second argument is the number of trials 285 | third argument is the probability of success of any trial 286 | 287 | this will produce an array of 0s and 1s where 1 has a 288 | 
probability of 1 - ``corruption_level`` and 0 with 289 | ``corruption_level`` 290 | 291 | The binomial function return int64 data type by 292 | default. int64 multiplicated by the input 293 | type(floatX) always return float64. To keep all data 294 | in floatX when floatX is float32, we set the dtype of 295 | the binomial to floatX. As in our case the value of 296 | the binomial is always 0 or 1, this don't change the 297 | result. This is needed to allow the gpu to work 298 | correctly as it only support float32 for now. 299 | 300 | """ 301 | return self.theano_rng.binomial( 302 | size=input_x.shape, n=1, 303 | p= 1 - corruption_level, 304 | dtype=theano.config.floatX) * input_x 305 | 306 | def __get_encoded(self, input_x): 307 | """ 308 | Computes the values of the encoded layer 309 | """ 310 | data = input_x 311 | for idx in numpy.arange(self.hidden_count): 312 | self.Result[idx] = self.activation(T.dot(data, self.W[idx]) + self.b[idx]) 313 | data = self.Result[idx] 314 | self.Result[self.hidden_count] = T.tanh(T.dot(data, self.W[self.hidden_count]) + self.b[self.hidden_count])*float(0.5) 315 | return self.Result[self.hidden_count] 316 | 317 | def __get_reconstructed(self,encoded): 318 | """ 319 | Computes the values of the result layer 320 | """ 321 | data = encoded 322 | base_i = self.hidden_count+1 323 | for idx in numpy.arange(self.hidden_count): 324 | self.Result[base_i+idx] = self.activation(T.dot(data, self.W[base_i+idx]) + self.b[base_i+idx]) 325 | data = self.Result[base_i+idx] 326 | self.Result[base_i+self.hidden_count] = T.tanh(T.dot(data, self.W[base_i+self.hidden_count]) + self.b[base_i+self.hidden_count]) 327 | return self.Result[base_i+self.hidden_count] 328 | 329 | def __get_L1(self): 330 | self.L1 = 0 331 | if len(self.W)==0: 332 | return self.L2 333 | for W in self.W: 334 | self.L1 = self.L1 + T.mean(T.abs_(W)) 335 | return self.L1/len(self.W) 336 | 337 | def __get_L2(self): 338 | self.L2 = 0 339 | if len(self.W)==0: 340 | return self.L2 341 | for W in self.W: 342 | self.L2 = self.L2 + T.mean(T.sqr(W)) 343 | return self.L2/len(self.W) 344 | 345 | def __get_cost_updates(self, corruption_level, learning_rate,L1_decay,L2_decay): 346 | """ This function computes the cost and the updates for one trainng 347 | step of the dA """ 348 | 349 | tilde_x = self.__get_corrupted_input(self.input_x, corruption_level) 350 | y = self.__get_encoded(tilde_x) 351 | z = self.__get_reconstructed(y) 352 | # note : we sum over the size of a datapoint; if we are using 353 | # minibatches, L will be a vector, with one entry per 354 | # example in minibatch 355 | #L = - T.sum(self.input_x * T.log(z) + (1 - self.input_x) * T.log(1 - z), axis=1) 356 | L = T.mean(T.sqr(z-self.input_x),axis=1) 357 | # note : L is now a vector, where each element is the 358 | # cross-entropy cost of the reconstruction of the 359 | # corresponding example of the minibatch. 
We need to 360 | # compute the average of all these to get the cost of 361 | # the minibatch 362 | cost = T.mean(L) + self.__get_L2() * L2_decay + self.__get_L1() * L1_decay 363 | 364 | # compute the gradients of the cost of the `dA` with respect 365 | # to its parameters 366 | gparams = T.grad(cost, self.params) 367 | # generate the list of updates 368 | updates = [] 369 | updates.extend([ 370 | (param, param - learning_rate * gparam) 371 | for param, gparam in zip(self.params, gparams) 372 | ]) 373 | 374 | return (cost, updates) 375 | 376 | def __get_run(self): 377 | return self.__get_encoded(self.input_x) 378 | 379 | def __init__(self, 380 | mini_batch_size, 381 | input_size,encoded_size, 382 | hidden_count,hidden_size,activation, 383 | L1_decay,L2_decay, 384 | numpy_rng, 385 | theano_rng = None, 386 | W = None, 387 | b = None, 388 | input_x = None 389 | ): 390 | ''' 391 | Constructor 392 | ''' 393 | 394 | if not theano_rng: 395 | theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) 396 | 397 | self.theano_rng = theano_rng 398 | self.input_size = input_size 399 | self.encoded_size = encoded_size 400 | self.hidden_count = hidden_count 401 | self.hiden_size = hidden_size 402 | self.activation = activation 403 | if not input_x: 404 | input_x = T.matrix(name="x",dtype=theano.config.floatX) 405 | 406 | self.input_x = input_x 407 | 408 | self.W = [] 409 | self.b = [] 410 | self.Result = [] 411 | 412 | prev_layer = input_size 413 | prev_layer = self.__create_hidden_layers(numpy_rng, mini_batch_size, hidden_count, hidden_size, W, b, prev_layer,0) 414 | prev_layer = self.__create_layer(numpy_rng, mini_batch_size, encoded_size, W, b, prev_layer, hidden_count) 415 | prev_layer = self.__create_hidden_layers(numpy_rng, mini_batch_size, hidden_count, hidden_size, W, b, prev_layer,hidden_count+1) 416 | prev_layer = self.__create_layer(numpy_rng, mini_batch_size, input_size, W, b, prev_layer, 2*hidden_count+1) 417 | 418 | self.params = [] 419 | self.params.extend(self.W) 420 | self.params.extend(self.b) 421 | 422 | self.learning_rate = T.scalar(name = "learning_rate",dtype=theano.config.floatX) 423 | self.corruption_level = T.scalar(name = "learning_rate",dtype=theano.config.floatX) 424 | self.L1 = T.scalar(name = "L1",dtype=theano.config.floatX) 425 | self.L2 = T.scalar(name = "L2",dtype=theano.config.floatX) 426 | 427 | # create functions of autoencoder 428 | cost,updates = self.__get_cost_updates(corruption_level = self.corruption_level, learning_rate = self.learning_rate,L1_decay = L1_decay,L2_decay = L2_decay) 429 | self.train_fn = theano.function(inputs = [self.input_x,self.learning_rate,self.corruption_level],outputs = [cost],updates=updates) 430 | self.encoded = self.__get_run() 431 | self.get_encoded_fn = theano.function(inputs=[self.input_x],outputs=[self.encoded]) 432 | 433 | return 434 | 435 | def save_state(self,file_name,consts): 436 | i = 0; 437 | for W in self.W: 438 | save_layer(file_name,"W"+str(i),W.get_value(),consts) 439 | i=i+1 440 | i = 0 441 | for b in self.b: 442 | save_layer(file_name,"b" + str(i),b.get_value(),consts) 443 | i=i+1 444 | return 445 | 446 | def load_state(self,file_name,consts): 447 | i = 0; 448 | for W in self.W: 449 | layer = load_layer(file_name,"W"+str(i),consts) 450 | if layer is None: 451 | return False 452 | W.set_value(layer) 453 | i=i+1 454 | i = 0 455 | for b in self.b: 456 | layer = load_layer(file_name,"b" + str(i),consts) 457 | if layer is None: 458 | return False 459 | b.set_value(layer) 460 | i=i+1 461 | return True 462 | 463 | def 
print_state(self): 464 | i = 0; 465 | for W in self.W: 466 | print("W"+str(i)); 467 | print(W.get_value()) 468 | i=i+1 469 | i = 0 470 | for b in self.b: 471 | print("b" + str(i)) 472 | print(b.get_value()) 473 | #i = 0 474 | #for result in self.Result: 475 | # print("Result"+str(i)) 476 | # print(result.get_value()) 477 | return 478 | -------------------------------------------------------------------------------- /RecommenderSystem/MIDS.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on may 25, 2016 3 | 4 | @author: klizardin 5 | 6 | The MIT License (MIT) 7 | 8 | Copyright (c) 2016 klizardin 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is furnished 15 | to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 22 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 23 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 24 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 26 | 27 | 28 | ''' 29 | from __future__ import print_function 30 | 31 | import sys 32 | import os 33 | import time 34 | import numpy 35 | import pandas 36 | import csv 37 | 38 | from Nets import ApproxNet 39 | from Nets import AutoEncoder 40 | from Consts import Consts 41 | 42 | import theano 43 | import theano.tensor as T 44 | from theano.tensor.shared_randomstreams import RandomStreams 45 | 46 | 47 | def save_ids(ids,name): 48 | consts = Consts() 49 | name = os.path.join(consts.result_path,name) 50 | result = open(name,"wt") 51 | for row in numpy.arange(ids.shape[0]): 52 | for j in numpy.arange(ids.shape[1]): 53 | result.write("%f\t" % (ids[row,j],)) 54 | result.write("\n") 55 | return 56 | 57 | class IDsUpdater(object): 58 | ''' 59 | The class to update ids 60 | ''' 61 | 62 | def __init__(self,ids_count,avg_rate,normilized_vs_avg_rate,max_distance,dist_sqrt_coef,avg_dx_item_small_weight,min_max_compresion_rate,name): 63 | self.ids_count = ids_count 64 | self.avg_rate = avg_rate 65 | self.normilized_vs_avg_rate = normilized_vs_avg_rate 66 | self.name = name 67 | self.max_distance = max_distance 68 | self.dist_sqrt_coef = dist_sqrt_coef 69 | self.avg_dx_item_small_weight = avg_dx_item_small_weight 70 | self.min_max_compresion_rate = min_max_compresion_rate 71 | return 72 | 73 | def __distance(self,ids_values,i0,i1): 74 | if i0==i1: 75 | return self.max_distance 76 | return numpy.sum(numpy.square(ids_values[i0,]-ids_values[i1,])) 77 | 78 | def __calc_new_ids(self,ids_values,ids_new_values,avg_rate,mask,p,item_weight): 79 | max_values = numpy.zeros(shape=ids_new_values.shape[1],dtype=theano.config.floatX) 80 | min_values = numpy.zeros(shape=ids_new_values.shape[1],dtype=theano.config.floatX) 81 | v1 = 
numpy.zeros(shape=(2,ids_new_values.shape[1]),dtype=theano.config.floatX) 82 | first = True 83 | dx_cnt = 0 84 | for i in numpy.arange(ids_values.shape[0]): 85 | if mask[i]==0: 86 | continue 87 | if first: 88 | max_values = ids_new_values[i,] 89 | min_values = ids_new_values[i,] 90 | else: 91 | v1[0,] = ids_new_values[i,] 92 | v1[1,] = max_values 93 | max_values = numpy.max(a = v1,axis = 0) 94 | v1[1,] = min_values 95 | min_values = numpy.min(a = v1,axis = 0) 96 | first = False 97 | dx_cnt = dx_cnt + 1 98 | 99 | ids_dx = numpy.zeros(shape=ids_values.shape,dtype=theano.config.floatX) 100 | 101 | delta_values = max_values - min_values 102 | for i in numpy.arange(ids_values.shape[0]): 103 | if mask[i]==0: 104 | ids_dx[i,] = 0 105 | ids_new_values[i,] = ids_values[i,] 106 | else: 107 | ids_new_values[i,] = ((ids_new_values[i,]-min_values)/delta_values - 0.5) * self.min_max_compresion_rate 108 | ids_dx[i,] = ids_new_values[i,] - ids_values[i,] 109 | 110 | #save_ids(ids_new_values,"items_new_ids.dta_001_new1.txt") 111 | 112 | avg_dx = numpy.zeros(shape=ids_values.shape,dtype=theano.config.floatX) 113 | avg_cnt = int(dx_cnt*avg_rate) 114 | lt = time.time() 115 | for i in numpy.arange(ids_values.shape[0]): 116 | if mask[i]==0: 117 | avg_dx[i,] = 0 118 | continue 119 | dist = numpy.zeros(shape=(ids_values.shape[0]),dtype=theano.config.floatX) 120 | for j in numpy.arange(ids_values.shape[0]): 121 | if mask[j]==0: 122 | dist[j] = self.max_distance 123 | continue 124 | dist[j] = self.__distance(ids_values,i,j) 125 | sorted_indices = numpy.argsort(dist) 126 | 127 | # sum nearest 128 | sumw = 1.0e-8 129 | for j in numpy.arange(avg_cnt): 130 | w = numpy.exp(-dist[sorted_indices[j]]/self.dist_sqrt_coef) 131 | sumw = sumw + w 132 | avg_dx[i,] = avg_dx[i,] + w*ids_dx[sorted_indices[j],] 133 | # add to sum dx with small weight 134 | w = item_weight 135 | sumw = sumw + w 136 | avg_dx[i,] = avg_dx[i,] + w*ids_dx[i,] 137 | 138 | avg_dx[i,] = avg_dx[i,] / sumw 139 | t1 = time.time() 140 | if t1>lt+1: 141 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 142 | sys.stdout.write("calc_%s_avg_dx %f %%\r" % (self.name,float(i)/float(ids_values.shape[0])*100)) 143 | lt = lt+1 144 | 145 | for i in numpy.arange(ids_values.shape[0]): 146 | if mask[i]==0: 147 | ids_new_values[i,] = ids_values[i,] 148 | continue 149 | new_smooth_move = avg_dx[i,] 150 | new_rearrange = ids_dx[i,] - avg_dx[i,] 151 | ids_new_values[i,] = ids_values[i,] + (new_smooth_move*(1.0-p) + new_rearrange*p) 152 | mv = max(numpy.max(ids_new_values[i,]),abs(numpy.min(ids_new_values[i,]))) 153 | if mv>0.5: 154 | ids_new_values[i,] = ids_new_values[i,] * 0.5/mv 155 | 156 | #save_ids(ids_values,"items_new_ids.dta_001_old.txt") 157 | #save_ids(ids_new_values,"items_new_ids.dta_001_new.txt") 158 | #save_ids(avg_dx,"items_new_ids.dta_001_avg.txt") 159 | 160 | return ids_new_values 161 | 162 | def calc_new_ids(self,ids_values,ids_new_values,mask): 163 | return self.__calc_new_ids( 164 | ids_values = ids_values 165 | ,ids_new_values = ids_new_values 166 | ,avg_rate = self.avg_rate 167 | ,mask = mask 168 | ,p = self.normilized_vs_avg_rate 169 | ,item_weight =self.avg_dx_item_small_weight 170 | ) 171 | 172 | class MatrixIDs(object): 173 | ''' 174 | The class to get IDs by the sparse matrix of the data (the both axes data and rates data in the matrix). 
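A rough usage sketch (an illustration only -- the data arrays are the
ones produced by the PrepareData step, and the learning-rate and
corruption values below are placeholders, not the tuned Consts values):

    rng = numpy.random.RandomState()
    theano_rng = RandomStreams(rng.randint(2 ** 30))
    matrix_ids = MatrixIDs(
        usersids_data = usersids, itemsids_data = moviesids
        ,ratings_by_user = ratings_by_user
        ,ratings_by_user_ids = ratings_by_user_ids
        ,ratings_by_user_idx = ratings_by_user_idx
        ,ratings_by_item = ratings_by_movie
        ,ratings_by_item_ids = ratings_by_movie_ids
        ,ratings_by_item_idx = ratings_by_movie_idx
        ,users_cnt = users_cnt, items_cnt = items_cnt
        ,rng = rng, theano_rng = theano_rng)
    # one pass: train both autoencoders, then move a slice of the ids
    loss_items_to_user, loss_users_to_item = matrix_ids.train_encoders(
        learning_rate = 0.01, corruption_level = 0.3, consts = Consts())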
175 | ''' 176 | 177 | def __init__(self 178 | ,usersids_data,itemsids_data 179 | ,ratings_by_user,ratings_by_user_ids,ratings_by_user_idx 180 | ,ratings_by_item,ratings_by_item_ids,ratings_by_item_idx 181 | ,users_cnt 182 | ,items_cnt 183 | ,rng 184 | ,theano_rng 185 | ,consts = Consts() 186 | ,users_ids = None 187 | ,items_ids = None 188 | ): 189 | ''' 190 | Constructor 191 | ''' 192 | 193 | self.usersids_data = usersids_data 194 | self.itemsids_data = itemsids_data 195 | self.ratings_by_user = ratings_by_user 196 | self.ratings_by_user_ids = ratings_by_user_ids 197 | self.ratings_by_user_idx = ratings_by_user_idx 198 | self.ratings_by_item = ratings_by_item 199 | self.ratings_by_item_ids = ratings_by_item_ids 200 | self.ratings_by_item_idx = ratings_by_item_idx 201 | self.rng = rng 202 | 203 | self.user_indice = 0 204 | self.movie_indice = 1 205 | 206 | self.item_id_size = consts.item_id_size 207 | self.user_id_size = consts.user_id_size 208 | self.new_user_cycles = consts.new_user_cycles 209 | self.new_item_cycles = consts.new_item_cycles 210 | #self.users_ids_move_elem_count_rate = consts.users_ids_move_elem_count_rate 211 | #self.items_ids_move_elem_count_rate = consts.items_ids_move_elem_count_rate 212 | self.items_count = items_cnt 213 | self.users_count = users_cnt 214 | if not items_ids: 215 | items_ids = rng.uniform(low = -0.5,high = 0.5, size = (self.items_count,self.item_id_size)).astype(theano.config.floatX) 216 | if not users_ids: 217 | users_ids = rng.uniform(low = -0.5,high = 0.5, size = (self.users_count,self.user_id_size)).astype(theano.config.floatX) 218 | self.items_ids = items_ids 219 | self.items_ids_base = self.items_ids.copy() 220 | self.new_items_ids = self.items_ids.copy() 221 | self.users_ids = users_ids 222 | self.users_ids_base = self.users_ids.copy() 223 | self.new_users_ids = self.users_ids.copy() 224 | 225 | 226 | self.mini_batch_size = consts.encoder_batch_size 227 | self.encoder_elements_count = consts.encode_elements_count 228 | 229 | self.items_ids_to_user_id_autoencoder = AutoEncoder( 230 | mini_batch_size = self.mini_batch_size 231 | ,input_size = self.encoder_elements_count*(self.itemsids_data.shape[1]+self.item_id_size+ratings_by_user.shape[1]) 232 | ,encoded_size = self.user_id_size 233 | ,hidden_count = consts.encoder_hidden_layers_count,hidden_size = consts.encoder_hidden_layer_size,activation = consts.encoder_hidden_layers_activation 234 | ,L1_decay = consts.encoder_L1_decay,L2_decay = consts.encoder_L2_decay 235 | ,numpy_rng = rng 236 | ,theano_rng = theano_rng 237 | ) #+usersids_data.shape[1] 238 | self.users_ids_to_item_id_autoencoder = AutoEncoder( 239 | mini_batch_size = self.mini_batch_size 240 | ,input_size = self.encoder_elements_count*(self.usersids_data.shape[1]+self.user_id_size+ratings_by_item.shape[1]) 241 | ,encoded_size = self.item_id_size 242 | ,hidden_count = consts.encoder_hidden_layers_count,hidden_size = consts.encoder_hidden_layer_size,activation = consts.encoder_hidden_layers_activation 243 | ,L1_decay = consts.encoder_L1_decay,L2_decay = consts.encoder_L2_decay 244 | ,numpy_rng = rng 245 | ,theano_rng = theano_rng 246 | ) #+itemsids_data.shape[1] 247 | 248 | self.loss_users_to_item = float(0.0) 249 | self.loss_items_to_user = float(0.0) 250 | self.loss_k = consts.encoder_loss_k 251 | 252 | self.users_ids_updater = IDsUpdater( 253 | ids_count = self.users_count 254 | ,avg_rate = consts.users_ids_avg_rate 255 | ,normilized_vs_avg_rate = consts.ids_update_users_normilized_vs_avg_rate 256 | ,max_distance= 
consts.user_max_distance 257 | ,dist_sqrt_coef = consts.dist_sqrt_coef 258 | ,avg_dx_item_small_weight = consts.avg_dx_item_small_weight 259 | ,min_max_compresion_rate = consts.min_max_compresion_rate 260 | ,name = "users" 261 | ) 262 | self.items_ids_updater = IDsUpdater( 263 | ids_count = self.items_count 264 | ,avg_rate = consts.items_ids_avg_rate 265 | ,normilized_vs_avg_rate = consts.ids_update_items_normilized_vs_avg_rate 266 | ,max_distance= consts.item_max_distance 267 | ,dist_sqrt_coef = consts.dist_sqrt_coef 268 | ,avg_dx_item_small_weight = consts.avg_dx_item_small_weight 269 | ,min_max_compresion_rate = consts.min_max_compresion_rate 270 | ,name = "items" 271 | ) 272 | self.index = 0 273 | #self.ids_move_count = consts.ids_move_count 274 | #self.ids_move_count_coef = consts.ids_move_count_coef 275 | #self.ids_move_count = consts.ids_move_count 276 | #self.ids_move_count_coef = consts.ids_move_count_coef 277 | 278 | self.itemsids_mini_batch_size = consts.itemids_batch_size 279 | self.user_ids_rate_to_item_ids_net = ApproxNet( 280 | batch_size = self.itemsids_mini_batch_size 281 | ,input_size = self.usersids_data.shape[1]+self.user_id_size+ratings_by_user.shape[1] 282 | ,output_size = self.item_id_size 283 | ,hidden_count = consts.itemids_hidden_layers_count 284 | ,hidden_size = consts.itemids_hidden_layer_size 285 | ,hidden_activation = consts.itemids_hidden_layers_activation 286 | ,L1_decay = consts.itemids_L1_decay 287 | ,L2_decay = consts.itemids_L2_decay 288 | ,numpy_rng = rng 289 | ,theano_rng = theano_rng 290 | ) 291 | self.itemids_loss_k = consts.itemids_loss_k 292 | self.loss_itemids = float(0.0) 293 | 294 | return 295 | 296 | def update_user_ids(self,consts): 297 | i0 = int(float(self.index)/float(consts.ids_move_count)*self.users_ids.shape[0]) 298 | i1 = int(float(self.index+1)/float(consts.ids_move_count)*self.users_ids.shape[0]) 299 | if i0==i1: 300 | i1 = i0+1 301 | i0 = min(i0,self.users_ids.shape[0]-1) 302 | i1 = min(i1,self.users_ids.shape[0]-1) 303 | self.users_ids[i0:i1,] = self.new_users_ids[i0:i1,] 304 | return 305 | 306 | def update_item_ids(self,consts): 307 | i0 = int(float(self.index)/float(consts.ids_move_count)*self.items_ids.shape[0]) 308 | i1 = int(float(self.index+1)/float(consts.ids_move_count)*self.items_ids.shape[0]) 309 | if i0==i1: 310 | i1 = i0+1 311 | i0 = min(i0,self.items_ids.shape[0]-1) 312 | i1 = min(i1,self.items_ids.shape[0]-1) 313 | self.items_ids[i0:i1,] = self.new_items_ids[i0:i1,] 314 | return 315 | 316 | def train_encoders(self,learning_rate,corruption_level,consts): 317 | encoder_size = self.encoder_elements_count*(self.itemsids_data.shape[1]+self.item_id_size+self.ratings_by_user.shape[1]) #+self.usersids_data.shape[1] 318 | x_value = numpy.zeros((self.mini_batch_size,encoder_size), dtype=theano.config.floatX) 319 | for j in numpy.arange(self.mini_batch_size): 320 | user_idx1 = self.rng.randint(low=0 ,high = len(self.ratings_by_user_idx)) 321 | curr_user_idx_index = curr_user_idx_index = self.ratings_by_user_idx[user_idx1,0] 322 | values = [] 323 | user_id = self.ratings_by_item_ids[curr_user_idx_index,self.user_indice] - 1 324 | for k in numpy.arange(self.encoder_elements_count): 325 | item_idx1 = self.rng.randint(low = 0,high = self.ratings_by_user_idx[user_idx1,1]-curr_user_idx_index) 326 | rating_by_user_offs = curr_user_idx_index+item_idx1 327 | #user_id = self.ratings_by_user_ids[rating_by_user_offs,0] - 1 328 | item_id = self.ratings_by_user_ids[rating_by_user_offs,self.movie_indice] - 1 329 | 
values.append((self.items_ids[item_id,],self.itemsids_data[item_id,],self.ratings_by_user[rating_by_user_offs,])) 330 | values = sorted(values,key = lambda x : x[2][0]) 331 | for k in numpy.arange(self.encoder_elements_count): 332 | i0 = (self.itemsids_data.shape[1]+self.item_id_size+self.ratings_by_user.shape[1])*k # 333 | i1 = i0 + self.item_id_size 334 | x_value[j,i0:i1] = values[k][0] 335 | i0 = i1; 336 | i1 = i1 + self.itemsids_data.shape[1] 337 | x_value[j,i0:i1] = values[k][1] 338 | i0 = i1 339 | i1 = i1 + self.ratings_by_item.shape[1] 340 | x_value[j,i0:i1] = values[k][2][0] 341 | #i0 = (self.item_id_size+self.ratings_by_user.shape[1])*self.encoder_elements_count #self.itemsids_data.shape[1]+ 342 | #i1 = i0 + self.usersids_data.shape[1] 343 | #x_value[j,i0:i1] = self.usersids_data[user_id,] 344 | 345 | loss = self.items_ids_to_user_id_autoencoder.train_fn(x_value,learning_rate,corruption_level) 346 | if loss[0]>=0: 347 | loss = numpy.sqrt(loss[0]) 348 | else: 349 | loss = 0 350 | if self.loss_items_to_user==0: 351 | self.loss_items_to_user = loss 352 | else: 353 | self.loss_items_to_user += (loss - self.loss_items_to_user)*self.loss_k 354 | 355 | encoder_size = self.encoder_elements_count*(self.usersids_data.shape[1]+self.user_id_size+self.ratings_by_item.shape[1]) #+self.itemsids_data.shape[1] 356 | x_value = numpy.zeros((self.mini_batch_size,encoder_size), dtype=theano.config.floatX) 357 | for j in numpy.arange(self.mini_batch_size): 358 | item_idx1 = self.rng.randint(low = 0,high = len(self.ratings_by_item_idx)) 359 | curr_item_idx_index = self.ratings_by_item_idx[item_idx1,0] 360 | values = [] 361 | item_id = self.ratings_by_item_ids[curr_item_idx_index,self.movie_indice] - 1 362 | for k in numpy.arange(self.encoder_elements_count): 363 | user_idx1 = self.rng.randint(low = 0,high = self.ratings_by_item_idx[item_idx1,1]-curr_item_idx_index) 364 | rating_by_item_offs = curr_item_idx_index+user_idx1 365 | user_id = self.ratings_by_item_ids[rating_by_item_offs,self.user_indice] - 1 366 | #item_id = self.ratings_by_item_ids[rating_by_item_offs,1] - 1 367 | values.append((self.users_ids[user_id,],self.usersids_data[user_id,],self.ratings_by_item[rating_by_item_offs,])) 368 | values = sorted(values,key = lambda x : x[2][0]) 369 | for k in numpy.arange(self.encoder_elements_count): 370 | i0 = (self.usersids_data.shape[1]+self.user_id_size+self.ratings_by_item.shape[1])*k # 371 | i1 = i0 + self.user_id_size 372 | x_value[j,i0:i1] = values[k][0] 373 | i0 = i1; 374 | i1 = i1 + self.usersids_data.shape[1] 375 | x_value[j,i0:i1] = values[k][1] 376 | i0 = i1 377 | i1 = i1 + self.ratings_by_user.shape[1] 378 | x_value[j,i0:i1] = values[k][2][0] 379 | #i0 = (self.user_id_size+self.ratings_by_user.shape[1])*self.encoder_elements_count #self.usersids_data.shape[1]+ 380 | #i1 = i0 + self.itemsids_data.shape[1] 381 | #x_value[j,i0:i1] = self.itemsids_data[item_id,] 382 | 383 | loss = self.users_ids_to_item_id_autoencoder.train_fn(x_value,learning_rate,corruption_level) 384 | if loss[0]>=0: 385 | loss = numpy.sqrt(loss[0]) 386 | else: 387 | loss = 0 388 | if self.loss_users_to_item==0: 389 | self.loss_users_to_item = loss 390 | else: 391 | self.loss_users_to_item += (loss - self.loss_users_to_item)*self.loss_k 392 | 393 | 394 | # update ids 395 | self.update_user_ids(consts) 396 | self.update_item_ids(consts) 397 | 398 | self.index = self.index + 1 399 | 400 | return self.loss_items_to_user,self.loss_users_to_item 401 | 402 | def train_itemids(self,learning_rate,consts): 403 | input_size = 
self.usersids_data.shape[1]+self.user_id_size+self.ratings_by_user.shape[1] 404 | output_size = self.item_id_size 405 | x_value = numpy.zeros((self.itemsids_mini_batch_size,input_size),dtype=theano.config.floatX) 406 | y_target = numpy.zeros((self.itemsids_mini_batch_size,output_size),dtype=theano.config.floatX) 407 | for bi in numpy.arange(self.itemsids_mini_batch_size): 408 | user_idx1 = self.rng.randint(low=0 ,high = len(self.ratings_by_user_idx)) 409 | curr_user_idx_index = self.ratings_by_user_idx[user_idx1,0] 410 | rating_by_user_offs = self.rng.randint(low = curr_user_idx_index,high = self.ratings_by_user_idx[user_idx1,1]) 411 | #user_id = self.ratings_by_user_ids[rating_by_user_offs,0] - 1 412 | user_id = self.ratings_by_item_ids[rating_by_user_offs,self.user_indice] - 1 413 | item_id = self.ratings_by_user_ids[rating_by_user_offs,self.movie_indice] - 1 414 | 415 | i0 = 0 416 | i1 = i0 + self.usersids_data.shape[1] 417 | x_value[bi,i0:i1] = self.usersids_data[user_id,:] 418 | i0 = i1 419 | i1 = i0 + self.user_id_size 420 | x_value[bi,i0:i1] = self.users_ids[user_id,:] 421 | i0 = i1 422 | i1 = i0 + self.ratings_by_user.shape[1] 423 | x_value[bi,i0:i1] = self.ratings_by_user[rating_by_user_offs,:] 424 | y_target[bi,:] = self.items_ids[item_id,:] 425 | pass 426 | 427 | loss = self.user_ids_rate_to_item_ids_net.train_fn(x_value,y_target,learning_rate) 428 | if loss[0]>=0: 429 | loss = numpy.sqrt(loss[0]) 430 | else: 431 | loss = float(0) 432 | if self.loss_itemids==0: 433 | self.loss_itemids = loss 434 | else: 435 | self.loss_itemids += (loss - self.loss_itemids)*self.itemids_loss_k 436 | return self.loss_itemids 437 | 438 | def __get_user_ids(self,user_idxes): 439 | encoder_size = self.encoder_elements_count*(self.itemsids_data.shape[1]+self.item_id_size+self.ratings_by_user.shape[1]) #+self.usersids_data.shape[1] 440 | x_value = numpy.zeros((self.mini_batch_size,encoder_size), dtype=theano.config.floatX) 441 | for j in numpy.arange(self.mini_batch_size): 442 | user_idx1 = 0 443 | if j=updates_elements_cnt: 523 | i1 = updates_elements_cnt-1 524 | #user_ids = self.ratings_by_user_ids[self.ratings_by_user_idx[user_ids_ind[i0:i1],0],0] 525 | if i0==i1: 526 | continue 527 | encoded = self.__get_user_ids(user_ids_ind[i0:i1]) 528 | for i in numpy.arange(self.mini_batch_size): 529 | if i0+i>=updates_elements_cnt: 530 | continue 531 | user_id = self.ratings_by_user_ids[self.ratings_by_user_idx[user_ids_ind[i0+i],0],self.user_indice] - 1 532 | if cycle==0: 533 | self.new_users_ids[user_id,] = encoded[i,] 534 | else: 535 | self.new_users_ids[user_id,] = self.new_users_ids[user_id,] + encoded[i,] 536 | t1 = time.time() 537 | if t1>lt+1: 538 | rate = float(i00+cycle*updates_elements_cnt/self.mini_batch_size)/float(updates_elements_cnt/self.mini_batch_size*self.new_user_cycles) 539 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 540 | sys.stdout.write("get_new_user_ids %f %%\r" % (rate*100)) 541 | lt = lt+1 542 | 543 | user_ids = self.ratings_by_user_ids[self.ratings_by_user_idx[user_ids_ind[0:updates_elements_cnt],0],self.user_indice] - 1 544 | 545 | user_id_max = users_cnt 546 | self.users_ids_mask = numpy.zeros(shape=user_id_max, dtype=numpy.int8) 547 | self.users_ids_mask[user_ids] = 1 548 | self.new_users_ids[user_ids,] = self.new_users_ids[user_ids,] / float(self.new_user_cycles) 549 | 550 | #for i in numpy.arange(updates_elements_cnt): 551 | # user_id = user_ids_ind[i] 552 | # self.new_users_ids[user_id,] = self.new_users_ids[user_id,] / float(self.new_user_cycles) 553 | 554 | self.new_users_ids 
= self.users_ids_updater.calc_new_ids( 555 | ids_values = self.users_ids_base 556 | ,ids_new_values = self.new_users_ids 557 | ,mask = self.users_ids_mask 558 | ) 559 | 560 | updates_elements_cnt_lo = int(updates_elements_cnt*consts.users_ids_move_elem_count_rate1) 561 | user_ids = self.ratings_by_user_ids[self.ratings_by_user_idx[user_ids_ind[updates_elements_cnt_lo:updates_elements_cnt],0],self.user_indice] - 1 562 | self.new_users_ids[user_ids,] = self.users_ids_base[user_ids,] 563 | 564 | p=consts.ids_move_count_coef 565 | self.new_users_ids = (1.0 - p)*self.users_ids_base + p*self.new_users_ids 566 | self.index = 0 567 | return 568 | 569 | def get_new_item_ids(self,movies_cnt,consts): 570 | lt = time.time() 571 | self.items_ids_base = self.items_ids.copy() 572 | self.new_items_ids = self.items_ids.copy() 573 | item_ids_ind = numpy.arange(len(self.ratings_by_item_idx)) 574 | numpy.random.shuffle(item_ids_ind) 575 | updates_elements_cnt = int(len(item_ids_ind)*consts.items_ids_move_elem_count_rate) 576 | updates_elements_cnt = int(((updates_elements_cnt/self.mini_batch_size)+1)*self.mini_batch_size) 577 | for cycle in numpy.arange(self.new_item_cycles): 578 | for i00 in numpy.arange(updates_elements_cnt/self.mini_batch_size): 579 | i0 = int(i00*self.mini_batch_size) 580 | i1 = int((i00+1)*self.mini_batch_size) 581 | if i1>=updates_elements_cnt: 582 | i1 = updates_elements_cnt-1 583 | #item_ids = self.ratings_by_item_ids[self.ratings_by_item_idx[item_ids_ind[i0:i1],0],1] 584 | if i0==i1: 585 | continue 586 | encoded = self.__get_item_ids(item_ids_ind[i0:i1]) 587 | for i in numpy.arange(self.mini_batch_size): 588 | if i0+i>=updates_elements_cnt: 589 | continue 590 | item_id = self.ratings_by_item_ids[self.ratings_by_item_idx[item_ids_ind[i0+i],0],self.movie_indice] - 1 591 | if cycle==0: 592 | self.new_items_ids[item_id,] = encoded[i,] 593 | else: 594 | self.new_items_ids[item_id,] = self.new_items_ids[item_id,] + encoded[i,] 595 | t1 = time.time() 596 | if t1>lt+1: 597 | rate = float(i00+cycle*updates_elements_cnt/self.mini_batch_size)/float(updates_elements_cnt/self.mini_batch_size*self.new_item_cycles) 598 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 599 | sys.stdout.write("get_new_item_ids %f %%\r" % (rate*100)) 600 | lt = lt+1 601 | 602 | item_ids = self.ratings_by_item_ids[self.ratings_by_item_idx[item_ids_ind[0:updates_elements_cnt],0],self.movie_indice] - 1 603 | 604 | item_id_max = movies_cnt 605 | self.items_ids_mask = numpy.zeros(shape=item_id_max, dtype=numpy.int8) 606 | self.items_ids_mask[item_ids] = 1 607 | self.new_items_ids[item_ids,] = self.new_items_ids[item_ids,] / float(self.new_item_cycles) 608 | 609 | self.new_items_ids = self.items_ids_updater.calc_new_ids( 610 | ids_values = self.items_ids_base 611 | ,ids_new_values = self.new_items_ids 612 | ,mask = self.items_ids_mask 613 | ) 614 | 615 | updates_elements_cnt_lo = int(updates_elements_cnt*consts.items_ids_move_elem_count_rate1) 616 | item_ids = self.ratings_by_item_ids[self.ratings_by_item_idx[item_ids_ind[updates_elements_cnt_lo:updates_elements_cnt],0],self.movie_indice] - 1 617 | self.new_items_ids[item_ids,] = self.items_ids_base[item_ids,] 618 | 619 | p=consts.ids_move_count_coef 620 | self.new_items_ids = (1.0 - p)*self.items_ids_base + p*self.new_items_ids 621 | self.index = 0 622 | return 623 | 624 | def save(self,index,consts): 625 | numpy.save(file = consts.get_file_name_by_index(index,consts.users_ids_file_name), arr = self.users_ids) 626 | numpy.save(file = 
consts.get_file_name_by_index(index,consts.items_ids_file_name), arr = self.items_ids) 627 | self.users_ids_to_item_id_autoencoder.save_state( 628 | file_name = consts.get_file_name_by_index(index,consts.users_ids_to_item_id_autoencoder_file_name), 629 | consts = consts) 630 | self.items_ids_to_user_id_autoencoder.save_state( 631 | file_name = consts.get_file_name_by_index(index,consts.items_ids_to_user_id_autoencoder_file_name) 632 | ,consts = consts 633 | ) 634 | self.user_ids_rate_to_item_ids_net.save_state( 635 | file_name = consts.get_file_name_by_index(index,consts.user_ids_rate_to_item_ids_net_file_name) 636 | ,consts = consts 637 | ) 638 | return 639 | 640 | def load(self,index,consts): 641 | if not self.users_ids_to_item_id_autoencoder.load_state( 642 | file_name=consts.get_file_name_by_index(index,consts.users_ids_to_item_id_autoencoder_file_name) 643 | ,consts=consts): 644 | return False 645 | if not self.items_ids_to_user_id_autoencoder.load_state( 646 | file_name=consts.get_file_name_by_index(index,consts.items_ids_to_user_id_autoencoder_file_name) 647 | , consts=consts): 648 | return False 649 | if not self.user_ids_rate_to_item_ids_net.load_state( 650 | file_name=consts.get_file_name_by_index(index,consts.user_ids_rate_to_item_ids_net_file_name) 651 | ,consts = consts): 652 | return False 653 | file_name = consts.get_file_name_by_index(index,consts.users_ids_file_name) 654 | if not os.path.isfile(path = file_name): 655 | return False 656 | data = numpy.load(file = file_name) 657 | self.users_ids = numpy.asarray(a = data,dtype=theano.config.floatX) 658 | file_name = consts.get_file_name_by_index(index,consts.items_ids_file_name) 659 | if not os.path.isfile(path = file_name): 660 | return False 661 | data = numpy.load(file = file_name) 662 | self.items_ids = numpy.asarray(a = data,dtype=theano.config.floatX) 663 | self.items_ids_base = self.items_ids.copy() 664 | self.new_items_ids = self.items_ids.copy() 665 | self.users_ids_base = self.users_ids.copy() 666 | self.new_users_ids = self.users_ids.copy() 667 | return True 668 | 669 | class RatesApprox(object): 670 | ''' class for rates approximation''' 671 | 672 | def __init__(self 673 | ,usersids_data,itemsids_data 674 | ,ratings_by_user,ratings_by_user_ids,ratings_by_user_idx 675 | ,rng 676 | ,theano_rng 677 | ,matrix_ids 678 | ,consts = Consts() 679 | ): 680 | 681 | self.mini_batch_size = consts.result_batch_size 682 | self.usersids_data = usersids_data 683 | self.itemsids_data = itemsids_data 684 | self.ratings_by_user = ratings_by_user 685 | self.ratings_by_user_ids = ratings_by_user_ids 686 | self.ratings_by_user_idx = ratings_by_user_idx 687 | self.rng = rng 688 | 689 | self.user_indice = 0 690 | self.movie_indice = 1 691 | 692 | self.item_id_size = consts.item_id_size 693 | self.user_id_size = consts.user_id_size 694 | self.users_count = self.ratings_by_user_ids[self.ratings_by_user_idx[len(self.ratings_by_user_idx)-1,0],self.user_indice] 695 | self.matrix_ids = matrix_ids 696 | 697 | self.net = ApproxNet( 698 | batch_size = self.mini_batch_size 699 | ,input_size = self.item_id_size + self.user_id_size + self.itemsids_data.shape[1]+self.usersids_data.shape[1]+self.ratings_by_user.shape[1] - 1 700 | ,output_size = 1 701 | ,hidden_count = consts.result_hidden_layers_count 702 | ,hidden_size = consts.result_hidden_layer_size 703 | ,hidden_activation = consts.result_hidden_layers_activation 704 | ,numpy_rng = rng 705 | ,theano_rng = theano_rng 706 | ,L1_decay = consts.result_L1_decay 707 | ,L2_decay = consts.result_L2_decay 
708 | ) 709 | 710 | self.all_rates_indices = numpy.arange(self.ratings_by_user_ids.shape[0]) 711 | numpy.random.shuffle(self.all_rates_indices) 712 | self.train_size_rate = consts.train_rate 713 | self.loss = float(0.0) 714 | self.loss_k = consts.result_loss_k 715 | return 716 | 717 | def train(self,learning_rate): 718 | x_size = self.item_id_size + self.user_id_size + self.itemsids_data.shape[1]+self.usersids_data.shape[1]+self.ratings_by_user.shape[1] - 1 719 | x_value = numpy.zeros((self.mini_batch_size,x_size), dtype=theano.config.floatX) 720 | y_size = 1 721 | y_value = numpy.zeros((self.mini_batch_size,y_size), dtype=theano.config.floatX) 722 | max_train_indice = int(self.all_rates_indices.shape[0]*self.train_size_rate) 723 | for bi in numpy.arange(self.mini_batch_size): 724 | train_indice = self.rng.randint(low=0 ,high = max_train_indice) 725 | user_id = self.ratings_by_user_ids[self.all_rates_indices[train_indice],self.user_indice] - 1 726 | item_id = self.ratings_by_user_ids[self.all_rates_indices[train_indice],self.movie_indice] - 1 727 | i0 = 0 728 | i1 = self.item_id_size 729 | x_value[bi,i0:i1] = self.matrix_ids.items_ids[item_id,] 730 | i0 = i1 731 | i1 = i0 + self.user_id_size 732 | x_value[bi,i0:i1] = self.matrix_ids.users_ids[user_id,] 733 | i0 = i1 734 | i1 = i0 + self.itemsids_data.shape[1] 735 | x_value[bi,i0:i1] = self.itemsids_data[item_id,] 736 | i0 = i1 737 | i1 = i0 + self.usersids_data.shape[1] 738 | x_value[bi,i0:i1] = self.usersids_data[user_id,] 739 | i0 = i1 740 | i1 = i0 + self.ratings_by_user.shape[1] - 1 741 | x_value[bi,i0:i1] = self.ratings_by_user[self.all_rates_indices[train_indice],1:] 742 | y_value[bi,0] = self.ratings_by_user[self.all_rates_indices[train_indice],0] 743 | 744 | loss = self.net.train_fn(x_value,y_value,learning_rate) 745 | if loss[0]>=0: 746 | loss = numpy.sqrt(loss[0]) 747 | else: 748 | loss = 0 749 | if self.loss==0: 750 | self.loss = loss 751 | else: 752 | self.loss += (loss - self.loss)*self.loss_k 753 | return self.loss 754 | 755 | def get_rates(self,userids_itemids,ratesinfo): 756 | rates = numpy.zeros(userids_itemids.shape[0],dtype=theano.config.floatX) 757 | x_size = self.item_id_size + self.user_id_size + self.itemsids_data.shape[1]+self.usersids_data.shape[1]+self.ratings_by_user.shape[1] - 1 758 | x_value = numpy.zeros((self.mini_batch_size,x_size), dtype=theano.config.floatX) 759 | cnt = int((userids_itemids.shape[0]/self.mini_batch_size+1)*self.mini_batch_size) 760 | for i in numpy.arange(cnt/self.mini_batch_size): 761 | for bi in numpy.arange(self.mini_batch_size): 762 | idx = int(i*self.mini_batch_size + bi) 763 | if idx>=userids_itemids.shape[0]: 764 | idx = userids_itemids.shape[0]-1 765 | user_id = int(userids_itemids[idx,self.user_indice]) 766 | item_id = int(userids_itemids[idx,self.movie_indice]) 767 | i0 = int(0) 768 | i1 = int(self.item_id_size) 769 | x_value[bi,i0:i1] = self.matrix_ids.items_ids[item_id,] 770 | i0 = i1 771 | i1 = i0 + int(self.user_id_size) 772 | x_value[bi,i0:i1] = self.matrix_ids.users_ids[user_id,] 773 | i0 = i1 774 | i1 = i0 + self.itemsids_data.shape[1] 775 | x_value[bi,i0:i1] = self.itemsids_data[item_id,] 776 | i0 = i1 777 | i1 = i0 + self.usersids_data.shape[1] 778 | x_value[bi,i0:i1] = self.usersids_data[user_id,] 779 | i0 = i1 780 | i1 = i0 + self.ratings_by_user.shape[1] - 1 781 | x_value[bi,i0:i1] = ratesinfo[idx,:] 782 | y_result = self.net.run_fn(x_value) 783 | for bi in numpy.arange(self.mini_batch_size): 784 | idx = int(i*self.mini_batch_size + bi) 785 | if 
idx>=userids_itemids.shape[0]: 786 | continue 787 | rates[idx] = y_result[0][bi,0] 788 | return rates 789 | 790 | def validate(self,consts): 791 | max_train_indice = int(self.all_rates_indices.shape[0]*self.train_size_rate) 792 | validate_indicies = numpy.arange(self.all_rates_indices.shape[0] - max_train_indice) 793 | validate_indicies[:] += max_train_indice 794 | userids_itemids = numpy.zeros((self.all_rates_indices.shape[0] - max_train_indice,2),dtype=theano.config.floatX); 795 | rates_info = numpy.zeros((self.all_rates_indices.shape[0] - max_train_indice,self.ratings_by_user.shape[1] - 1),dtype=theano.config.floatX) 796 | i0 = 0 797 | for validate_indice in validate_indicies: 798 | userids_itemids[i0,self.user_indice] = self.ratings_by_user_ids[self.all_rates_indices[validate_indice],self.user_indice] - 1 799 | userids_itemids[i0,self.movie_indice] = self.ratings_by_user_ids[self.all_rates_indices[validate_indice],self.movie_indice] - 1 800 | rates_info[i0,:] = self.ratings_by_user[self.all_rates_indices[validate_indice],1:] 801 | i0 += 1 802 | pass 803 | rates = self.get_rates(userids_itemids,rates_info) 804 | loss = 0 805 | i0 = 0 806 | for validate_indice in validate_indicies: 807 | loss += numpy.square(self.ratings_by_user[self.all_rates_indices[validate_indice],0] - rates[i0]) 808 | i0 += 1 809 | pass 810 | return numpy.sqrt(loss/rates.shape[0]) 811 | 812 | def save(self,index,consts): 813 | self.net.save_state(file_name = consts.get_file_name_by_index(index,consts.result_net_file_name), consts = consts) 814 | return 815 | 816 | def load(self,index,consts): 817 | self.net.load_state(file_name = consts.get_file_name_by_index(index,consts.result_net_file_name), consts = consts) 818 | return 819 | 820 | pass 821 | 822 | 823 | def get_aranged(value,min_value,max_value): 824 | if abs(max_value-min_value)<1e-9: 825 | return 0 826 | return (float(value)-float(min_value))/(float(max_value)-float(min_value)) - float(0.5) 827 | 828 | class RecommenderSystem(object): 829 | ''' 830 | class for recommender systems 831 | ''' 832 | 833 | 834 | def load_data(self,consts=Consts()): 835 | self.usersids = numpy.load(file=consts.userids_npy_file_name) 836 | self.moviesids = numpy.load(file=consts.moviesids_npy_file_name) 837 | self.ratings_by_user = numpy.load(file=consts.ratings_by_user_npy_file_name) 838 | self.ratings_by_user_ids = numpy.load(file=consts.ratings_by_user_ids_npy_file_name) 839 | self.ratings_by_user_idx = numpy.load(file=consts.ratings_by_user_idx_npy_file_name) 840 | self.ratings_by_movie = numpy.load(file=consts.ratings_by_movie_npy_file_name) 841 | self.ratings_by_movie_ids = numpy.load(file=consts.ratings_by_movie_ids_npy_file_name) 842 | self.ratings_by_movie_idx = numpy.load(file=consts.ratings_by_movie_idx_npy_file_name) 843 | self.users_cvs = pandas.read_csv( 844 | consts.users_cvs_file_name 845 | ,names = ("id","sex","age","occupation","zipcode","latitude","longitude","timezone","dts") 846 | ,dtype = { 847 | 'id':numpy.int32 848 | ,'sex':numpy.str 849 | ,'age':numpy.int32 850 | ,'occupation':numpy.int32 851 | ,"zipcode":numpy.str 852 | ,'latitude':numpy.float32 853 | ,'longitude':numpy.float32 854 | ,'timezone':numpy.int32 855 | ,'dts':numpy.int32 856 | } 857 | ,sep=";" 858 | ,skipinitialspace = False 859 | ,header=None 860 | ,index_col = False 861 | ,quoting = csv.QUOTE_ALL 862 | ,quotechar='"' 863 | ,encoding="utf-8" 864 | ,na_values='' 865 | ) 866 | self.movies_cvs = pandas.read_csv( 867 | consts.movies_cvs_file_name 868 | ,sep=";" 869 | ,names = 
["id","name","gender","year"] 870 | ,dtype = { 871 | 'id':numpy.int32 872 | ,'name':numpy.str 873 | ,'gender':numpy.str 874 | ,'year':numpy.int32 875 | } 876 | ,skipinitialspace = False 877 | ,header=None 878 | ,index_col = False 879 | ,quoting = csv.QUOTE_ALL 880 | ,quotechar='"' 881 | ,encoding="utf-8" 882 | ) 883 | return 884 | 885 | def __init__(self 886 | ,rng 887 | ,theano_rng 888 | ,consts = Consts() 889 | ): 890 | ''' 891 | The constructor 892 | ''' 893 | 894 | self.load_data() 895 | 896 | self.items_cnt = self.movies_cvs["id"].max() 897 | self.users_cnt = self.users_cvs["id"].max() 898 | 899 | self.matrix_ids = MatrixIDs( 900 | usersids_data = self.usersids,itemsids_data = self.moviesids 901 | ,ratings_by_user = self.ratings_by_user,ratings_by_user_ids = self.ratings_by_user_ids,ratings_by_user_idx = self.ratings_by_user_idx 902 | ,ratings_by_item = self.ratings_by_movie,ratings_by_item_ids = self.ratings_by_movie_ids,ratings_by_item_idx = self.ratings_by_movie_idx 903 | ,users_cnt = self.users_cnt 904 | ,items_cnt = self.items_cnt 905 | ,rng = rng 906 | ,theano_rng = theano_rng 907 | ,consts = consts 908 | ,users_ids = None 909 | ,items_ids = None 910 | ) 911 | 912 | self.rates_approx = RatesApprox( 913 | usersids_data = self.usersids,itemsids_data = self.moviesids 914 | ,ratings_by_user = self.ratings_by_user,ratings_by_user_ids = self.ratings_by_user_ids,ratings_by_user_idx = self.ratings_by_user_idx 915 | ,rng = rng 916 | ,theano_rng = theano_rng 917 | ,matrix_ids = self.matrix_ids 918 | ,consts = consts 919 | ) 920 | 921 | if consts.load_from_ids>0: 922 | self.matrix_ids.load(index = consts.load_from_ids, consts = consts) 923 | self.rates_approx.load(index = consts.load_from_ids, consts = consts) 924 | return 925 | 926 | def train_encoders(self,learning_rate,corruption_level,consts): #,print_flag,idx 927 | loss_items_to_user,loss_users_to_item = self.matrix_ids.train_encoders(learning_rate = learning_rate, corruption_level = corruption_level,consts = consts) 928 | return loss_items_to_user,loss_users_to_item 929 | 930 | def train_itemids(self,learning_rate,consts): 931 | loss = self.matrix_ids.train_itemids(learning_rate = learning_rate,consts = consts) 932 | return loss 933 | 934 | def train_rates(self,learning_rate): 935 | loss = self.rates_approx.train(learning_rate) 936 | return loss 937 | 938 | def validate_rates(self,consts): 939 | return self.rates_approx.validate(consts) 940 | 941 | def calc_new_ids(self,consts): 942 | self.matrix_ids.get_new_user_ids(self.users_cnt,consts) 943 | self.matrix_ids.get_new_item_ids(self.items_cnt,consts) 944 | return 945 | 946 | def save(self,index,consts): 947 | self.matrix_ids.save(index,consts) 948 | return 949 | 950 | def save_rates(self,index,consts): 951 | self.rates_approx.save(index,consts) 952 | return 953 | 954 | def load(self,index,consts): 955 | self.matrix_ids.load(index = consts.load_from_ids, consts = consts) 956 | return 957 | 958 | pass # class RecommenderSystem 959 | 960 | class NearestMovies(object): 961 | def __init__(self,ids_index = 0,consts = Consts()): 962 | self.movies_cvs = pandas.read_csv( 963 | consts.movies_cvs_file_name 964 | ,header=None 965 | ,sep=";" 966 | ,names = ["id","name","gender","year"] 967 | ,skipinitialspace = False 968 | ) 969 | items_ids1 = numpy.load(file = consts.get_file_name_by_index(ids_index,consts.items_ids_file_name)) 970 | self.items_ids = numpy.zeros_like(a = items_ids1) 971 | self.items_ids[self.movies_cvs["id"]-1,] = items_ids1[self.movies_cvs["id"]-1,] 972 | self.users_ids = 
numpy.load(file = consts.get_file_name_by_index(ids_index,consts.users_ids_file_name)) 973 | self.max_distance = consts.item_max_distance 974 | #print("") 975 | #print() 976 | return 977 | 978 | def save_dta(self,index,consts): 979 | filename = consts.get_file_name_by_index(index,consts.users_ids_dta_file_name) 980 | outfile = open(filename,"wt") 981 | for i in numpy.arange(self.users_ids.shape[0]): 982 | outfile.write("%d\t" % (i,)) 983 | for j in numpy.arange(self.users_ids.shape[1]): 984 | outfile.write("%f\t" % (self.users_ids[i,j],)) 985 | outfile.write("\n") 986 | 987 | filename = consts.get_file_name_by_index(index,consts.items_ids_dta_file_name) 988 | outfile = open(filename,"wt") 989 | for i in numpy.arange(self.items_ids.shape[0]): 990 | outfile.write("%d\t" % (i,)) 991 | for j in numpy.arange(self.items_ids.shape[1]): 992 | outfile.write("%f\t" % (self.items_ids[i,j],)) 993 | outfile.write("\n") 994 | return 995 | 996 | def __distance(self,ids_values,i0,i1): 997 | df1 = self.movies_cvs[self.movies_cvs.id==i0] 998 | df2 = self.movies_cvs[self.movies_cvs.id==i1] 999 | if df1.empty or df2.empty: 1000 | return self.max_distance 1001 | return numpy.sum(numpy.square(ids_values[i0,]-ids_values[i1,])) 1002 | 1003 | def get_nearest_movies(self,movie_id): 1004 | dist = numpy.zeros(shape=self.items_ids.shape[0],dtype=theano.config.floatX) 1005 | for i in numpy.arange(self.items_ids.shape[0]): 1006 | dist[i] = self.__distance(ids_values = self.items_ids, i0 = movie_id-1, i1 = i) 1007 | indices = numpy.argsort(a = dist) 1008 | return [ 1009 | (dist[indice] 1010 | ,indice+1 1011 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["name"].values[0] 1012 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["gender"].values[0] 1013 | ) for indice in indices[0:min(len(indices),200)] if not self.movies_cvs[self.movies_cvs["id"]==(indice+1)].empty] 1014 | 1015 | def __distance_to_center(self,centers,ci,i0): 1016 | return numpy.sum(numpy.square(centers[ci,]-self.items_ids[i0,])) 1017 | 1018 | def __get_indexes(self,cluster_number,centers): 1019 | indexes = [[] for i in numpy.arange(cluster_number)] 1020 | for i in numpy.arange(self.items_ids.shape[0]): 1021 | min_dist = 0 1022 | cimin = 0 1023 | for ci in numpy.arange(centers.shape[0]): 1024 | dist = self.__distance_to_center(centers,ci,i) 1025 | if ci==0: 1026 | min_dist = dist 1027 | cimin = 0 1028 | else: 1029 | if distlt+1: 1048 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r") 1049 | sys.stdout.write("get_clusters %f %%\r" % (float(i1)/float(iterations)*100)) 1050 | lt = lt+1 1051 | indexes = self.__get_indexes(cluster_number,centers) 1052 | res = [] 1053 | for ind in indexes: 1054 | res.append( 1055 | [ 1056 | (indice+1 1057 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["name"].values[0] 1058 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["gender"].values[0] 1059 | ) 1060 | for indice in ind if not self.movies_cvs[self.movies_cvs["id"]==(indice+1)].empty] 1061 | ) 1062 | return res 1063 | pass #class NearestMovies 1064 | 1065 | class UserLines(object): 1066 | ''' 1067 | the class of user lines of rates 1068 | ''' 1069 | 1070 | def __init__(self,rng,theano_rng,consts): 1071 | ''' 1072 | constructor 1073 | ''' 1074 | self.recommender_system = RecommenderSystem(rng= rng,theano_rng = theano_rng,consts = consts) 1075 | self.matrix_ids = self.recommender_system.matrix_ids 1076 | self.rates_approx = self.recommender_system.rates_approx 1077 | self.rng = rng 1078 | 1079 | self.movies_cvs = self.recommender_system.movies_cvs 1080 | 
self.users_cvs = self.recommender_system.users_cvs 1081 | return 1082 | 1083 | def __user_dist(self,user_id1,user_id2,dist): 1084 | dist[user_id2] = numpy.sum(numpy.square(self.matrix_ids.users_ids[user_id1,:] - self.matrix_ids.users_ids[user_id2,:])) 1085 | return 1086 | 1087 | def __item_dist(self,item_id,item_ids,dist): 1088 | dist[item_id] = numpy.sum(numpy.square(self.matrix_ids.items_ids[item_id] - item_ids)) 1089 | return 1090 | 1091 | def __find_nearest(self,user_id,users_cnt): 1092 | dist = numpy.zeros((self.matrix_ids.users_count,),dtype=theano.config.floatX) 1093 | 1094 | for user_id2 in numpy.arange(self.matrix_ids.users_count): 1095 | self.__user_dist(user_id,user_id2,dist) 1096 | pass 1097 | if users_cnt>=self.matrix_ids.users_count: 1098 | users_cnt = self.matrix_ids.users_count 1099 | sorted_indicies = numpy.argsort(dist) 1100 | user_ids = [user_id1 for user_id1 in sorted_indicies[0:users_cnt]] 1101 | #print(dist[sorted_indicies[0:users_cnt]]) 1102 | return user_ids 1103 | 1104 | def __calc_line(self,user_id,rating_info): 1105 | line = numpy.zeros((self.matrix_ids.itemsids_mini_batch_size,self.matrix_ids.item_id_size),dtype=theano.config.floatX) 1106 | 1107 | input_size = self.matrix_ids.usersids_data.shape[1]+self.matrix_ids.user_id_size+self.matrix_ids.ratings_by_user.shape[1] 1108 | input_x = numpy.zeros((self.matrix_ids.itemsids_mini_batch_size,input_size),dtype=theano.config.floatX) 1109 | 1110 | for bi in numpy.arange(self.matrix_ids.itemsids_mini_batch_size): 1111 | i0 = 0 1112 | i1 = i0 + self.matrix_ids.usersids_data.shape[1] 1113 | input_x[bi,i0:i1] = self.matrix_ids.usersids_data[user_id,:] 1114 | i0 = i1 1115 | i1 = i0 + self.matrix_ids.user_id_size 1116 | input_x[bi,i0:i1] = self.matrix_ids.users_ids[user_id,:] 1117 | i0 = i1 1118 | i1 = i0 + self.matrix_ids.ratings_by_item.shape[1] 1119 | input_x[bi,i0+1:i1] = rating_info[:] 1120 | input_x[bi,i0] = (float(bi)/float(self.matrix_ids.itemsids_mini_batch_size) - float(0.5))/float(2.0) 1121 | pass 1122 | 1123 | result_y = self.matrix_ids.user_ids_rate_to_item_ids_net.run_fn(input_x) 1124 | print(input_x[0,:]) 1125 | print(result_y[0][0,:]) 1126 | for bi in numpy.arange(self.matrix_ids.itemsids_mini_batch_size): 1127 | line[bi,:] = result_y[0][bi,:] 1128 | pass 1129 | 1130 | return line 1131 | 1132 | def __get_line_items(self,line): 1133 | item_ids = numpy.zeros(line.shape[0],dtype="int32") 1134 | dist = numpy.zeros((self.matrix_ids.items_count,),dtype=theano.config.floatX) 1135 | for idx in numpy.arange(line.shape[0]): 1136 | for item_id in numpy.arange(self.matrix_ids.items_count): 1137 | self.__item_dist(item_id,line[idx,:],dist) 1138 | pass 1139 | item_ids[idx] = numpy.argmin(dist) 1140 | pass 1141 | return item_ids 1142 | 1143 | def __save_line(self,line,user_id,file_name): 1144 | item_ids = self.__get_line_items(line) 1145 | file_name = file_name % (user_id) 1146 | result = open(file_name,"wt") 1147 | result.write("user id = %d\n" % (user_id,)) 1148 | movies = [(item_ids[indice]+1 1149 | ,self.movies_cvs[self.movies_cvs["id"]==(item_ids[indice]+1)]["name"].values[0] 1150 | ,self.movies_cvs[self.movies_cvs["id"]==(item_ids[indice]+1)]["gender"].values[0] 1151 | ) for indice in numpy.arange(item_ids.shape[0]) if not self.movies_cvs[self.movies_cvs["id"]==(item_ids[indice]+1)].empty 1152 | ] 1153 | for movie_info in movies: 1154 | result.write("%d\t%s\t(%s)\n" % movie_info) 1155 | pass 1156 | return 1157 | 1158 | def __find_best_movies(self,line,movies_cnt): 1159 | return 1160 | 1161 | def 
__get_rates(self,user_id,movies): 1162 | return 1163 | 1164 | def __save_movies(self,movies,user_id,rates,file_name): 1165 | return 1166 | 1167 | def build_line_for_rand_user(self,rating_info,user_ids,consts): 1168 | for user_id1 in user_ids: 1169 | #user_id1 = self.rng.randint(low=0,high=self.matrix_ids.users_count) 1170 | sys.stdout.write("processing %d user_id " % (user_id1,)) 1171 | line = self.__calc_line(user_id1,rating_info) 1172 | self.__save_line(line,user_id1,consts.user_line_file_name) 1173 | movies = self.__find_best_movies(line,100) 1174 | rates = self.__get_rates(user_id1,movies) 1175 | self.__save_movies(movies,user_id1,rates,consts.best_movies_for_user_file_name) 1176 | sys.stdout.write("-- done\n") 1177 | pass 1178 | return 1179 | 1180 | def __get_rates_of_user(self,user_id,rating_info): 1181 | userids_itemids = numpy.zeros((self.matrix_ids.items_count,2),dtype=theano.config.floatX); 1182 | rates_info = numpy.zeros((self.matrix_ids.items_count,self.rates_approx.ratings_by_user.shape[1] - 1),dtype=theano.config.floatX) 1183 | i0 = 0 1184 | for indice in numpy.arange(self.matrix_ids.items_count): 1185 | userids_itemids[i0,self.rates_approx.user_indice] = user_id 1186 | userids_itemids[i0,self.rates_approx.movie_indice] = indice 1187 | rates_info[i0,:] = rating_info 1188 | i0 += 1 1189 | pass 1190 | rates = self.rates_approx.get_rates(userids_itemids = userids_itemids, ratesinfo = rates_info) 1191 | return rates 1192 | 1193 | def __save_movie_rates(self,user_id,rates,file_name): 1194 | file_name = file_name % (user_id) 1195 | result = open(file_name,"wt") 1196 | for item_id in numpy.arange(self.matrix_ids.items_count): 1197 | result.write("%d\t" % (item_id,)) 1198 | for j in numpy.arange(self.matrix_ids.item_id_size): 1199 | result.write("%f\t" % (self.matrix_ids.items_ids[item_id,j],)) 1200 | pass 1201 | result.write("%f\n" % (rates[item_id],)) 1202 | pass 1203 | return 1204 | 1205 | def __save_movies_by_rates(self,user_id,rates,file_name): 1206 | file_name = file_name % (user_id) 1207 | result = open(file_name,"wt") 1208 | indices = numpy.argsort(rates) 1209 | movies = [(indice + 1 1210 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["name"].values[0] 1211 | ,self.movies_cvs[self.movies_cvs["id"]==(indice+1)]["gender"].values[0] 1212 | ) for indice in indices[-500:] if not self.movies_cvs[self.movies_cvs["id"]==(indice+1)].empty 1213 | ] 1214 | movies.reverse() 1215 | for movie_info in movies: 1216 | result.write("%d -- %s (%s)\n" % movie_info) 1217 | pass 1218 | return 1219 | 1220 | def build_rate_for_rand_user(self,rating_info,user_ids,consts): 1221 | for user_id1 in user_ids: 1222 | #user_id1 = self.rng.randint(low=0,high=self.matrix_ids.users_count) 1223 | sys.stdout.write("processing %d user_id " % (user_id1,)) 1224 | rates = self.__get_rates_of_user(user_id1,rating_info) 1225 | #self.__save_movie_rates(user_id1,rates,consts.user_rates_of_movies_file_name) 1226 | self.__save_movies_by_rates(user_id1,rates,consts.user_movies_by_rates_file_name) 1227 | sys.stdout.write("-- done\n") 1228 | pass 1229 | return 1230 | 1231 | @staticmethod 1232 | def main(load_id): 1233 | consts = Consts() 1234 | consts.load_from_ids = load_id 1235 | rng = numpy.random.RandomState() 1236 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 1237 | user_lines = UserLines(rng = rng,theano_rng = theano_rng,consts = consts) 1238 | rating_info = numpy.zeros(1,dtype=theano.config.floatX) 1239 | wday = 4 # friday 1240 | rating_info[0] = get_aranged(value = wday, min_value = 0, max_value = 6) 1241 
| #user_id = user_lines.rng.randint(low=0,high=user_lines.matrix_ids.users_count) 1242 | #user_ids = user_lines.__find_nearest(user_id,5) 1243 | user_indices = [user_lines.rng.randint(low=0,high=len(user_lines.users_cvs)-1) for it in numpy.arange(5)] 1244 | user_ids = [user_lines.users_cvs.at[indice,"id"] for indice in user_indices] 1245 | #user_lines.build_line_for_rand_user(rating_info = rating_info, user_ids = user_ids, consts = consts) 1246 | user_lines.build_rate_for_rand_user(rating_info = rating_info, user_ids = user_ids, consts = consts) 1247 | sys.stdout.write("all done\n") 1248 | return 1249 | 1250 | pass #class UserLines 1251 | 1252 | 1253 | def test_001(): 1254 | ''' 1255 | functional tests for classes 1256 | ''' 1257 | rng = numpy.random.RandomState() 1258 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 1259 | mini_batch_size = 5 1260 | item_size = 7 1261 | user_size = 7 1262 | learning_rate = 0.1 1263 | corruption_level = 0.3 1264 | 1265 | 1266 | def func(a,b,t): 1267 | return (numpy.sin((2.0*numpy.pi)*(t+a)) + numpy.sin((2.0*numpy.pi)*(t+b)))*0.5 1268 | #autoencoder.print_state() 1269 | 1270 | autoencoder = AutoEncoder( 1271 | mini_batch_size = mini_batch_size, 1272 | input_size = (item_size+1)*5,encoded_size = user_size, 1273 | hidden_count = 4,hidden_size = 64,activation = T.nnet.relu, 1274 | L1_decay = 0,L2_decay = 0.0, 1275 | numpy_rng = rng, 1276 | theano_rng = theano_rng, 1277 | ) 1278 | repeate_times = 300000 1279 | loss = 0 1280 | t1 = time.clock() 1281 | for i in numpy.arange(repeate_times): 1282 | x_value = numpy.zeros((mini_batch_size,(item_size+1)*5), dtype=theano.config.floatX) 1283 | for j in numpy.arange(mini_batch_size): 1284 | for k in numpy.arange(5): 1285 | a = rng.uniform(0,1) 1286 | b = rng.uniform(0,1) 1287 | t = numpy.arange(float(item_size+1))/float(item_size+1) 1288 | x_value[j,k*(item_size+1):(k+1)*(item_size+1)] = func(a,b,t) 1289 | c1 = autoencoder.train_fn(x_value,learning_rate,corruption_level) 1290 | if numpy.isnan(c1): 1291 | break 1292 | if loss==0: 1293 | loss = c1[0] 1294 | else: 1295 | loss = loss + (c1[0] - loss)*0.001 1296 | if i%1000 == 0: 1297 | print("%d -- %f" % (i/1000,loss)) 1298 | t2 = time.clock() 1299 | print("train for : %f sec, times : %d" % ((t2-t1)/repeate_times,repeate_times)) 1300 | #print(c) 1301 | #autoencoder.print_state() 1302 | repeate_times = 1000 1303 | t1 = time.clock() 1304 | for i in numpy.arange(repeate_times) : 1305 | x_value = numpy.asarray( 1306 | rng.uniform( 1307 | low = -0.5, 1308 | high = 0.5, 1309 | size = (mini_batch_size,(item_size+1)*5))) 1310 | encoded_value = autoencoder.get_encoded_fn(x_value) 1311 | t2 = time.clock() 1312 | print("get_encoded for : %f sec" % ((t2-t1)/repeate_times)) 1313 | print(encoded_value[0]+0.5) 1314 | 1315 | #print("function autoencoder.train :") 1316 | #theano.printing.debugprint(autoencoder.train) 1317 | 1318 | #print("function autoencoder.__get_encoded :") 1319 | #theano.printing.debugprint(autoencoder.__get_encoded) 1320 | 1321 | return 1322 | 1323 | def test_002(): 1324 | rng = numpy.random.RandomState() 1325 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 1326 | batch_size = 5 1327 | input_size = 10 1328 | output_size = 2 1329 | learning_rate = 0.1 1330 | 1331 | net1 = ApproxNet( 1332 | batch_size = batch_size 1333 | ,input_size = output_size 1334 | ,output_size = input_size 1335 | ,hidden_count = 6,hidden_size = 64,hidden_activation = T.nnet.relu 1336 | ,numpy_rng = rng 1337 | ,theano_rng = theano_rng 1338 | ,L1_decay = 0 1339 | ,L2_decay = 0 1340 | ) 1341 | 
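# The toy target below is the same two-parameter curve family used in
# test_001: func(a,b,t) samples f(t) = (sin(2*pi*(t+a)) + sin(2*pi*(t+b)))/2
# at input_size points. Note that the roles of x and y are deliberately
# swapped relative to their names: net1 takes the parameter pair
# (a-0.5, b-0.5) as input (y_value) and is trained to reproduce the
# sampled curve (x_value), hence the call train_fn(y_value,x_value,...).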
1342 | def func(a,b,t): 1343 | return (numpy.sin((2.0*numpy.pi)*(t+a)) + numpy.sin((2.0*numpy.pi)*(t+b)))*0.5 1344 | 1345 | #c = [] 1346 | repeate_times = 100000 1347 | loss = 0 1348 | t1 = time.clock() 1349 | for i in numpy.arange(repeate_times): 1350 | x_value = numpy.zeros((batch_size,input_size), dtype=theano.config.floatX) 1351 | y_value = numpy.zeros((batch_size,output_size),dtype=theano.config.floatX) 1352 | for j in numpy.arange(batch_size): 1353 | a = rng.uniform(0.0,1.0) 1354 | b = rng.uniform(0.0,1.0) 1355 | t = numpy.arange(float(input_size))/float(input_size) 1356 | x_value[j,0:input_size] = func(a,b,t) 1357 | y_value[j,0] = a - 0.5 1358 | y_value[j,1] = b - 0.5 1359 | c1 = net1.train_fn(y_value,x_value,learning_rate) 1360 | if loss == 0: 1361 | loss = c1[0] 1362 | else: 1363 | loss = loss + (c1[0]-loss)*0.001 1364 | #c.append(c1) 1365 | if numpy.isnan(c1): 1366 | break 1367 | if i % 1000 == 0: 1368 | print("%d -- %f" % (i/1000,loss)) 1369 | t2 = time.clock() 1370 | print("train for : %f sec, times : %d" % ((t2-t1)/repeate_times,repeate_times)) 1371 | #print(c) 1372 | 1373 | x_value = numpy.zeros((batch_size,input_size), dtype=theano.config.floatX) 1374 | y_value = numpy.zeros((batch_size,output_size),dtype=theano.config.floatX) 1375 | for j in numpy.arange(batch_size): 1376 | a = rng.uniform(0.0,1.0) 1377 | b = rng.uniform(0.0,1.0) 1378 | t = numpy.arange(float(input_size))/float(input_size) 1379 | x_value[j,0:input_size] = func(a,b,t) 1380 | y_value[j,0] = a - 0.5 1381 | y_value[j,1] = b - 0.5 1382 | x_result = net1.run_fn(y_value) 1383 | print(x_value+0.5) 1384 | print(x_result[0]+0.5) 1385 | 1386 | return 1387 | 1388 | def trace(index,loss_items_to_user,loss_users_to_item,loss_itemids,loss_rates,validate_loss,validate_loss_min,trace_file_name): 1389 | tf = open(trace_file_name,"at") 1390 | tf.write("%d\t%f\t%f\t%f\t%f\t%f\t%f\n" % (index,loss_items_to_user,loss_users_to_item,loss_itemids,loss_rates,validate_loss,validate_loss_min)) 1391 | return 1392 | 1393 | def train_all(): 1394 | consts = Consts() 1395 | rng = numpy.random.RandomState() 1396 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 1397 | rs = RecommenderSystem(rng= rng,theano_rng = theano_rng,consts=consts) 1398 | 1399 | #print("userids, shape : " + str(rs.usersids.shape)) 1400 | #print(rs.usersids) 1401 | #print("moviesids, shape : " + str(rs.moviesids.shape)) 1402 | #print(rs.moviesids) 1403 | #print("ratings_by_user, shape : " + str(rs.ratings_by_user.shape)) 1404 | #print(rs.ratings_by_user) 1405 | #print("ratings_by_user_ids, shape : " + str(rs.ratings_by_user_ids.shape)) 1406 | #print(rs.ratings_by_user_ids) 1407 | #print("ratings_by_user_idx, shape : " + str(rs.ratings_by_user_idx.shape)) 1408 | #print(rs.ratings_by_user_idx) 1409 | #print("ratings_by_movie, shape : " + str(rs.ratings_by_movie.shape)) 1410 | #print(rs.ratings_by_movie) 1411 | #print("ratings_by_movie_ids, shape : " + str(rs.ratings_by_movie_ids.shape)) 1412 | #print(rs.ratings_by_movie_ids) 1413 | #print("atings_by_movie_idx, shape : " + str(rs.ratings_by_movie_idx.shape)) 1414 | #print(rs.ratings_by_movie_idx) 1415 | 1416 | sys.stdout.write("i item2user user2item itemids rates rval rvalmin\n") 1417 | 1418 | loss_rates = float(0) 1419 | loss_items_to_user = float(0) 1420 | loss_users_to_item = float(0) 1421 | validate_loss_min = float(0) 1422 | validate_loss = float(0) 1423 | loss_itemids = float(0) 1424 | for idx in numpy.arange(100000): 1425 | lt = time.time() 1426 | for j in numpy.arange(consts.ids_move_count): 1427 | 
loss_items_to_user,loss_users_to_item = rs.train_encoders(learning_rate = consts.encoder_learning_rate, corruption_level = consts.encoder_corruption_rate,consts=consts)
1428 | if rng.rand(1)[0]<=consts.train_rates_rate:
1429 | loss_rates = rs.train_rates(learning_rate = consts.result_learning_rate)
1430 | if rng.rand(1)[0]<=consts.train_itemids_rate:
1431 | loss_itemids = rs.train_itemids(learning_rate = consts.itemids_learning_rate,consts = consts)
1432 | t1 = time.time()
1433 | if t1>lt+1:
1434 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r")
1435 | sys.stdout.write("[%d] %f %f %f %f %f %f\r" % (idx+(consts.load_from_ids*consts.save_cycles),loss_items_to_user,loss_users_to_item,loss_itemids,2*loss_rates,2*validate_loss,2*validate_loss_min))
1436 | lt = lt+1
1437 | pass
1438 | rs.calc_new_ids(consts=consts)
1439 | if idx % consts.save_cycles == 0:
1440 | rs.save((idx/consts.save_cycles) + consts.load_from_ids,consts)
1441 | rs.save_rates((idx/consts.save_cycles) + consts.load_from_ids,consts)
1442 | if idx % consts.validate_cycles == 0:
1443 | validate_loss = rs.validate_rates(consts=consts)
1444 | if validate_loss_min==0 or validate_loss<validate_loss_min:
1445 | validate_loss_min = validate_loss
1446 | trace(idx+(consts.load_from_ids*consts.save_cycles),loss_items_to_user,loss_users_to_item,loss_itemids,2*loss_rates,2*validate_loss,2*validate_loss_min,consts.trace_file_name)
1447 | pass
1448 | return
1449 | 
1450 | def trace_rates(index,loss_rates,validate_loss_min,validate_loss,trace_rates_file_name):
1451 | tf = open(trace_rates_file_name,"at")
1452 | tf.write("%d\t%f\t%f\t%f\n" % (index,loss_rates,validate_loss_min,validate_loss))
1453 | return
1454 | 
1455 | def train_rates():
1456 | consts = Consts()
1457 | rng = numpy.random.RandomState()
1458 | theano_rng = RandomStreams(rng.randint(2 ** 30))
1459 | rs = RecommenderSystem(rng= rng,theano_rng = theano_rng,consts=consts)
1460 | 
1461 | sys.stdout.write("i rates rval rvalmin\n")
1462 | 
1463 | loss_rates = float(0)
1464 | validate_loss = float(0)
1465 | validate_loss_min = float(0)
1466 | lt = time.time()
1467 | for i in numpy.arange(100000):
1468 | loss_rates = rs.train_rates(learning_rate = consts.result_learning_rate)
1469 | t1 = time.time()
1470 | if t1>lt+1:
1471 | sys.stdout.write("\t\t\t\t\t\t\t\t\t\r")
1472 | sys.stdout.write("[%d] loss = %f , val = %f valmin = %f\r" % (i,loss_rates,validate_loss,validate_loss_min))
1473 | lt = lt+1
1474 | trace_rates(i + (consts.load_from_ids*consts.save_cycles),loss_rates,validate_loss_min,validate_loss,consts.trace_rates_file_name)
1475 | if i % consts.save_cycles == 0:
1476 | rs.save_rates((i/consts.save_cycles) + consts.load_from_ids,consts)
1477 | if i % consts.validate_cycles == 0:
1478 | validate_loss = rs.validate_rates(consts=consts)
1479 | if validate_loss_min==0 or validate_loss