├── .gitignore ├── LICENSE.md ├── MLModel ├── Global.py ├── LoadCoreset.py ├── LoadData.py ├── MLmodel │ ├── linearRegression.py │ └── logisticRegression.py ├── hidden.py ├── optimizer.py └── paramRange.py ├── README.md ├── RECON ├── CMakeLists.txt ├── data.h ├── global.h ├── main.cpp ├── mycsBrazil.h ├── mycsIMDBC.h ├── mycsStackn.h ├── mycsTaxi.h ├── type.h └── util.h ├── linear-universal.py ├── logistic-universal.py └── preprocess ├── Brazil.py ├── IMDBC-5.py ├── IMDBC-Linear.py ├── stack.py └── taxi.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.xml 3 | *.iml 4 | MLModel/.DS_Store 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MLModel/Global.py: -------------------------------------------------------------------------------- 1 | DATAPATH = "/home/jiayi/disk/C-craig/dataset/" 2 | CSPATH = "/home/jiayi/disk/C-craig/" -------------------------------------------------------------------------------- /MLModel/LoadCoreset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from MLModel.Global import * 3 | 4 | def LoadCoreset(coreset_from, data, subset_size, batch=0, sampleSize=0): 5 | assert coreset_from == 'diskOurs' 6 | if coreset_from == 'diskOurs': 7 | assert batch==0 8 | if batch==0: 9 | if subset_size == 0.00001: 10 | file_name = CSPATH+"inuse/{}-0.00001-ours.npz".format(data) 11 | else: 12 | file_name = CSPATH+'inuse/{}-{}-ours.npz'.format(data, str(subset_size)) 13 | print("【Load file path】 is ", file_name) 14 | 15 | 16 | if file_name != '': 17 | print(f'reading from {file_name}') 18 | dataset = np.load(f'{file_name}') 19 | order, weights, total_ordering_time = dataset['order'], dataset['weight'], dataset['order_time'] 20 | print(" 【Coreset size】 is ", order.shape) 21 | return order, weights, total_ordering_time -------------------------------------------------------------------------------- /MLModel/LoadData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from MLModel.Global import * 3 | def load_dataset(dataset, prop=0.1, regression=False): 4 | assert dataset in ['IMDBCLinear', 'IMDBLargeCLinear', 'Brazilnew', 'IMDBC5', 'IMDBLargeC5', 'taxi', 'stackn'] 5 | 6 | X_train = np.load(DATAPATH + "{}-train-X.npy".format(dataset)) 7 | X_val = np.load(DATAPATH + "{}-val-X.npy".format(dataset)) 8 | X_test = np.load(DATAPATH + "{}-test-X.npy".format(dataset)) 9 | y_train = np.load(DATAPATH + "{}-train-y.npy".format(dataset)) 10 | y_val = np.load(DATAPATH + "{}-val-y.npy".format(dataset)) 11 | y_test = np.load(DATAPATH + "{}-test-y.npy".format(dataset)) 12 | 13 | if regression == False: 14 | assert dataset in ['IMDBC5','IMDBLargeC5', 'Brazilnew'] 15 | print("Is Multi class") 16 | if dataset in ['IMDBC5', 'IMDBLargeC5', 'Brazilnew']: 17 | num_class = 5 18 | print("Num class ", num_class) 19 | if dataset in ['Brazil5']: 20 | y_train-=1 21 | y_val-=1 22 | y_test-=1 23 | print(np.unique(y_train)) 24 | print(np.unique(y_val)) 25 | print(np.unique(y_test)) 26 | y_train = y_train.astype(np.int32) 27 | y_val = y_val.astype(np.int32) 28 | y_test = y_test.astype(np.int32) 29 | y_train = np.eye(num_class)[y_train] 30 | y_val = np.eye(num_class)[y_val] 31 | y_test = np.eye(num_class)[y_test] 32 | elif not regression: 33 | y_train = np.reshape(y_train, (-1, 1)) 34 | y_val = np.reshape(y_val, (-1, 1)) 35 | y_test = np.reshape(y_test, (-1, 1)) 36 | print(f'Training size: {len(y_train)}, Test size: {len(y_test)}') 37 | return X_train, y_train, X_val, y_val, X_test, y_test 38 | 39 | -------------------------------------------------------------------------------- /MLModel/MLmodel/linearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | class LinearRegression(object): 4 | def __init__(self, dim): 5 | self.W = np.zeros(dim) 6 | self.params = self.W 7 | 8 | def activation(self, X, params=None): 9 | pred_ys = X.dot(self.W) 10 | return pred_ys 11 | 12 | def loss(self, X,y, l2_reg=0.00, ): 13 | num_of_samples = X.shape[0] 14 | f_mat = 
X.dot(self.W) 15 | diff = f_mat - y 16 | loss = 1.0 * np.sum(diff * diff) / num_of_samples 17 | 18 | return loss + l2_reg * np.linalg.norm(self.W) ** 2 / 2 19 | 20 | def gradient(self, X, y, l2_reg=0.00, params=None, cnt=0): 21 | num_of_samples = X.shape[0] 22 | f_mat = X.dot(self.W) 23 | diff = f_mat - y 24 | if type(diff)==np.array and diff.shape[0]==1: 25 | gradient = (diff[0]*(X)).T - l2_reg * self.W 26 | return gradient 27 | else: 28 | if type(diff) ==np.float64: 29 | gradient = (diff *X).T - l2_reg * self.W 30 | else: 31 | gradient = ((diff.T).dot(X)).T - l2_reg * self.W 32 | return gradient 33 | 34 | 35 | def MASLE(self, X,y): 36 | predict_y = self.activation(X) 37 | MAE = metrics.mean_absolute_error(y, predict_y) 38 | MSE = metrics.mean_squared_error(y, predict_y) 39 | MSLE=0 40 | return MAE, MSE, MSLE -------------------------------------------------------------------------------- /MLModel/MLmodel/logisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | 4 | def sigmoid(x): 5 | return 1. / (1 + np.exp(-x)) 6 | 7 | 8 | def softmax(x): 9 | if x.ndim == 1: 10 | e = np.exp(x - np.max(x)) 11 | else: 12 | e = np.exp(x - np.max(x, axis=1, keepdims=True)) 13 | 14 | if e.ndim == 1: 15 | return e / np.sum(e, axis=0) 16 | else: 17 | return e / np.array([np.sum(e, axis=1)]).T 18 | 19 | 20 | class LogisticRegression(object): 21 | def __init__(self, dim, num_class): 22 | self.binary = num_class == 1 23 | self.W = np.zeros((dim, num_class)) 24 | self.b = np.zeros(num_class) 25 | self.params = np.array([self.W, self.b]) 26 | 27 | def activation(self, input, params=None): 28 | W, b = params if params is not None else self.params 29 | if self.binary: 30 | return sigmoid(np.dot(input, W) + b) 31 | else: 32 | return softmax(np.dot(input, W) + b) 33 | 34 | def loss(self, input, label, l2_reg=0.00, params=None): 35 | sigmoid_activation = self.activation(input, params) 36 | 37 | cross_entropy = - np.mean(np.sum(label * np.log(sigmoid_activation) + 38 | (1 - label) * np.log(1 - sigmoid_activation), axis=1)) 39 | 40 | return cross_entropy + l2_reg * np.linalg.norm(self.W) ** 2 / 2 41 | 42 | def f1(self, input, label, params=None): 43 | if self.binary: 44 | return metrics.f1_score(label, np.rint(self.predict(input, params)), average = 'weighted') 45 | else: 46 | return metrics.f1_score(np.argmax(label, axis=1), np.argmax(self.predict(input, params), axis=1), 47 | average='weighted') 48 | def recall(self, input, label, params=None): 49 | if self.binary: 50 | return metrics.recall_score(label, np.rint(self.predict(input, params)), average = 'weighted') 51 | else: 52 | return metrics.recall_score(np.argmax(label,axis=1), np.argmax(np.rint(self.predict(input, params)), axis=1), average = 'weighted') 53 | def precision(self, input, label, params=None): 54 | if self.binary: 55 | return metrics.precision_score(label, np.rint(self.predict(input, params)), average = 'weighted') 56 | else: 57 | return metrics.precision_score(np.argmax(label, axis=1), np.argmax(self.predict(input, params),axis=1), average = 'weighted') 58 | 59 | def acc(self, input, label, params=None): 60 | if self.binary: 61 | return metrics.accuracy_score(label, np.rint(self.predict(input, params))) 62 | else: 63 | if len(label.shape)>1: 64 | 65 | label = np.argmax(label, axis=1) 66 | pred = self.predict(input, params) 67 | if len(pred.shape)>1: 68 | pred = np.argmax(pred, axis=1) 69 | return metrics.accuracy_score(label,pred) 70 | 71 | 72 | 
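    # predict() returns raw probabilities (sigmoid for binary, softmax rows for
    # multi-class); metrics such as acc()/f1() above turn them into hard labels
    # with np.rint or np.argmax, so no thresholding happens here.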
def predict(self, input, params=None): 73 | return self.activation(input, params) 74 | 75 | def accuracy(self, input, label, params=None): 76 | if self.binary: 77 | return np.mean(np.isclose(np.rint(self.predict(input, params)), label)) 78 | else: 79 | if len(label.shape)>1: 80 | label = np.argmax(label, axis=1) 81 | pred = self.predict(input, params) 82 | if len(pred.shape)>1: 83 | pred = np.argmax(pred, axis=1) 84 | return metrics.accuracy_score(label, 85 | pred) 86 | def gradient(self, input, label, l2_reg=0.00, params=None,cnt=1): 87 | p_y_given_x = self.activation(input, params) 88 | d_y = label - p_y_given_x 89 | d_W = -np.dot(np.reshape(input, (cnt, -1)).T, np.reshape(d_y.T, (cnt, -1))) - l2_reg * self.W 90 | d_b = -np.mean(d_y, axis=0) 91 | return np.array([d_W, d_b]) 92 | 93 | def gradientVec(self, input, label, cnt, l2_reg=0.00, params=None): 94 | p_y_given_x = self.activation(input, params) 95 | d_y = label - p_y_given_x 96 | d_W = -np.dot(np.reshape(input, (cnt, -1)).T, np.reshape(d_y.T, (cnt, -1))) - l2_reg * self.W 97 | d_b = -np.mean(d_y, axis=0) 98 | return np.array([d_W, d_b]) 99 | 100 | def MASLE(self, X,y): 101 | predict_y = self.activation(X) 102 | if len(predict_y.shape)>0: 103 | predict_y = np.argmax(predict_y, axis=1) 104 | 105 | if len(y.shape) > 0: 106 | y = np.argmax(y, axis=1) 107 | MAE = metrics.mean_absolute_error(y, predict_y) 108 | MSE = metrics.mean_squared_error(y, predict_y) 109 | if np.any(y<0): 110 | MSLE=0 111 | else: 112 | MSLE = metrics.mean_squared_log_error(y, predict_y) 113 | 114 | return MAE, MSE, MSLE -------------------------------------------------------------------------------- /MLModel/hidden.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class HiddenPrints: 5 | def __init__(self, activated=True): 6 | self.activated = activated 7 | self.original_stdout = None 8 | 9 | def open(self): 10 | sys.stdout.close() 11 | sys.stdout = self.original_stdout 12 | 13 | def close(self): 14 | self.original_stdout = sys.stdout 15 | sys.stdout = open(os.devnull, 'w') 16 | 17 | def __enter__(self): 18 | if self.activated: 19 | self.close() 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | if self.activated: 23 | self.open() -------------------------------------------------------------------------------- /MLModel/optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from torch.utils.tensorboard import SummaryWriter 4 | from MLModel.Global import * 5 | class Optimizer(object): 6 | 7 | @staticmethod 8 | def order_elements(shuffle, n, seed=1234): 9 | if shuffle == 0: 10 | indices = np.arange(n) 11 | elif shuffle == 1: 12 | indices = np.random.permutation(n) 13 | elif shuffle == 2: 14 | indices = np.random.randint(0, n, n) 15 | else: # fixed permutation 16 | np.random.seed(seed) 17 | indices = np.random.permutation(n) 18 | return indices 19 | 20 | def optimize(self, method, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 21 | if method == 'sgd': 22 | return self.sgd(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 23 | elif method == 'saga': 24 | return self.saga(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 25 | elif method == 'svrg': 26 | return self.svrg(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 27 | elif method =='BGD': 28 | return self.BGD(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 29 | else: 30 | 
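            # Any unrecognized method name falls through to this branch: only a
            # message is printed and the call implicitly returns None, so the
            # method string should be validated by the caller.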
print('Optimizer is not defined!') 31 | 32 | def sgd(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 33 | n = len(data) 34 | W = [[]] * num_epochs 35 | T = np.empty(num_epochs) 36 | 37 | time.sleep(.1) 38 | start_epoch = time.process_time() 39 | writer = SummaryWriter(CSPATH+'/tensorboard/') 40 | for epoch in range(num_epochs): 41 | indices = self.order_elements(shuffle, n) 42 | for i in indices: 43 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 44 | 45 | model.params -= lr[epoch] * grads 46 | W[epoch] = model.params.copy() 47 | T[epoch] = (time.process_time() - start_epoch) 48 | 49 | writer.add_scalar('loss', model.loss(data,labels), global_step=epoch) 50 | return W, T 51 | 52 | def BGD(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 53 | n = len(data) 54 | W = [[]] * num_epochs 55 | T = np.empty(num_epochs) 56 | 57 | time.sleep(.1) 58 | start_epoch = time.process_time() 59 | 60 | for epoch in range(num_epochs): 61 | indices = self.order_elements(shuffle, n) 62 | # grads_ = None 63 | 64 | grads_ = model.gradient(data, labels,l2_reg, cnt=n)/n 65 | # print('grads_ is ', grads_) 66 | # for i in indices: 67 | # if grads_ is None: 68 | # grads_ = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 69 | # # grads_ = np.dot(model.gradientVec(data, labels, n, l2_reg / n) , weights) 70 | # else: 71 | # grads_ += model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 72 | # # grads_ += np.dot(model.gradient(data, labels, n, l2_reg / n) , weights) 73 | model.params -= lr[epoch] * grads_ 74 | W[epoch] = model.params.copy() 75 | T[epoch] = (time.process_time() - start_epoch) 76 | return W, T 77 | 78 | def saga(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 79 | n = len(data) 80 | W = [[]] * num_epochs 81 | T = np.empty(num_epochs) 82 | 83 | time.sleep(.1) 84 | start_epoch = time.process_time() 85 | 86 | saved_grads = np.array([model.gradient(data[i], labels[i], l2_reg / n) * weights[i] for i in range(n)]) 87 | avg_saved_grads = saved_grads.mean(axis=0) 88 | 89 | for epoch in range(num_epochs): 90 | indices = self.order_elements(shuffle, n) 91 | for i in indices: 92 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 93 | model.params -= lr[epoch] * (grads - saved_grads[i] + avg_saved_grads) 94 | avg_saved_grads += (grads - saved_grads[i]) / n 95 | saved_grads[i] = grads 96 | 97 | W[epoch] = model.params.copy() 98 | T[epoch] = (time.process_time() - start_epoch) 99 | return W, T 100 | 101 | def svrg(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 102 | n = len(data) 103 | W = [[]] * num_epochs 104 | T = np.empty(num_epochs) 105 | 106 | time.sleep(.1) 107 | start_epoch = time.process_time() 108 | 109 | for epoch in range(num_epochs): 110 | init_grads = np.array([model.gradient(data[i], labels[i], l2_reg / n) * weights[i] for i in range(n)]) 111 | avg_init_grads = np.mean(init_grads, axis=0) 112 | 113 | indices = self.order_elements(shuffle, n) 114 | for i in indices: 115 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 116 | model.params -= lr[epoch] * (grads - init_grads[i] + avg_init_grads) 117 | 118 | W[epoch] = model.params.copy() 119 | T[epoch] = (time.process_time() - start_epoch) 120 | return W, T 121 | -------------------------------------------------------------------------------- /MLModel/paramRange.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def 
get_param_range(subset_size, exp_decay, method, data): 3 | if method=='BGD': 4 | g_range = [0.001] 5 | b_range = np.arange(180, 200, 1) * .005 6 | return g_range, b_range 7 | if data=='IMDBLargeCLinear': 8 | g_range = [0.0002] 9 | b_range = np.arange(20, 40, 1) * 0.005 10 | elif data in [ 'IMDBCLinear','IMDBLargeC5']: 11 | g_range = [0.0001] 12 | b_range = np.arange(180, 200, 1) * .005 13 | elif data in ['IMDBC5']: 14 | g_range = [0.001] 15 | b_range = np.arange(180, 200, 1) * .005 16 | elif data in ['Brazilnew']: 17 | g_range = [0.01] 18 | b_range = np.arange(20, 40, 1) * .005 19 | elif data in ['stackn']: 20 | g_range = [0.0001] 21 | b_range = np.arange(180, 200, 1) * .005 22 | elif data in ['taxi']: 23 | g_range = [0.0001] 24 | b_range = np.arange(20, 40, 1) * .005 25 | else: 26 | g_range = [0.1, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.25, 0.3, 0.35] 27 | b_range = [0.7, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.9, 0.95] 28 | if subset_size < 1: 29 | g_range = [0.000035, 0.009, 0.01, 0.013, 0.015, 0.017, 0.018, 0.019, 0.02, 0.025, 0.03] 30 | b_range = np.arange(0, 19) * .01 31 | return g_range, b_range 32 | 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RECON 2 | This repo contains the code for the VLDB 2023 paper [_Coresets over multiple tables for feature-rich and data-efficient machine learning_](https://www.vldb.org/pvldb/vol16/p64-wang.pdf). 3 | 4 | 5 | # Quick Start 6 | 7 | ## Folder Structure 8 | 9 | . 10 | ├── preprocess # Data preprocessing code 11 | ├── RECON # RECON code for coreset construction 12 | ├── MLModel # Code for training ML models to evaluate RECON 13 | ├── linear-universal.py # Evaluation of regression models 14 | ├── logistic-universal.py # Evaluation of classification models 15 | └── README.md 16 | 17 | 18 | 19 | ## Requirements 20 | Before running the code, please make sure your compiler supports `C++14` or above. 21 | The cnpy library is also needed to save results in npz format. 22 | 23 | The dataset path is configured by the variable `DATAPATH` (line 9 in global.h), which should also be configured properly before running the code. 24 | The datasets can be downloaded from [dataset link](https://drive.google.com/drive/folders/1kOLJQRnJk-_87y3WVq8Dwu18JYylbQhb?usp=sharing). 25 | - `Python 3.7+` 26 | 27 | - `C++14` 28 | - `cnpy: a library to read/write .npy and .npz files in C/C++` [link](https://github.com/rogersce/cnpy) 29 | 30 | 31 | 32 | ## Usage 33 | 34 | ### RECON on IMDB / IMDB-Large: 35 | First build `./RECON` by: 36 | 37 | - `cd RECON` 38 | 39 | - `cmake .` 40 | 41 | - `make` 42 | 43 | 44 | and then perform RECON on different datasets by passing different arguments. 
45 | > parameter setting: 46 | >> [dataName] [proportion] [0:IMDB 1:IMDB-Large] [0:Classification 1:Regression] 47 | 48 | - `IMDB, p=0.0128 for classification: ./RECON IMDB 0.0128 0 0 ` 49 | - `IMDB, p=0.0032 for regression: ./RECON IMDB 0.0032 0 1` 50 | - `IMDB-Large, p=0.0016 for classification: ./RECON IMDB 0.0016 1 0` 51 | - `IMDB-Large, p=0.0016 for regression: ./RECON IMDB 0.0016 1 1` 52 | 53 | 54 | 55 | ### RECON on stack / Brazil / taxi: 56 | 57 | 58 | > parameter setting: 59 | >> [dataName] [proportion] 60 | - `stack, p=0.0032: ./RECON stack 0.0032` 61 | - `Brazil, p=0.0016: ./RECON Brazil 0.0016` 62 | - `taxi, p=0.0032: ./RECON taxi 0.0032` 63 | 64 | > Note: `-L/usr/local/lib/ -lcnpy -lz` may also need to be added to the linker arguments, depending on how cnpy was installed. 65 | 66 | **Note:** Before running RECON, make sure the variable `DATAPATH` (line 9 in global.h) is configured as the path of the dataset. 67 | Besides, make sure the variable `CSPATH` (line 10 in global.h) is configured as the location to save RECON's output, i.e., the coresets. 68 | 69 | 70 | ### Training Logistic Regression 71 | Run `logistic-universal.py` to train logistic regression models. 72 | 73 | - IMDB: `python logistic-universal.py --data IMDBC5 --method sgd -s 0.0128 ` 74 | 75 | - IMDB-Large: `python logistic-universal.py --data IMDBLargeC5 --method sgd -s 0.0016 ` 76 | 77 | 78 | - Brazil: `python logistic-universal.py --data Brazilnew --method sgd -s 0.0016 ` 79 | 80 | 81 | 82 | ### Training Linear Regression 83 | Run `linear-universal.py` to train linear regression models. 84 | 85 | - IMDB: `python linear-universal.py --data IMDBCLinear --method sgd -s 0.0032 ` 86 | 87 | - IMDB-Large: `python linear-universal.py --data IMDBLargeCLinear --method sgd -s 0.0016 ` 88 | 89 | - stack: `python linear-universal.py --data stackn --method sgd -s 0.0032` 90 | 91 | 92 | - taxi: `python linear-universal.py --data taxi --method sgd -s 0.0032` 93 | 94 | **Note:** Before training models, make sure the variable `DATAPATH` (line 1 in Global.py) is configured as the path of the datasets. 95 | Also make sure `CSPATH` (line 2 in Global.py) is configured as the path to RECON's output (the coresets). 96 | 97 | ### Other Baselines 98 | 99 | - **Sample-Join**: The argument `--greedy [0:Uniform Sampling 1:Coreset (default)]` specifies the subset for training. 100 | Sample-Join can be achieved by setting `--greedy 0`. 101 | For example, to train a logistic regression model on a uniform sample of IMDB, you may use: 102 | ```sh 103 | python logistic-universal.py --data IMDBC5 --method sgd -s 0.0128 --greedy 0 104 | ``` 105 | 106 | 107 | - **Full**: Training on the full data can be achieved by setting `-s` to `1` on top of Sample-Join. 108 | For example, to train a logistic regression model using the full data of IMDB, you may use: 109 | ```sh 110 | python logistic-universal.py --data IMDBC5 --method sgd -s 1 --greedy 0 111 | ``` 112 | 113 | - **Coreset-Join** and **Join-Coreset**: You can find their official implementations at [link](https://github.com/baharanm/craig). 114 | 115 | ### Data Preprocessing 116 | In general, our preprocessing of each dataset in the `preprocess` directory can be summarized as data cleaning, normalization, and partitioning by label. 117 | We provide the preprocessed data in [dataset link](https://drive.google.com/drive/folders/1kOLJQRnJk-_87y3WVq8Dwu18JYylbQhb?usp=sharing). 118 | The raw datasets can be found in their original sources. 119 | 120 | ## License 121 | 122 | The project is available under the [MIT](LICENSE.md) license. 
123 | 124 | ## Citation 125 | If our work is helpful to you, please cite our [paper](https://www.vldb.org/pvldb/vol16/p64-wang.pdf): 126 | ```bibtex 127 | @article{wang2022coresets, 128 | title={Coresets over multiple tables for feature-rich and data-efficient machine learning}, 129 | author={Wang, Jiayi and Chai, Chengliang and Tang, Nan and Liu, Jiabin and Li, Guoliang}, 130 | journal={Proceedings of the VLDB Endowment}, 131 | volume={16}, 132 | number={1}, 133 | pages={64--76}, 134 | year={2022}, 135 | publisher={VLDB Endowment} 136 | } 137 | 138 | ``` 139 | -------------------------------------------------------------------------------- /RECON/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(RECON) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | 6 | set(LINK_DIR /usr/local/lib/) 7 | set(INC_DIR /usr/local/include/) 8 | 9 | include_directories(${INC_DIR}) 10 | link_directories(${LINK_DIR}) 11 | link_libraries(cnpy) 12 | link_libraries(z) 13 | 14 | FIND_PACKAGE(OpenMP REQUIRED) 15 | if(OPENMP_FOUND) 16 | set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS}") 17 | set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}") 18 | endif() 19 | 20 | 21 | add_executable(RECON main.cpp) 22 | target_link_libraries(RECON cnpy z) -------------------------------------------------------------------------------- /RECON/data.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_DATA_H 2 | #define UNTITLED2_DATA_H 3 | #include "type.h" 4 | #include 5 | #include "type.h" 6 | #include "global.h" 7 | void loadData(char * data= nullptr){ 8 | if(!data) { 9 | assert(0); 10 | } 11 | std::stringstream ss; 12 | std::string dataName(data); 13 | ss.str(""); 14 | ss << DATAPATH << dataName << "-train-X.npy"; 15 | cnpy::NpyArray trainX = cnpy::npy_load(ss.str()); 16 | n = trainX.shape[0]; 17 | d = trainX.shape[1]; 18 | X = (dtype *) malloc(n * d * sizeof(dtype)); 19 | memcpy(X, trainX.data(), n * d * sizeof(dtype)); 20 | 21 | ss.str(""); 22 | ss << DATAPATH << dataName << "-train-y.npy"; 23 | cnpy::NpyArray trainY = cnpy::npy_load(ss.str()); 24 | assert(trainY.shape[0] == trainX.shape[0]); 25 | std::cout<<"word size is "<(), n * sizeof(labeltype)); 29 | 30 | } 31 | 32 | #endif //UNTITLED2_DATA_H 33 | 34 | 35 | -------------------------------------------------------------------------------- /RECON/global.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_GLOBAL_H 2 | #define UNTITLED2_GLOBAL_H 3 | #include 4 | #include 5 | #include 6 | #include "cnpy.h" 7 | 8 | 9 | const std::string DATAPATH ="/home/jiayi/disk/C-craig/dataset/"; 10 | const std::string CSPATH ="/home/jiayi/disk/C-craig/inuse/"; 11 | const int tc = 16; 12 | dtype *X; 13 | labeltype *Y; 14 | dtype *similarity; 15 | idtype n, d, N; 16 | idtype * Map; 17 | std::map cateNum; 18 | int cateCnt; 19 | dtype alpha = 1.; 20 | idtype target_coreset_size; 21 | idtype real_coreset_size; 22 | idtype* nn; 23 | dtype* maxSim; 24 | dtype* weight; 25 | dtype * lazy; 26 | idtype * idx; 27 | idtype * invidx; 28 | std::vector weight_vec; 29 | std::priority_queue > pq; 30 | std::vector coreset; 31 | std::vector coresetAll; 32 | dtype curSum; 33 | dtype f_norm; 34 | dtype norm; 35 | idtype cSize; 36 | 37 | void freeAll(){ 38 | free(Map); 39 | free(lazy); 40 | free(invidx); 41 | free(idx); 42 | free(similarity); 43 | free(nn); 44 | free(maxSim); 45 | free(weight); 46 | } 47 | #endif //UNTITLED2_GLOBAL_H 48 | 49 | 
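The paths above (`DATAPATH`, `CSPATH`) and the cnpy dependency are the usual sources of setup trouble. A minimal standalone sanity check, not part of the repository (the file name `check_cnpy.cpp`, the `IMDBC5` dataset name, and the float64 element type are assumptions), can confirm that cnpy links and that `DATAPATH` points at the downloaded arrays before RECON is run:

```cpp
// check_cnpy.cpp -- hypothetical sanity check, not part of RECON.
// Build it separately, e.g.:
//   g++ -std=c++14 check_cnpy.cpp -I/usr/local/include -L/usr/local/lib -lcnpy -lz -o check_cnpy
#include <iostream>
#include <string>
#include "cnpy.h"

int main() {
    // Same naming convention as data.h / LoadData.py: DATAPATH + "<dataset>-train-X.npy".
    const std::string DATAPATH = "/home/jiayi/disk/C-craig/dataset/";  // adjust to your setup
    cnpy::NpyArray arr = cnpy::npy_load(DATAPATH + "IMDBC5-train-X.npy");
    std::cout << "rows: " << arr.shape[0] << ", cols: " << arr.shape[1]
              << ", word size: " << arr.word_size << " bytes\n";
    // Assumes the arrays were saved as float64; switch to float if word_size prints 4.
    const double* x = arr.data<double>();
    std::cout << "first value: " << x[0] << "\n";
    return 0;
}
```

If this compiles with the same `-lcnpy -lz` flags mentioned in the README and prints a sensible shape, the RECON binary should be able to read the same files through `data.h`.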
-------------------------------------------------------------------------------- /RECON/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "type.h" 5 | #include "util.h" 6 | #include "data.h" 7 | #include "omp.h" 8 | #include "cnpy.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "mycsIMDBC.h" 16 | #include "mycsStackn.h" 17 | #include "mycsTaxi.h" 18 | #include "mycsBrazil.h" 19 | #include 20 | 21 | using namespace std::chrono; 22 | using std::chrono::system_clock; 23 | 24 | 25 | int main(int argc, char** argv) { 26 | 27 | omp_set_num_threads(tc); 28 | 29 | std::cout<> sim_time(0); 34 | 35 | if(dataName == "IMDB") 36 | sim_time = IMDBC::testIMDBC(std::stod(argv[2]), std::atol(argv[3]),0.01, std::atol(argv[4])); 37 | else if(dataName == "stack") 38 | sim_time = stackn::testStackn(std::stod(argv[2])); 39 | else if(dataName == "Brazil") 40 | sim_time = Brazil::testBrazil(std::stod(argv[2])); 41 | else if(dataName == "taxi") 42 | sim_time = taxi::testTaxi(std::stod(argv[2])); 43 | 44 | 45 | auto en = system_clock::now(); 46 | auto duration = duration_cast(en - st); 47 | std::cout << "### Find Coreset Spent " 48 | << double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 49 | std::cout << "### Find Coreset(except sim) Spent " 50 | << double((duration - sim_time).count()) * microseconds::period::num / microseconds::period::den 51 | << " seconds.\n"; 52 | } 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /RECON/mycsBrazil.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_MYCSBrazil_H 2 | #define UNTITLED2_MYCSBrazil_H 3 | 4 | 5 | #include "cnpy.h" 6 | #include "type.h" 7 | #include "util.h" 8 | #include "data.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace Brazil{ 16 | using std::chrono::system_clock; 17 | using std::chrono::duration_cast; 18 | using std::chrono::microseconds; 19 | std::random_device rd; 20 | std::mt19937 mt(rd()); 21 | 22 | 23 | cnpy::NpyArray reviewArr; 24 | cnpy::NpyArray orderArr; 25 | cnpy::NpyArray orderItemArr; 26 | cnpy::NpyArray productArr; 27 | cnpy::NpyArray joinArr; 28 | 29 | 30 | dtype *dp; 31 | dtype *review, *order, *orderItem, *product, *join; 32 | idtype reviewNum, reviewDim, orderNum, orderDim, orderItemNum, orderItemDim, productNum, productDim, joinNum, joinDim; 33 | dtype *reviewSim, *orderSim, *orderItemSim, *productSim; 34 | 35 | 36 | cnpy::NpyArray loadNpy(std::string fileDir); 37 | void readBrazilnewNpy(int cate); 38 | void mallocBrazilnewArray(); 39 | void loadToArr(int cate); 40 | 41 | 42 | void mallocBrazilnewSim(); 43 | void calBrazilnewSim(); 44 | void initWeight(); 45 | 46 | 47 | void sampleOneBrazilnew(idtype &uID, idtype &ID, idtype &qID, idtype &rowID, 48 | idtype &samplejoinID); 49 | void sampleBatchBrazilnew(int sampleSize, 50 | std::vector &uIDs, 51 | std::vector &IDs, 52 | std::vector &qIDs, 53 | std::vector &rowIDs, 54 | std::vector &joinIDs); 55 | 56 | void realAddOne(idtype joinID); 57 | 58 | 59 | dtype getBenefitBrazilnew(idtype uID, 60 | idtype ID, 61 | idtype qID, 62 | idtype rowID, 63 | idtype joinID, 64 | bool change, 65 | int verbose); 66 | 67 | std::chrono::duration> testBrazil(dtype PROP, 68 | dtype epsilon, 69 | int saveWhere, 70 | int verbose 71 | ); 72 | 73 | std::vector fullCS; 74 | std::vector 
fullCSWeight; 75 | dtype rW = 33. / 100, oW = 33. / 100, orderItemW = 33. / 100, pW= 1./100; 76 | 77 | 78 | void freeBrazilnew() { 79 | free(review); 80 | free(orderItem); 81 | free(order); 82 | free(product); 83 | free(join); 84 | free(reviewSim); 85 | free(orderItemSim); 86 | free(orderSim); 87 | free(productSim); 88 | free(dp); 89 | } 90 | 91 | 92 | cnpy::NpyArray loadNpy(std::string fileDir) { 93 | 94 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 95 | return arr; 96 | } 97 | 98 | 99 | void readBrazilnewNpy(int cate) { 100 | std::stringstream dir; 101 | dir.str(""); 102 | 103 | 104 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 105 | << cate << "-review.npy"; 106 | reviewArr = loadNpy(dir.str()); 107 | dir.str(""); 108 | 109 | 110 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 111 | << cate << "-orderItem.npy"; 112 | orderItemArr = loadNpy(dir.str()); 113 | dir.str(""); 114 | 115 | 116 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 117 | << cate << "-order.npy"; 118 | orderArr = loadNpy(dir.str()); 119 | dir.str(""); 120 | 121 | 122 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 123 | << cate << "-product.npy"; 124 | productArr = loadNpy(dir.str()); 125 | dir.str(""); 126 | 127 | 128 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 129 | << cate << "-joined.npy"; 130 | joinArr = loadNpy(dir.str()); 131 | dir.str(""); 132 | } 133 | 134 | 135 | void mallocBrazilnewArray() { 136 | 137 | 138 | reviewNum = reviewArr.shape[0]; 139 | reviewDim = reviewArr.shape[1]; 140 | review = (dtype *) malloc(reviewNum * reviewDim * sizeof(dtype)); 141 | 142 | 143 | orderItemNum = orderItemArr.shape[0]; 144 | orderItemDim = orderItemArr.shape[1]; 145 | orderItem = (dtype *) malloc(orderItemNum * orderItemDim * sizeof(dtype)); 146 | 147 | 148 | orderNum = orderArr.shape[0]; 149 | orderDim = orderArr.shape[1]; 150 | order = (dtype *) malloc(orderNum * orderDim * sizeof(dtype)); 151 | 152 | productNum = productArr.shape[0]; 153 | productDim = productArr.shape[1]; 154 | product = (dtype *) malloc(productNum * productDim * sizeof(dtype)); 155 | 156 | 157 | joinNum = joinArr.shape[0]; 158 | joinDim = joinArr.shape[1]; 159 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 160 | } 161 | 162 | void loadToArr(int cate) { 163 | 164 | readBrazilnewNpy(cate); 165 | mallocBrazilnewArray(); 166 | 167 | memcpy(review, reviewArr.data(), 1LL * reviewNum * reviewDim * sizeof(dtype)); 168 | memcpy(orderItem, orderItemArr.data(), 1LL * orderItemNum * orderItemDim * sizeof(dtype)); 169 | memcpy(order, orderArr.data(), 1LL * orderNum * orderDim * sizeof(dtype)); 170 | memcpy(product, productArr.data(), 1LL * productNum * productDim * sizeof(dtype)); 171 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 172 | } 173 | 174 | void mallocBrazilnewSim() { 175 | 176 | 177 | reviewSim = (dtype *) malloc(reviewNum * reviewNum * sizeof(dtype)); 178 | orderItemSim = (dtype *) malloc(orderItemNum * orderItemNum * sizeof(dtype)); 179 | orderSim = (dtype *) malloc(orderNum * orderNum * sizeof(dtype)); 180 | productSim = (dtype *) malloc(productNum * productNum * sizeof(dtype)); 181 | } 182 | 183 | void calBrazilnewSim() { 184 | 185 | initSim(reviewSim, review, reviewNum, reviewDim, 3); 186 | 187 | initSim(orderSim, order, orderNum, orderDim, 1); 188 | 189 | initSim(orderItemSim, orderItem, orderItemNum, orderItemDim, 3); 190 | 191 | initSim(productSim, product, productNum, productDim, 1); 192 | 193 | 194 | 195 | } 196 | 197 | 198 | std::vector joinIDs; 199 | void initWeight(){ 200 | 
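        // Every pre-joined orderItem row is a candidate join result: joinIDs holds the
        // rows that have not been picked yet, and dp[i] caches the best weighted
        // similarity any selected point achieves for row i (0 before anything is
        // chosen), which is what getBenefitBrazilnew() compares against when it
        // scores a candidate.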
201 | joinIDs.clear(); 202 | joinIDs.reserve(orderItemNum); 203 | dp = (dtype *)malloc(orderItemNum * sizeof(dtype)); 204 | memset(dp,0, orderItemNum * sizeof(dtype)); 205 | for(int i= 0 ;i < orderItemNum; i++) 206 | joinIDs.emplace_back(i); 207 | } 208 | 209 | 210 | void sampleOneBrazilnew(idtype &rID, 211 | idtype &oID, 212 | idtype &pID, 213 | idtype &rowID, 214 | idtype &joinID){ 215 | 216 | int id = joinIDs[mt()% joinIDs.size()]; 217 | 218 | idtype idx_st = id * joinDim; 219 | rID = join[idx_st]; 220 | oID = join[idx_st + 1]; 221 | pID = join[idx_st + 2]; 222 | rowID = join[idx_st + 3]; 223 | joinID = id; 224 | } 225 | 226 | 227 | void sampleBatchBrazilnew(int sampleSize, 228 | std::vector& rIDs, 229 | std::vector& oIDs, 230 | std::vector& pIDs, 231 | std::vector& rowIDs, 232 | std::vector& joinIDs){ 233 | rIDs.resize(sampleSize); 234 | oIDs.resize(sampleSize); 235 | pIDs.resize(sampleSize); 236 | rowIDs.resize(sampleSize); 237 | joinIDs.resize(sampleSize); 238 | 239 | for(int i = 0; i < sampleSize; i ++) 240 | sampleOneBrazilnew(rIDs[i], 241 | oIDs[i], 242 | pIDs[i], 243 | rowIDs[i], 244 | joinIDs[i]); 245 | } 246 | 247 | void realAddOne(idtype joinID){ 248 | 249 | for(int i = 0 ;i < joinIDs.size();i++){ 250 | if(joinIDs[i] == joinID){ 251 | std::swap(joinIDs[joinIDs.size()-1 ], joinIDs[i]); 252 | joinIDs.pop_back(); 253 | break; 254 | } 255 | } 256 | } 257 | 258 | dtype getBenefitBrazilnew(idtype rID, 259 | idtype oID, 260 | idtype pID, 261 | idtype rowID, 262 | idtype joinID, 263 | bool change=false, 264 | int verbose=1){ 265 | 266 | dtype simSum = 0; 267 | dtype thisWeight = 0.; 268 | 269 | idtype sim_loc_review = oID * orderNum; 270 | idtype sim_loc_order = oID * orderNum; 271 | idtype sim_loc_orderItem = joinID * orderItemNum; 272 | idtype sim_loc_product = pID * productNum; 273 | 274 | 275 | 276 | idtype idx_loc = 0; 277 | for(int i = 0, jID=0 ; i < orderItemNum; i++, idx_loc+=orderItemDim, jID++){ 278 | idtype oid_ = orderItem[idx_loc]; 279 | idtype rowID_ = orderItem[idx_loc + 1]; 280 | idtype pID_ = orderItem[idx_loc + 2]; 281 | 282 | 283 | dtype tempDP = reviewSim[sim_loc_review + oid_] * rW; 284 | tempDP += orderSim[sim_loc_order + oid_] * oW; 285 | tempDP += orderItemSim[sim_loc_orderItem + jID] * orderItemW; 286 | tempDP += productSim[sim_loc_product + pID_] * pW; 287 | 288 | if(tempDP > dp[i] && change){ 289 | dp[i] = tempDP; 290 | if(cs.nn[i] !=-1){ 291 | cs.weight[cs.nn[i]] -= 1; 292 | } 293 | cs.nn[i] = cs.weight.size(); 294 | thisWeight += 1; 295 | } 296 | simSum += std::max(tempDP, dp[i]); 297 | } 298 | 299 | 300 | if(change) { 301 | cs.curSum = simSum; 302 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 303 | 304 | cs.add(rowID); 305 | cs.weight.emplace_back(thisWeight); 306 | if(verbose) 307 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 308 | 100. * cs.weight.size() / cs.siz); 309 | realAddOne(joinID); 310 | } 311 | 312 | return cs.norm * std::log(1. 
+ cs.f_norm * simSum) - cs.curSum; 313 | } 314 | 315 | 316 | 317 | std::chrono::duration > testBrazil(dtype PROP, 318 | dtype epsilon = 0.01, 319 | int saveWhere=0, 320 | int verbose=1 321 | ) { 322 | fullCS.clear(); 323 | fullCSWeight.clear(); 324 | std::chrono::duration> sim_time(0); 325 | 326 | std::vector rIDs; 327 | std::vector oDs; 328 | std::vector pIDs; 329 | std::vector rowIDs; 330 | std::vector samplejoinIDs; 331 | 332 | for (int cate = 0; cate <5; cate++) { 333 | auto st = system_clock::now(); 334 | if (verbose) 335 | std::cout << "############# Current category is " << cate << " ##########\n"; 336 | 337 | 338 | loadToArr(cate); 339 | initWeight(); 340 | 341 | mallocBrazilnewSim(); 342 | calBrazilnewSim(); 343 | 344 | 345 | assert(joinNum == orderItemNum); 346 | 347 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 348 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 349 | 350 | idtype csSize = (idtype) (PROP * joinNum + 0.5); 351 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 352 | 353 | 354 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 355 | 356 | 357 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 358 | if (ano < sampleEachStep) 359 | sampleEachStep = ano; 360 | 361 | std::cout<<"sample each step is "<(en - st); 368 | sim_time += duration; 369 | 370 | while (csSize--) { 371 | dtype curMaxBenefit = -1; 372 | idtype curMaxBenefitID = 0; 373 | 374 | 375 | std::vector rIDs; 376 | std::vector oIDs; 377 | std::vector pIDs; 378 | std::vector rowIDs; 379 | std::vector samplejoinIDs; 380 | 381 | sampleBatchBrazilnew(sampleEachStep, rIDs, oIDs, pIDs, rowIDs, samplejoinIDs); 382 | std::vector benefit_vec(sampleEachStep); 383 | 384 | #pragma omp parallel for schedule(static) 385 | for (int i = 0; i < sampleEachStep; i++) 386 | benefit_vec[i] = getBenefitBrazilnew(rIDs[i], oIDs[i], pIDs[i], rowIDs[i],samplejoinIDs[i], 0,0); 387 | idtype i = 0; 388 | for (auto val : benefit_vec) { 389 | if (val > curMaxBenefit) { 390 | curMaxBenefit = val; 391 | curMaxBenefitID = i; 392 | } 393 | ++i; 394 | } 395 | i = curMaxBenefitID; 396 | 397 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 398 | benefit_vec[i] = getBenefitBrazilnew(rIDs[i], oIDs[i], pIDs[i], rowIDs[i],samplejoinIDs[i], 1, 0); 399 | } 400 | 401 | freeAll(); 402 | freeBrazilnew(); 403 | 404 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 405 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 406 | 407 | 408 | if(verbose)std::cout << "Finished!\n"; 409 | } 410 | 411 | if(verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 412 | 413 | if(verbose)std::cout << "@### 【Similarity】 Spent " << double(sim_time.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 414 | 415 | 416 | assert(!saveWhere); 417 | if (!saveWhere) { 418 | std::stringstream dir; 419 | dir.str(""); 420 | dir< 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "time.h" 15 | #include "assert.h" 16 | 17 | namespace IMDBC { 18 | using std::chrono::system_clock; 19 | using std::chrono::duration_cast; 20 | using std::chrono::microseconds; 21 | std::random_device rd; 22 | std::mt19937 mt(rd()); 23 | 24 | cnpy::NpyArray miArr; 25 | cnpy::NpyArray mixArr; 26 | cnpy::NpyArray titleArr; 27 | cnpy::NpyArray nameArr; 28 | cnpy::NpyArray ciArr; 29 | cnpy::NpyArray mcArr; 30 | cnpy::NpyArray mapArr; 31 | 32 | dtype *genders, *countries; 33 | idtype jN; 34 | dtype *dp; 35 | dtype *mi, *mix, 
*title, *name, *ci, *mc; 36 | idtype maxMovieID; 37 | idtype miNum, miDim, mixNum, mixDim, titleNum, titleDim, nameNum, nameDim, ciNum, ciDim, mcNum, mcDim; 38 | idtype mapNum, mapDim; 39 | idtype *hashMapV; 40 | std::unordered_map hashMap; 41 | dtype *mvSim, *mixSim, *miSim, *titleSim; 42 | dtype *mRowMap; 43 | dtype mixWeight, miWeight, titleWeight, personWeight, companyWeight; 44 | std::discrete_distribution<> movieDis; 45 | std::vector movies; 46 | std::vector movieWeight; 47 | std::vector > moviePerson; 48 | std::vector > movieCompany; 49 | std::vector constmovieWeight; 50 | std::vector fullCS; 51 | std::vector fullCSWeight; 52 | 53 | 54 | cnpy::NpyArray loadNpy(std::string fileDir) { 55 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 56 | return arr; 57 | } 58 | 59 | void readIMDBCNpy(int cate, int Large = 0, int linear = 0, int cateNum = 10) { 60 | std::stringstream dir; 61 | dir.str(""); 62 | if (linear == 0) { 63 | if (cateNum == 10) 64 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 65 | << cate << "-" << "mi.npy"; 66 | else 67 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 68 | << cate << "-" << "mi.npy"; 69 | } else 70 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 71 | << "-" << "mi.npy"; 72 | miArr = loadNpy(dir.str()); 73 | dir.str(""); 74 | if (linear == 0) { 75 | if (cateNum == 10) 76 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 77 | << cate << "-" << "mix.npy"; 78 | else 79 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 80 | << cate << "-" << "mix.npy"; 81 | } else 82 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 83 | << "-" << "mix.npy"; 84 | mixArr = loadNpy(dir.str()); 85 | dir.str(""); 86 | if (linear == 0) { 87 | if (cateNum == 10) 88 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 89 | << cate << "-" << "title.npy"; 90 | else 91 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 92 | << cate << "-" << "title.npy"; 93 | } else 94 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 95 | << "-" << "title.npy"; 96 | titleArr = loadNpy(dir.str()); 97 | dir.str(""); 98 | if (linear == 0) { 99 | if (cateNum == 10) 100 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 101 | << cate << "-" << "name.npy"; 102 | else 103 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 104 | << cate << "-" << "name.npy"; 105 | } else 106 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 107 | << "-" << "name.npy"; 108 | nameArr = loadNpy(dir.str()); 109 | dir.str(""); 110 | if (linear == 0) { 111 | if (cateNum == 10) 112 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 113 | << cate << "-" << "ci.npy"; 114 | else 115 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 116 | << cate << "-" << "ci.npy"; 117 | } else 118 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 119 | << "-" << "ci.npy"; 120 | ciArr = loadNpy(dir.str()); 121 | dir.str(""); 122 | if (linear == 0) { 123 | if (cateNum == 10) 124 | dir << DATAPATH << (Large ? 
"IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 125 | << cate << "-" << "mc.npy"; 126 | else 127 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 128 | << cate << "-" << "mc.npy"; 129 | } else 130 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 131 | << "-" << "mc.npy"; 132 | mcArr = loadNpy(dir.str()); 133 | } 134 | 135 | void mallocCArray() { 136 | mRowMap = (dtype *) malloc(100000 * sizeof(dtype)); 137 | genders = (dtype *) malloc(1000000 * sizeof(dtype)); 138 | countries = (dtype *) malloc(1000000 * sizeof(dtype)); 139 | 140 | miNum = miArr.shape[0]; 141 | miDim = miArr.shape[1]; 142 | mi = (dtype *) malloc(miNum * miDim * sizeof(dtype)); 143 | 144 | mixNum = mixArr.shape[0]; 145 | mixDim = mixArr.shape[1]; 146 | mix = (dtype *) malloc(mixNum * mixDim * sizeof(dtype)); 147 | titleNum = titleArr.shape[0]; 148 | titleDim = titleArr.shape[1]; 149 | title = (dtype *) malloc(titleNum * titleDim * sizeof(dtype)); 150 | nameNum = nameArr.shape[0]; 151 | nameDim = nameArr.shape[1]; 152 | name = (dtype *) malloc(nameNum * nameDim * sizeof(dtype)); 153 | ciNum = ciArr.shape[0]; 154 | ciDim = ciArr.shape[1]; 155 | ci = (dtype *) malloc(ciNum * ciDim * sizeof(dtype)); 156 | mcNum = mcArr.shape[0]; 157 | mcDim = mcArr.shape[1]; 158 | mc = (dtype *) malloc(mcNum * mcDim * sizeof(dtype)); 159 | } 160 | 161 | void loadToArr(int cate, int Large = 0, int linear = 0, int cateNum = 10) { 162 | readIMDBCNpy(cate, Large, linear, cateNum); 163 | mallocCArray(); 164 | memcpy(mi, miArr.data(), 1LL * miNum * miDim * sizeof(dtype)); 165 | memcpy(mix, mixArr.data(), 1LL * mixNum * mixDim * sizeof(dtype)); 166 | memcpy(title, titleArr.data(), 1LL * titleNum * titleDim * sizeof(dtype)); 167 | memcpy(name, nameArr.data(), 1LL * nameNum * nameDim * sizeof(dtype)); 168 | memcpy(ci, ciArr.data(), 1LL * ciNum * ciDim * sizeof(dtype)); 169 | memcpy(mc, mcArr.data(), 1LL * mcNum * mcDim * sizeof(dtype)); 170 | } 171 | 172 | void mallocIMDBCSim() { 173 | mvSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 174 | mixSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 175 | miSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 176 | titleSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 177 | } 178 | 179 | void calIMDBCSim() { 180 | initSim(mixSim, mix, mixNum, mixDim, 1, mixDim - 1); 181 | initSim(miSim, mi, miNum, miDim, 1); 182 | initSim(titleSim, title, titleNum, titleDim, 1); 183 | mixWeight = 1.0 / 6; 184 | miWeight = 1.0 / 6; 185 | titleWeight = 1.0 / 6; 186 | personWeight = 1.0 / 2; 187 | idtype st_id = 0; 188 | 189 | for (idtype i = 0; i < miNum; i++, st_id += miNum) { 190 | #pragma omp parallel for schedule(static) 191 | for (idtype j = 0; j < miNum; j++) { 192 | mvSim[st_id + j] = mixSim[st_id + j] * mixWeight 193 | + miSim[st_id + j] * miWeight 194 | + titleSim[st_id + j] * titleWeight; 195 | } 196 | } 197 | } 198 | void initWeight(int verbose = 0) { 199 | maxMovieID = 0; 200 | jN = 0; 201 | movies.clear(); 202 | for (idtype i = 0; i < nameNum; i++) { 203 | idtype pid = name[i * nameDim]; 204 | genders[pid] = name[i * nameDim + 1]; 205 | } 206 | for (idtype i = 0; i < mcNum; i++) { 207 | idtype cid = mc[i * mcDim + 1]; 208 | countries[cid] = mc[i * mcDim + 2]; 209 | } 210 | for (idtype i = 0; i < titleNum; i++) { 211 | maxMovieID = std::max(maxMovieID, (idtype) title[i * titleDim]); 212 | mRowMap[(idtype) title[i * titleDim]] = i; 213 | movies.emplace_back(title[i * titleDim]); 
214 | } 215 | if (verbose)std::cout << "Max movie ID is " << maxMovieID << "!\n"; 216 | 217 | moviePerson.resize(maxMovieID + 1); 218 | movieCompany.resize(maxMovieID + 1); 219 | 220 | constmovieWeight.clear(); 221 | constmovieWeight.resize(2 * (maxMovieID + 1)); 222 | movieWeight.clear(); 223 | movieWeight.resize(maxMovieID + 1); 224 | 225 | for (idtype i = 0; i <= maxMovieID; i++) { 226 | moviePerson[i].clear(); 227 | movieCompany[i].clear(); 228 | } 229 | 230 | for (idtype i = 0; i < ciNum; i++) { 231 | idtype person_id = ci[i * ciDim + 0]; 232 | idtype movie_id = ci[i * ciDim + 1]; 233 | moviePerson[movie_id].emplace_back(person_id); 234 | } 235 | if (verbose)std::cout << "moviePerson Weight set finished!\n"; 236 | 237 | for (idtype i = 0; i < mcNum; i++) { 238 | idtype movie_id = mc[i * mcDim + 0]; 239 | idtype company_id = mc[i * mcDim + 1]; 240 | movieCompany[movie_id].emplace_back(company_id); 241 | } 242 | if (verbose)std::cout << "movieCompany Weight set finished!\n"; 243 | idtype sm = 0; 244 | for (idtype i = 0; i <= maxMovieID; i++) { 245 | movieWeight[i] = (idtype) moviePerson[i].size() * movieCompany[i].size(); 246 | idtype maleCnt = 0, femaleCnt = 0; 247 | for (auto p:moviePerson[i]) { 248 | if (genders[p] == 1)++maleCnt; 249 | else ++femaleCnt; 250 | } 251 | constmovieWeight[i << 1] = (idtype) femaleCnt * movieCompany[i].size(); 252 | constmovieWeight[i << 1 | 1] = (idtype) maleCnt * movieCompany[i].size(); 253 | jN += movieWeight[i]; 254 | sm += moviePerson[i].size(); 255 | } 256 | if (verbose)std::cout << "sm total is " << sm << "\n"; 257 | if (verbose)std::cout << "movie Weight set finished!\n"; 258 | movieDis = std::discrete_distribution<>(movieWeight.begin(), movieWeight.end()); 259 | dp = (dtype *) malloc(3 * (maxMovieID + 1) * sizeof(dtype)); 260 | } 261 | 262 | void sampleOneIMDBC(idtype & m, idtype & p, idtype & c) { 263 | m = movieDis(mt); 264 | std::uniform_int_distribution<> personDis = std::uniform_int_distribution<>(0, moviePerson[m].size() - 1); 265 | p = moviePerson[m][personDis(mt)]; 266 | std::uniform_int_distribution<> companyDis = std::uniform_int_distribution<>(0, movieCompany[m].size() - 1); 267 | c = movieCompany[m][companyDis(mt)]; 268 | } 269 | 270 | void sampleBatchIMDBC(int sampleSize, std::vector &ms, std::vector &ps, std::vector &cs) { 271 | for (int i = 0; i < sampleSize; i++) 272 | sampleOneIMDBC(ms[i], ps[i], cs[i]); 273 | } 274 | 275 | void realAddOne(idtype m, idtype p, idtype c) { 276 | --movieWeight[m]; 277 | movieDis = std::discrete_distribution<>(movieWeight.begin(), movieWeight.end()); 278 | } 279 | 280 | void initHashMap(int Large, int linear, int cateNum) { 281 | std::stringstream dir; 282 | dir.str(""); 283 | if (linear == 0) { 284 | if (cateNum == 10) 285 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/idMap.npy"; 286 | else 287 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/idMap.npy"; 288 | } else 289 | dir << DATAPATH << (Large ? 
"IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/idMap.npy"; 290 | 291 | mapArr = loadNpy(dir.str()); 292 | 293 | mapNum = mapArr.shape[0]; 294 | mapDim = mapArr.shape[1]; 295 | hashMapV = (idtype *) malloc(mapNum * mapDim * sizeof(idtype)); 296 | 297 | memcpy(hashMapV, mapArr.data(), 1LL * mapNum * mapDim * sizeof(idtype)); 298 | hashMap.clear(); 299 | for (idtype i = 0; i < mapNum; i++) { 300 | idtype hashV = hashMapV[i * mapDim]; 301 | idtype ID = hashMapV[i * mapDim + 1]; 302 | hashMap[hashV] = ID; 303 | } 304 | 305 | } 306 | 307 | idtype idInJoin(idtype m, idtype p, idtype c) { 308 | idtype hashValue = (m + 1) + (p + 1) * 100000LL + (c + 1) * 100000000000LL; 309 | assert(hashMap.find(hashValue) != hashMap.end()); 310 | return hashMap[hashValue]; 311 | } 312 | 313 | 314 | dtype getBenefitIMDBC(idtype m, idtype p, idtype c, bool change = true, int verbose = 1) { 315 | 316 | dtype simSum = 0; 317 | dtype thisWeight = 0.; 318 | 319 | idtype gender = genders[p]; 320 | dtype country = countries[c]; 321 | 322 | idtype mRowID = mRowMap[m]; 323 | assert((idtype) title[mRowID * titleDim] == m); 324 | 325 | idtype mSt = mRowID * titleNum; 326 | for (idtype i: movies) { 327 | idtype iRowID = mRowMap[i]; 328 | assert((idtype) title[iRowID * titleDim] == i); 329 | 330 | dtype newSim = mvSim[mSt + iRowID]; 331 | bool addCompanyDiff = false; 332 | for (auto c_: movieCompany[m]) 333 | if (countries[c_] != country) { 334 | addCompanyDiff = true; 335 | break; 336 | } 337 | if (!addCompanyDiff) 338 | newSim += companyWeight; 339 | 340 | dtype maleSim = ((gender == 1) ? personWeight : 0) + newSim; 341 | dtype femaleSim = ((gender == 0) ? personWeight : 0) + newSim; 342 | 343 | simSum += std::max(dp[i << 1 | 1], maleSim) * constmovieWeight[i << 1 | 1]; 344 | simSum += std::max(dp[i << 1], femaleSim) * constmovieWeight[i << 1]; 345 | 346 | if (maleSim > dp[i << 1 | 1] && change) { 347 | dp[i << 1 | 1] = maleSim; 348 | if (cs.nn[i << 1 | 1] != -1) { 349 | cs.weight[cs.nn[i << 1 | 1]] -= constmovieWeight[i << 1 | 1]; // 350 | } 351 | cs.nn[i << 1 | 1] = cs.weight.size(); 352 | thisWeight += constmovieWeight[i << 1 | 1]; 353 | } 354 | 355 | if (femaleSim > dp[i << 1] && change) { 356 | dp[i << 1] = femaleSim; 357 | if (cs.nn[i << 1] != -1) { 358 | cs.weight[cs.nn[i << 1]] -= constmovieWeight[i << 1]; // 359 | } 360 | cs.nn[i << 1] = cs.weight.size(); 361 | thisWeight += constmovieWeight[i << 1]; 362 | } 363 | 364 | } 365 | if (change) { 366 | cs.curSum = simSum; 367 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 368 | cs.add(idInJoin(m, p, c)); 369 | cs.weight.emplace_back(thisWeight); 370 | if (verbose) 371 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 372 | 100. * cs.weight.size() / cs.siz); 373 | realAddOne(m, p, c); 374 | } 375 | return cs.norm * std::log(1. + cs.f_norm * simSum) - cs.curSum; 376 | } 377 | 378 | std::chrono::duration> testIMDBC(dtype PROP, 379 | idtype Large = 0, 380 | dtype epsilon = 0.01, 381 | int linear = 0, 382 | int cateNum = 5, 383 | int saveWhere = 0, 384 | int verbose = 1, 385 | int assignSampleSize = 0 386 | ) { 387 | fullCS.clear(); 388 | fullCSWeight.clear(); 389 | 390 | std::chrono::duration> sim_time(0); 391 | auto st = system_clock::now(); 392 | initHashMap(Large, linear, cateNum); 393 | auto en = system_clock::now(); 394 | auto duration = duration_cast(en - st); 395 | sim_time += duration; 396 | 397 | for (int cate = 0; cate < (linear == 0 ? 
cateNum : 87); cate++) { 398 | st = system_clock::now(); 399 | if (verbose)std::cout << "############# Current category is " << cate << " ##########\n"; 400 | 401 | loadToArr(cate, Large, linear, cateNum); 402 | if (verbose)std::cout << "title num is " << titleNum << "\n"; 403 | 404 | initWeight(verbose); 405 | mallocIMDBCSim(); 406 | calIMDBCSim(); 407 | 408 | if (verbose)std::cout << "join N is " << jN << "\n"; 409 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 410 | idtype csSize = (idtype) (PROP * jN); 411 | 412 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 413 | idtype sampleEachStep = 500; 414 | 415 | en = system_clock::now(); 416 | duration = duration_cast(en - st); 417 | sim_time += duration; 418 | std::vector Ms(sampleEachStep), Ps(sampleEachStep), Cs(sampleEachStep); 419 | 420 | 421 | cs.init(2 * (maxMovieID + 1), csSize); 422 | cs.f_norm = 1. / jN; 423 | 424 | if (verbose) 425 | std::cout << "company weight is " << companyWeight << " person weight is " << personWeight << "\n"; 426 | 427 | while (csSize--) { 428 | dtype curMaxBenefit = -1; 429 | idtype curMaxBenefitID = 0; 430 | 431 | sampleBatchIMDBC(sampleEachStep, Ms, Ps, Cs); 432 | 433 | std::vector benefit_vec(sampleEachStep); 434 | #pragma omp parallel for schedule(static) 435 | for (int i = 0; i < sampleEachStep; i++) 436 | benefit_vec[i] = getBenefitIMDBC(Ms[i], Ps[i], Cs[i], false); 437 | 438 | idtype i = 0; 439 | for (auto val : benefit_vec) { 440 | if (val > curMaxBenefit) { 441 | curMaxBenefit = val; 442 | curMaxBenefitID = i; 443 | } 444 | ++i; 445 | } 446 | i = curMaxBenefitID; 447 | if (verbose)std::cout << "Benefit is " << curMaxBenefit; 448 | getBenefitIMDBC(Ms[i], Ps[i], Cs[i], true, verbose); 449 | 450 | } 451 | 452 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 453 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 454 | 455 | } 456 | printf("Total coreset size 【%d】\n", fullCS.size()); 457 | 458 | std::cout << "@### 【Similarity】 Spent " 459 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 460 | 461 | assert(saveWhere==0); 462 | if (!saveWhere) { 463 | std::stringstream dir; 464 | dir.str(""); 465 | if (linear == 0) { 466 | if (cateNum == 10) 467 | dir << CSPATH << (Large ? "IMDBLargeC10" : "IMDBC10"); 468 | else 469 | dir << CSPATH << (Large ? "IMDBLargeC5" : "IMDBC5"); 470 | } else 471 | dir << CSPATH << (Large ? "IMDBLargeCLinear" : "IMDBCLinear"); 472 | dir<< "-"< 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace stackn { 16 | using std::chrono::system_clock; 17 | using std::chrono::duration_cast; 18 | using std::chrono::microseconds; 19 | std::random_device rd; 20 | std::mt19937 mt(rd()); 21 | 22 | 23 | dtype userW = 1. / 14, questionW = 2. / 14, answerW = 7. 
/ 14; 24 | 25 | cnpy::NpyArray userArr; 26 | cnpy::NpyArray questionArr; 27 | cnpy::NpyArray answerArr; 28 | cnpy::NpyArray joinArr; 29 | 30 | dtype *dp; 31 | dtype *user, *question, *answer, *join; 32 | idtype userNum, userDim, questionNum, questionDim, answerNum, answerDim, joinNum, joinDim; 33 | dtype *userSim, *questionSim, *answerSim; 34 | 35 | std::vector users; 36 | std::vector fullCS; 37 | std::vector fullCSWeight; 38 | 39 | 40 | void freeStackn() { 41 | free(user); 42 | free(answer); 43 | free(question); 44 | free(join); 45 | free(userSim); 46 | free(answerSim); 47 | free(questionSim); 48 | free(dp); 49 | } 50 | 51 | cnpy::NpyArray loadNpy(std::string fileDir) { 52 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 53 | return arr; 54 | } 55 | 56 | 57 | void readStacknNpy(int cate) { 58 | std::stringstream dir; 59 | dir.str(""); 60 | 61 | dir << DATAPATH + "stackn-formycs/train-" 62 | << cate << "-user.npy"; 63 | userArr = loadNpy(dir.str()); 64 | dir.str(""); 65 | 66 | dir << DATAPATH + "stackn-formycs/train-" 67 | << cate << "-answer.npy"; 68 | answerArr = loadNpy(dir.str()); 69 | dir.str(""); 70 | 71 | 72 | dir << DATAPATH + "stackn-formycs/train-" 73 | << cate << "-question.npy"; 74 | questionArr = loadNpy(dir.str()); 75 | dir.str(""); 76 | 77 | 78 | dir << DATAPATH + "stackn-formycs/train-" 79 | << cate << "-joined.npy"; 80 | joinArr = loadNpy(dir.str()); 81 | dir.str(""); 82 | } 83 | 84 | void mallocStacknArray() { 85 | 86 | 87 | userNum = userArr.shape[0]; 88 | userDim = joinArr.shape[1]; 89 | user = (dtype *) malloc(userNum * userDim * sizeof(dtype)); 90 | 91 | 92 | answerNum = answerArr.shape[0]; 93 | answerDim = answerArr.shape[1]; 94 | answer = (dtype *) malloc(answerNum * answerDim * sizeof(dtype)); 95 | 96 | 97 | questionNum = questionArr.shape[0]; 98 | questionDim = questionArr.shape[1]; 99 | question = (dtype *) malloc(questionNum * questionDim * sizeof(dtype)); 100 | 101 | 102 | joinNum = joinArr.shape[0]; 103 | joinDim = joinArr.shape[1]; 104 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 105 | } 106 | 107 | void loadToArr(int cate) { 108 | 109 | 110 | readStacknNpy(cate); 111 | mallocStacknArray(); 112 | 113 | memcpy(user, userArr.data(), 1LL * userNum * userDim * sizeof(dtype)); 114 | memcpy(answer, answerArr.data(), 1LL * answerNum * answerDim * sizeof(dtype)); 115 | memcpy(question, questionArr.data(), 1LL * questionNum * questionDim * sizeof(dtype)); 116 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 117 | 118 | 119 | 120 | for (int i = 0; i < joinNum; i++) 121 | join[i * joinDim + 5] = i; 122 | } 123 | 124 | void mallocStacknSim() { 125 | 126 | 127 | userSim = (dtype *) malloc(userNum * userNum * sizeof(dtype)); 128 | answerSim = (dtype *) malloc(answerNum * answerNum * sizeof(dtype)); 129 | questionSim = (dtype *) malloc(questionNum * questionNum * sizeof(dtype)); 130 | } 131 | 132 | void calStacknSim() { 133 | 134 | initSim(userSim, user, userNum, userDim, 1); 135 | 136 | initSim(answerSim, answer, answerNum, answerDim, 3); 137 | 138 | initSim(questionSim, question, questionNum, questionDim, 2); 139 | 140 | 141 | } 142 | 143 | std::vector joinIDs; 144 | 145 | void initWeight() { 146 | 147 | joinIDs.clear(); 148 | joinIDs.reserve(answerNum); 149 | dp = (dtype *) malloc(answerNum * sizeof(dtype)); 150 | memset(dp, 0, answerNum * sizeof(dtype)); 151 | for (int i = 0; i < answerNum; i++) 152 | joinIDs.emplace_back(i); 153 | } 154 | 155 | 156 | void sampleOneStackn(idtype & uID, 157 | idtype & ID, 158 | idtype & qID, 159 | 
idtype & rowID, 160 | idtype & samplejoinID) { 161 | 162 | int id = joinIDs[mt() % joinIDs.size()]; 163 | 164 | 165 | 166 | idtype idx_st = id * joinDim; 167 | 168 | uID = join[idx_st]; 169 | qID = join[idx_st + 1]; 170 | ID = join[idx_st + 2]; 171 | rowID = join[idx_st + 3]; 172 | samplejoinID = join[idx_st + 5]; 173 | } 174 | 175 | 176 | void sampleBatchStackn(int sampleSize, 177 | std::vector &uIDs, 178 | std::vector &IDs, 179 | std::vector &qIDs, 180 | std::vector &rowIDs, 181 | std::vector &joinIDs) { 182 | uIDs.resize(sampleSize); 183 | IDs.resize(sampleSize); 184 | qIDs.resize(sampleSize); 185 | rowIDs.resize(sampleSize); 186 | joinIDs.resize(sampleSize); 187 | 188 | for (int i = 0; i < sampleSize; i++) 189 | sampleOneStackn(uIDs[i], 190 | IDs[i], 191 | qIDs[i], 192 | rowIDs[i], 193 | joinIDs[i]); 194 | } 195 | 196 | void realAddOne(idtype joinID) { 197 | 198 | for (int i = 0; i < joinIDs.size(); i++) { 199 | if (joinIDs[i] == joinID) { 200 | std::swap(joinIDs[joinIDs.size() - 1], joinIDs[i]); 201 | joinIDs.pop_back(); 202 | break; 203 | } 204 | } 205 | } 206 | 207 | 208 | dtype getBenefitStackn(idtype uID, 209 | idtype ID, 210 | idtype qID, 211 | idtype rowID, 212 | idtype joinID, 213 | bool change = false, 214 | int verbose = 1) { 215 | 216 | 217 | dtype simSum = 0; 218 | dtype thisWeight = 0.; 219 | 220 | idtype sim_loc_user = uID * userNum; 221 | idtype sim_loc_answer = ID * answerNum; 222 | idtype sim_loc_question = qID * questionNum; 223 | 224 | 225 | 226 | idtype idx_loc = 0; 227 | for (int i = 0; i < answerNum; i++, idx_loc += answerDim) { 228 | idtype _id = answer[idx_loc]; 229 | idtype _uID = answer[idx_loc + 1]; 230 | idtype _qID = answer[idx_loc + 2]; 231 | 232 | dtype tempDP = answerSim[sim_loc_answer + _id] * answerW; 233 | tempDP += questionSim[sim_loc_question + _qID] * questionW; 234 | tempDP += userSim[sim_loc_user + _uID] * userW; 235 | 236 | 237 | if (tempDP > dp[i] && change) { 238 | dp[i] = tempDP; 239 | if (cs.nn[i] != -1) { 240 | cs.weight[cs.nn[i]] -= 1; 241 | } 242 | cs.nn[i] = cs.weight.size(); 243 | thisWeight += 1; 244 | } 245 | simSum += std::max(tempDP, dp[i]); 246 | } 247 | 248 | if (change) { 249 | cs.curSum = simSum; 250 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 251 | 252 | cs.add(rowID); 253 | cs.weight.emplace_back(thisWeight); 254 | if (verbose) 255 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 256 | 100. * cs.weight.size() / cs.siz); 257 | realAddOne(joinID); 258 | } 259 | 260 | return cs.norm * std::log(1. 
+ cs.f_norm * simSum) - cs.curSum; 261 | } 262 | 263 | 264 | std::chrono::duration> testStackn(dtype PROP, 265 | dtype epsilon = 0.01, 266 | int saveWhere = 0, 267 | int verbose = 1 268 | ) { 269 | fullCS.clear(); 270 | fullCSWeight.clear(); 271 | 272 | std::chrono::duration> sim_time(0); 273 | 274 | 275 | std::vector uIDs; 276 | std::vector IDs; 277 | std::vector qIDs; 278 | std::vector rowIDs; 279 | std::vector samplejoinIDs; 280 | 281 | 282 | for (int cate = 0; cate <= 18305; cate++) { 283 | auto st = system_clock::now(); 284 | if (verbose) 285 | std::cout << "############# Current category is " << cate << " ##########\n"; 286 | 287 | 288 | loadToArr(cate); 289 | initWeight(); 290 | 291 | mallocStacknSim(); 292 | calStacknSim(); 293 | 294 | 295 | assert(joinNum == answerNum); 296 | 297 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 298 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 299 | 300 | idtype csSize = (idtype) (PROP * joinNum + 0.5); 301 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 302 | 303 | 304 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 305 | 306 | 307 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 308 | if (ano < sampleEachStep) 309 | sampleEachStep = ano; 310 | 311 | cs.init(joinNum, csSize); 312 | cs.f_norm = 1. / joinNum; 313 | 314 | auto en = system_clock::now(); 315 | auto duration = duration_cast(en - st); 316 | sim_time += duration; 317 | 318 | while (csSize--) { 319 | dtype curMaxBenefit = -1; 320 | idtype curMaxBenefitID = 0; 321 | 322 | 323 | std::vector uIDs; 324 | std::vector IDs; 325 | std::vector qIDs; 326 | std::vector rowIDs; 327 | std::vector samplejoinIDs; 328 | 329 | sampleBatchStackn(sampleEachStep, uIDs, IDs, qIDs, rowIDs, samplejoinIDs); 330 | std::vector benefit_vec(sampleEachStep); 331 | 332 | #pragma omp parallel for schedule(static) 333 | for (int i = 0; i < sampleEachStep; i++) 334 | benefit_vec[i] = getBenefitStackn(uIDs[i], IDs[i], qIDs[i], rowIDs[i], samplejoinIDs[i], 0, 0); 335 | idtype i = 0; 336 | for (auto val : benefit_vec) { 337 | if (val > curMaxBenefit) { 338 | curMaxBenefit = val; 339 | curMaxBenefitID = i; 340 | } 341 | ++i; 342 | } 343 | i = curMaxBenefitID; 344 | 345 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 346 | benefit_vec[i] = getBenefitStackn(uIDs[i], IDs[i], qIDs[i], rowIDs[i], samplejoinIDs[i], 1, 0); 347 | } 348 | 349 | freeAll(); 350 | freeStackn(); 351 | 352 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 353 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 354 | 355 | 356 | if (verbose)std::cout << "Finished!\n"; 357 | } 358 | 359 | if (verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 360 | 361 | if (verbose) 362 | std::cout << "@### 【Similarity】 Spent " 363 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den 364 | << " seconds.\n"; 365 | 366 | assert(!saveWhere); 367 | if (!saveWhere) { 368 | std::stringstream dir; 369 | dir.str(""); 370 | dir< 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | namespace taxi { 17 | using std::chrono::system_clock; 18 | using std::chrono::duration_cast; 19 | using std::chrono::microseconds; 20 | std::random_device rd; 21 | std::mt19937 mt(rd()); 22 | 23 | dtype taxiW = 1. / 14, t5W = 2. / 14, t11W = 7. / 14, t16W = 2. / 14, t20W = 2. 
/ 14; 24 | 25 | 26 | cnpy::NpyArray taxiArr; 27 | cnpy::NpyArray t5Arr; 28 | cnpy::NpyArray t11Arr; 29 | cnpy::NpyArray t16Arr; 30 | cnpy::NpyArray t20Arr; 31 | cnpy::NpyArray joinArr; 32 | 33 | dtype *dp; 34 | 35 | 36 | dtype *taxi, *t5, *t11, *t16, *t20, *join; 37 | 38 | idtype taxiNum, taxiDim, t5Num, t5Dim, t11Num, t11Dim, t16Num, t16Dim, t20Num, t20Dim, joinNum, joinDim; 39 | 40 | dtype *taxiSim, *t5Sim, *t11Sim, *t16Sim, *t20Sim; 41 | 42 | 43 | 44 | std::vector movies; 45 | 46 | 47 | idtype *f642Weight; 48 | 49 | 50 | 51 | cnpy::NpyArray loadNpy(std::string fileDir); 52 | void 53 | readTaxiNpy(int cate); 54 | void 55 | mallocTaxiArray(); 56 | void 57 | loadToArr(int cate); 58 | 59 | void 60 | readTaxiNpyGlobal(); 61 | void 62 | mallocTaxiArrayGlobal(); 63 | void 64 | loadToArrGlobal(); 65 | 66 | 67 | 68 | void mallocTaxiSim(); 69 | void calTaxiSim(); 70 | 71 | void mallocTaxiSimGlobal(); 72 | void calTaxiSimGlobal(); 73 | 74 | 75 | void initWeight(); 76 | void initWeightGlobal(); 77 | 78 | 79 | 80 | void sampleOneTaxi(idtype &ID5, idtype &ID11, idtype &ID16, idtype &ID20, idtype &f642, idtype &rowID, 81 | idtype &joinID); 82 | void sampleBatchTaxi(int sampleSize, 83 | std::vector &ID5s, 84 | std::vector &ID11s, 85 | std::vector &ID16s, 86 | std::vector &ID20s, 87 | std::vector &f642s, 88 | std::vector &rowIDs, 89 | std::vector &joinIDs); 90 | 91 | void starDP(idtype ID5, 92 | idtype ID11, 93 | idtype ID16, 94 | idtype ID20, 95 | idtype f642); 96 | 97 | 98 | void realAddOne(idtype joinID); 99 | 100 | 101 | 102 | 103 | dtype getBenefitTaxi(idtype ID5, 104 | idtype ID11, 105 | idtype ID16, 106 | idtype ID20, 107 | idtype f642, 108 | idtype rowID, 109 | idtype joinID, 110 | bool change, 111 | int verbose); 112 | 113 | 114 | std::chrono::duration> testTaxi(dtype PROP, 115 | dtype epsilon, 116 | int saveWhere, 117 | int verbose 118 | ); 119 | 120 | std::vector fullCS; 121 | std::vector fullCSWeight; 122 | 123 | 124 | cnpy::NpyArray loadNpy(std::string fileDir) { 125 | 126 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 127 | return arr; 128 | } 129 | 130 | void readTaxiNpyGlobal() { 131 | std::stringstream dir; 132 | dir.str(""); 133 | 134 | dir << DATAPATH + "taxi-formycs/train-taxi.npy"; 135 | taxiArr = loadNpy(dir.str()); 136 | dir.str(""); 137 | 138 | dir << DATAPATH + "taxi-formycs/train-t5.npy"; 139 | t5Arr = loadNpy(dir.str()); 140 | dir.str(""); 141 | 142 | dir << DATAPATH + "taxi-formycs/train-t16.npy"; 143 | t16Arr = loadNpy(dir.str()); 144 | dir.str(""); 145 | 146 | dir << DATAPATH + "taxi-formycs/train-t20.npy"; 147 | t20Arr = loadNpy(dir.str()); 148 | dir.str(""); 149 | } 150 | 151 | void mallocTaxiArrayGlobal() { 152 | 153 | 154 | taxiNum = taxiArr.shape[0]; 155 | taxiDim = taxiArr.shape[1]; 156 | taxi = (dtype *) malloc(taxiNum * taxiDim * sizeof(dtype)); 157 | 158 | 159 | t5Num = t5Arr.shape[0]; 160 | t5Dim = t5Arr.shape[1]; 161 | t5 = (dtype *) malloc(t5Num * t5Dim * sizeof(dtype)); 162 | 163 | 164 | t16Num = t16Arr.shape[0]; 165 | t16Dim = t16Arr.shape[1]; 166 | t16 = (dtype *) malloc(t16Num * t16Dim * sizeof(dtype)); 167 | 168 | 169 | t20Num = t20Arr.shape[0]; 170 | t20Dim = t20Arr.shape[1]; 171 | t20 = (dtype *) malloc(t20Num * t20Dim * sizeof(dtype)); 172 | } 173 | 174 | void loadToArrGlobal() { 175 | 176 | readTaxiNpyGlobal(); 177 | mallocTaxiArrayGlobal(); 178 | memcpy(taxi, taxiArr.data(), 1LL * taxiNum * taxiDim * sizeof(dtype)); 179 | memcpy(t5, t5Arr.data(), 1LL * t5Num * t5Dim * sizeof(dtype)); 180 | memcpy(t16, t16Arr.data(), 1LL * t16Num * t16Dim * 
sizeof(dtype)); 181 | memcpy(t20, t20Arr.data(), 1LL * t20Num * t20Dim * sizeof(dtype)); 182 | } 183 | 184 | 185 | void readTaxiNpy(int cate) { 186 | std::stringstream dir; 187 | dir.str(""); 188 | 189 | 190 | dir << DATAPATH + "taxi-formycs/train-" 191 | << cate << "-joined.npy"; 192 | joinArr = loadNpy(dir.str()); 193 | dir.str(""); 194 | 195 | dir << DATAPATH + "taxi-formycs/train-" 196 | << cate << "-t11.npy"; 197 | t11Arr = loadNpy(dir.str()); 198 | dir.str(""); 199 | 200 | } 201 | 202 | void mallocTaxiArray() { 203 | 204 | 205 | joinNum = joinArr.shape[0]; 206 | joinDim = joinArr.shape[1]; 207 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 208 | 209 | 210 | t11Num = t11Arr.shape[0]; 211 | t11Dim = t11Arr.shape[1]; 212 | t11 = (dtype *) malloc(t11Num * t11Dim * sizeof(dtype)); 213 | } 214 | 215 | void loadToArr(int cate) { 216 | 217 | 218 | readTaxiNpy(cate); 219 | mallocTaxiArray(); 220 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 221 | memcpy(t11, t11Arr.data(), 1LL * t11Num * t11Dim * sizeof(dtype)); 222 | } 223 | 224 | 225 | void mallocTaxiSimGlobal() { 226 | 227 | 228 | taxiSim = (dtype *) malloc(taxiNum * taxiNum * sizeof(dtype)); 229 | t5Sim = (dtype *) malloc(t5Num * t5Num * sizeof(dtype)); 230 | t16Sim = (dtype *) malloc(t16Num * t16Num * sizeof(dtype)); 231 | t20Sim = (dtype *) malloc(t20Num * t20Num * sizeof(dtype)); 232 | } 233 | 234 | void calTaxiSimGlobal() { 235 | 236 | initSim(taxiSim, taxi, taxiNum, taxiDim, 1, taxiDim - 1); 237 | 238 | initSim(t5Sim, t5, t5Num, t5Dim, 2); 239 | 240 | initSim(t16Sim, t16, t16Num, t16Dim, 2); 241 | 242 | initSim(t20Sim, t20, t20Num, t20Dim, 2); 243 | } 244 | 245 | void mallocTaxiSim() { 246 | 247 | t11Sim = (dtype *) malloc(t11Num * t11Num * sizeof(dtype)); 248 | } 249 | 250 | void calTaxiSim() { 251 | 252 | initSim(t11Sim, t11, t11Num, t11Dim, 2); 253 | } 254 | 255 | dtype *tp, *tp2; 256 | std::vector f642s; 257 | 258 | void initWeightGlobal() { 259 | 260 | 261 | 262 | tp = (dtype *) malloc((taxiNum + 1) * sizeof(dtype)); 263 | tp2 = (dtype *) malloc((taxiNum + 1) * sizeof(dtype)); 264 | 265 | f642s.clear(); 266 | f642Weight = (idtype *) malloc(500 * sizeof(idtype)); 267 | 268 | 269 | 270 | for (int i = 0; i < taxiNum; i++) { 271 | idtype key = taxi[i * taxiDim]; 272 | 273 | f642s.emplace_back(key); 274 | f642Weight[key] = 1; 275 | int cnt = 0; 276 | for (int j = 0; j < t5Num; j++) { 277 | int loc = j * t5Dim + 1; 278 | if (key == t5[loc])++cnt; 279 | } 280 | f642Weight[key] *= cnt; 281 | 282 | cnt = 0; 283 | for (int j = 0; j < t16Num; j++) { 284 | int loc = j * t16Dim + 1; 285 | if (key == t16[loc])++cnt; 286 | } 287 | f642Weight[key] *= cnt; 288 | 289 | cnt = 0; 290 | for (int j = 0; j < t20Num; j++) { 291 | int loc = j * t20Dim + 1; 292 | if (key == t20[loc])++cnt; 293 | } 294 | f642Weight[key] *= cnt; 295 | } 296 | } 297 | 298 | 299 | std::vector joinIDs; 300 | 301 | void initWeight() { 302 | 303 | joinIDs.clear(); 304 | joinIDs.reserve(joinNum); 305 | dp = (dtype *) malloc(t11Num * sizeof(dtype)); 306 | memset(dp, 0, t11Num * sizeof(dtype)); 307 | for (int i = 0; i < joinNum; i++) 308 | joinIDs.emplace_back(i); 309 | } 310 | 311 | void sampleOneTaxi(idtype &ID5, 312 | idtype &ID11, 313 | idtype &ID16, 314 | idtype &ID20, 315 | idtype &f642, 316 | idtype &rowID, 317 | idtype &joinID) { 318 | 319 | int id = joinIDs[mt() % joinIDs.size()]; 320 | 321 | 322 | idtype idx_st = id * joinDim; 323 | 324 | f642 = join[idx_st]; 325 | ID5 = join[idx_st + 1]; 326 | ID11 = join[idx_st + 2]; 327 | ID16 
= join[idx_st + 3]; 328 | ID20 = join[idx_st + 4]; 329 | 330 | rowID = join[idx_st + joinDim - 1]; 331 | joinID = id; 332 | } 333 | 334 | void sampleBatchTaxi(int sampleSize, 335 | std::vector &ID5s, 336 | std::vector &ID11s, 337 | std::vector &ID16s, 338 | std::vector &ID20s, 339 | std::vector &f642s, 340 | std::vector &rowIDs, 341 | std::vector &joinIDs) { 342 | ID5s.resize(sampleSize); 343 | ID11s.resize(sampleSize); 344 | ID16s.resize(sampleSize); 345 | ID20s.resize(sampleSize); 346 | f642s.resize(sampleSize); 347 | rowIDs.resize(sampleSize); 348 | joinIDs.resize(sampleSize); 349 | 350 | for (int i = 0; i < sampleSize; i++) 351 | sampleOneTaxi(ID5s[i], 352 | ID11s[i], 353 | ID16s[i], 354 | ID20s[i], 355 | f642s[i], 356 | rowIDs[i], 357 | joinIDs[i]); 358 | } 359 | 360 | void realAddOne(idtype joinID) { 361 | 362 | for (int i = 0; i < joinIDs.size(); i++) { 363 | if (joinIDs[i] == joinID) { 364 | std::swap(joinIDs[joinIDs.size() - 1], joinIDs[i]); 365 | joinIDs.pop_back(); 366 | break; 367 | } 368 | } 369 | } 370 | 371 | 372 | void starDP(idtype ID5, 373 | idtype ID11, 374 | idtype ID16, 375 | idtype ID20, 376 | idtype f642) { 377 | 378 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 379 | idtype simloc = ID5 * t5Num; 380 | for (int i = 0; i < t5Num; i++) { 381 | idtype this_f642 = t5[i * t5Dim + 1]; 382 | dtype this_sim = t5Sim[simloc + i]; 383 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 384 | } 385 | for (int i = 0; i < taxiNum; i++) 386 | tp[i] += tp2[i] * t5W; 387 | 388 | 389 | 390 | simloc = ID16 * t16Num; 391 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 392 | for (int i = 0; i < t16Num; i++) { 393 | idtype this_f642 = t16[i * t16Dim + 1]; 394 | dtype this_sim = t16Sim[simloc + i]; 395 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 396 | } 397 | for (int i = 0; i < taxiNum; i++) 398 | tp[i] += tp2[i] * t16W; 399 | 400 | 401 | simloc = ID20 * t20Num; 402 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 403 | for (int i = 0; i < t20Num; i++) { 404 | idtype this_f642 = t20[i * t20Dim + 1]; 405 | dtype this_sim = t20Sim[simloc + i]; 406 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 407 | } 408 | for (int i = 0; i < taxiNum; i++) 409 | tp[i] += tp2[i] * t20W; 410 | 411 | 412 | 413 | simloc = f642 * taxiNum; 414 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 415 | 416 | for (int i = 0; i < taxiNum; i++) { 417 | idtype this_f642 = taxi[i * taxiDim]; 418 | dtype this_sim = taxiSim[simloc + i]; 419 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 420 | } 421 | for (int i = 0; i < taxiNum; i++) 422 | tp[i] += tp2[i] * taxiW; 423 | 424 | } 425 | 426 | dtype getBenefitTaxi(idtype ID5, 427 | idtype ID11, 428 | idtype ID16, 429 | idtype ID20, 430 | idtype f642, 431 | idtype rowID, 432 | idtype joinID, 433 | bool change = false, 434 | int verbose = 1) { 435 | memset(tp, 0, sizeof(dtype) * taxiNum); 436 | starDP(ID5, ID11, ID16, ID20, f642); 437 | 438 | dtype simSum = 0; 439 | dtype thisWeight = 0.; 440 | 441 | 442 | idtype sim_loc = ID11 * t11Num; 443 | 444 | for (int i = 0; i < t11Num; i++) { 445 | idtype tmp_f642 = t11[i * t11Dim + 1]; 446 | dtype tmp_dp = tp[tmp_f642]; 447 | dtype t11_sim = t11Sim[sim_loc + i]; 448 | tmp_dp += t11_sim * t11W; 449 | if (tmp_dp > dp[i] && change) { 450 | dp[i] = tmp_dp; 451 | if (cs.nn[i] != -1) { 452 | cs.weight[cs.nn[i]] -= f642Weight[tmp_f642]; 453 | } 454 | cs.nn[i] = cs.weight.size(); 455 | thisWeight += f642Weight[tmp_f642]; 456 | } 457 | simSum += std::max(tmp_dp, dp[i]) * f642Weight[tmp_f642]; 458 | } 459 | 460 | if (change) { 461 | 
cs.curSum = simSum; 462 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 463 | 464 | cs.add(rowID); 465 | cs.weight.emplace_back(thisWeight); 466 | if (verbose) 467 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 468 | 100. * cs.weight.size() / cs.siz); 469 | realAddOne(joinID); 470 | } 471 | 472 | return cs.norm * std::log(1. + cs.f_norm * simSum) - cs.curSum; 473 | } 474 | 475 | 476 | std::chrono::duration> testTaxi(dtype PROP, 477 | dtype epsilon = 0.01, 478 | int saveWhere = 0, 479 | int verbose = 1 480 | ) { 481 | fullCS.clear(); 482 | fullCSWeight.clear(); 483 | 484 | std::chrono::duration> sim_time(0); 485 | 486 | 487 | loadToArrGlobal(); 488 | mallocTaxiSimGlobal(); 489 | calTaxiSimGlobal(); 490 | initWeightGlobal(); 491 | 492 | std::vector ID5s; 493 | std::vector ID11s; 494 | std::vector ID16s; 495 | std::vector ID20s; 496 | std::vector f642s; 497 | std::vector rowIDs; 498 | std::vector samplejoinIDs; 499 | 500 | 501 | for (int cate = 0; cate <= 93; cate++) { 502 | auto st = system_clock::now(); 503 | if (verbose)std::cout << "############# Current category is " << cate << " ##########\n"; 504 | 505 | 506 | loadToArr(cate); 507 | initWeight(); 508 | 509 | mallocTaxiSim(); 510 | calTaxiSim(); 511 | 512 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 513 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 514 | 515 | idtype csSize = (idtype) (PROP * joinNum); 516 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 517 | 518 | 519 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 520 | 521 | 522 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 523 | if (ano < sampleEachStep) 524 | sampleEachStep = ano; 525 | 526 | cs.init(t11Num, csSize); 527 | cs.f_norm = 1. 
/ joinNum; 528 | 529 | auto en = system_clock::now(); 530 | auto duration = duration_cast(en - st); 531 | sim_time += duration; 532 | 533 | while (csSize--) { 534 | dtype curMaxBenefit = -1; 535 | idtype curMaxBenefitID = 0; 536 | 537 | sampleBatchTaxi(sampleEachStep, ID5s, ID11s, ID16s, ID20s, f642s, rowIDs, samplejoinIDs); 538 | std::vector benefit_vec(sampleEachStep); 539 | 540 | #pragma omp parallel for schedule(static) 541 | for (int i = 0; i < sampleEachStep; i++) 542 | benefit_vec[i] = getBenefitTaxi(ID5s[i], ID11s[i], ID16s[i], ID20s[i], f642s[i], rowIDs[i], 543 | samplejoinIDs[i], 0, 0); 544 | idtype i = 0; 545 | for (auto val : benefit_vec) { 546 | if (val > curMaxBenefit) { 547 | curMaxBenefit = val; 548 | curMaxBenefitID = i; 549 | } 550 | ++i; 551 | } 552 | i = curMaxBenefitID; 553 | 554 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 555 | getBenefitTaxi(ID5s[i], ID11s[i], ID16s[i], ID20s[i], f642s[i], rowIDs[i], samplejoinIDs[i], true, 0); 556 | } 557 | 558 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 559 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 560 | 561 | 562 | if (verbose)std::cout << "Finished!\n"; 563 | } 564 | 565 | if (verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 566 | 567 | if (verbose) 568 | std::cout << "@### 【Similarity】 Spent " 569 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den 570 | << " seconds.\n"; 571 | 572 | assert(!saveWhere); 573 | if (!saveWhere) { 574 | std::stringstream dir; 575 | dir.str(""); 576 | dir< 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | char * cur_time(){ 14 | time_t now = time(0); 15 | char* dt = ctime(&now); 16 | dt[strlen(dt) -1 ]='\0'; 17 | return dt; 18 | } 19 | 20 | void test(dtype *z){ 21 | for(idtype i = 0; i < std::min((idtype)20, n * d); i++) 22 | std::cout << z[i] <<" "; 23 | puts(""); 24 | return; 25 | } 26 | 27 | inline dtype distance(idtype idx, idtype idy){ 28 | 29 | dtype ret = 0; 30 | idx = Map[idx]; 31 | idy = Map[idy]; 32 | 33 | for(idtype i = 0; i < d; i++) 34 | ret += (X[idx * d + i] - X[idy * d + i]) * (X[idx * d + i] - X[idy * d + i]); 35 | return ret; 36 | } 37 | 38 | inline dtype tryAdd(dtype* cur, idtype element){ 39 | 40 | dtype sim_sum = 0; 41 | #pragma omp parallel for schedule (static) reduction(+:sim_sum) 42 | for(idtype i = 0; i < n; i++) 43 | sim_sum += std::max(cur[i], similarity[element * n + i]); 44 | 45 | return norm * std::log(1. + f_norm * sim_sum) - curSum; 46 | } 47 | inline void realAdd(dtype* cur, idtype element){ 48 | 49 | curSum = 0; 50 | for(idtype i = 0; i < n; i++){ 51 | if(similarity[element * n + i] > cur[i]) { 52 | cur[i] = similarity[element * n + i]; 53 | if(nn[i]!=-1) { 54 | 55 | --weight[nn[i]]; 56 | } 57 | nn[i] = element; 58 | ++weight[element]; 59 | } 60 | curSum += cur[i]; 61 | } 62 | curSum = norm * std::log(1. 
+ f_norm * curSum); 63 | coreset.emplace_back(element); 64 | ++cSize; 65 | return; 66 | } 67 | void initSimilarity(int verbose=1){ 68 | 69 | 70 | dtype max_similarity = 0; 71 | 72 | if(verbose)printf("Start to cal similarity %s\n",cur_time()); 73 | for(idtype i = 0; i < n; i++) 74 | #pragma omp parallel for schedule (static) 75 | for(idtype j = i + 1; j < n; j++){ 76 | similarity[i * n + j] = -distance(i, j); 77 | 78 | } 79 | if(verbose)printf("Finish to cal similarity %s\n",cur_time()); 80 | for(idtype i = 0; i < n; i++) 81 | for(idtype j = i + 1; j < n; j++){ 82 | 83 | max_similarity = std::max(max_similarity, -similarity[i * n + j]); 84 | } 85 | if(verbose)std::cout << "max similarity is " << max_similarity << "\n"; 86 | #pragma omp parallel for schedule (guided) 87 | for(idtype i = 0; i < n; i++) 88 | for(idtype j = i + 1; j < n; j++) { 89 | similarity[i * n + j] = (max_similarity + similarity[i * n + j]) / max_similarity; 90 | } 91 | 92 | for(idtype i = 0; i < n; i++) 93 | #pragma omp parallel for schedule (static) 94 | for(idtype j = 0; j < i; j++) 95 | similarity[i * n + j] = similarity[j * n + i]; 96 | for(idtype i = 0; i < n; i++) 97 | similarity[i * n + i] = 1; 98 | 99 | return; 100 | } 101 | void initPQ(){ 102 | 103 | while(!pq.empty())pq.pop(); 104 | for(idtype i = 1; i < n; i++) 105 | pq.push(std::make_pair(tryAdd(maxSim, i),i)); 106 | return; 107 | } 108 | 109 | 110 | void initCategories(int verbose=1){ 111 | 112 | N = n; 113 | cateNum.clear(); 114 | 115 | 116 | 117 | for(idtype i = 0; i < N; i++) { 118 | 119 | if(Y[i]==-1)Y[i]=0; 120 | 121 | ++cateNum[Y[i]]; 122 | } 123 | 124 | cateCnt = cateNum.size(); 125 | std::cout<<"cateCnt is "< lazyVec; 149 | bool cmpLazyIMDB(int i, int j){ 150 | return lazyVec[i] > lazyVec[j]; 151 | } 152 | 153 | bool cmpLazy(int i, int j){ 154 | return lazy[i] > lazy[j]; 155 | } 156 | 157 | 158 | inline dtype Dist(idtype u1, idtype u2, dtype * data, idtype dim, idtype st_id = 1, idtype end_id=-1){ 159 | 160 | dtype ret = 0.; 161 | if(end_id == -1) 162 | end_id = dim; 163 | idtype u1Loc = u1 * dim; 164 | idtype u2Loc = u2 * dim; 165 | for(idtype i = st_id; i < end_id; i++) 166 | ret += (data[u1Loc + i] - data[u2Loc + i]) * (data[u1Loc + i] - data[u2Loc + i]); 167 | return ret; 168 | } 169 | 170 | void initSim(dtype * sim, dtype * data, idtype num, idtype dim, idtype st_id=1,idtype end_id=-1){ 171 | 172 | if(end_id==-1) 173 | end_id = dim; 174 | dtype maxDis = 0.; 175 | #pragma omp parallel for schedule(guided) 176 | for(idtype i = 0; i < num; i++) { 177 | idtype now = i * num; 178 | for (idtype j = i + 1; j < num; j++) { 179 | sim[now + j] = -Dist(i, j, data, dim, st_id, end_id); 180 | } 181 | } 182 | idtype now = 0; 183 | for(idtype i = 0;i < num;i++) { 184 | for (idtype j = i + 1; j < num; j++) 185 | maxDis = std::max(maxDis, -sim[now+ j]); 186 | now += num; 187 | } 188 | 189 | 190 | #pragma omp parallel for schedule(guided) 191 | for(idtype i = 0; i < num; i++) { 192 | idtype now = i * num; 193 | for (idtype j = i + 1; j < num; j++) 194 | sim[now + j] = (maxDis + sim[now + j]) / maxDis; 195 | } 196 | 197 | #pragma omp parallel for schedule(guided) 198 | for(idtype i = 0; i < n; i++) 199 | 200 | for(idtype j = 0; j < i; j++) 201 | sim[i * num + j] = sim[j * num + i]; 202 | 203 | for(idtype i = 0; i < n; i++) 204 | sim[i * n + i] = 1; 205 | return; 206 | 207 | 208 | 209 | } 210 | 211 | 212 | struct CS{ 213 | int n; 214 | int siz; 215 | dtype curSum = 0.; 216 | 217 | dtype norm = 1./std::log(2.); 218 | dtype f_norm; 219 | 220 | idtype* nn; 221 | 
std::vector coresetAll; 222 | std::vector weight; 223 | 224 | void init(int n_, int size_){ 225 | n = n_; 226 | curSum = 0; 227 | siz = size_; 228 | coresetAll.clear(); 229 | weight.clear(); 230 | 231 | coresetAll.reserve(size_); 232 | weight.reserve(size_); 233 | 234 | nn = (idtype *)malloc(n * sizeof(idtype)); 235 | memset(nn, -1, n * sizeof(idtype)); 236 | f_norm = 1./(2. * n); 237 | } 238 | void add(idtype id_){ 239 | coresetAll.emplace_back(id_); 240 | } 241 | }cs; 242 | 243 | 244 | #endif 245 | 246 | -------------------------------------------------------------------------------- /linear-universal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from warnings import simplefilter 4 | simplefilter(action='ignore', category=FutureWarning) 5 | np.seterr(all='ignore') 6 | from MLModel.optimizer import * 7 | from MLModel.LoadData import * 8 | from MLModel.MLmodel.linearRegression import * 9 | from MLModel.paramRange import * 10 | from MLModel.LoadCoreset import * 11 | from MLModel.hidden import * 12 | 13 | def test(method='sgd', data='movieLen1M', exp_decay=1, subset_size=1., greedy=1, shuffle=0, g_cnt=-1., 14 | b_cnt=-1., num_runs=10, metric='', reg=1e-5, rand='', ne=-1, from_all=0,coreset_from='scratch', batch=1, sampleSize=0): 15 | train_data, train_target, val_data, val_target, test_data, test_target = load_dataset(data, regression=True) 16 | print("Dataset Loaded") 17 | 18 | g_range, b_range = get_param_range(subset_size, exp_decay, method, data) 19 | best_f_list = [] 20 | best_MAE_list = [] 21 | best_MSE_list = [] 22 | best_MSLE_list = [] 23 | 24 | train_time_list = [] 25 | 26 | for itr in range(num_runs): 27 | f_best, acc_best, b_f, g_f, b_a, g_a = 1e10, 0, 0, 0, 0, 0 28 | 29 | print("Cur itr is ", itr) 30 | if ne == -1: 31 | ne = 20 + int(np.ceil((1. 
/ subset_size) * 5)) + 5 if subset_size < 1 else 20 32 | else: 33 | rand += f'_e{ne}' 34 | if ne > 100: 35 | ne = 100 36 | # assert greedy == 1 37 | if greedy == 1: 38 | order, weights, total_ordering_time = LoadCoreset(coreset_from, data, subset_size, batch=batch,sampleSize=sampleSize) 39 | else: 40 | print('Selecting a random subset') 41 | order = np.arange(0, len(train_data)) 42 | random.shuffle(order) 43 | order = order[:int(subset_size * len(train_data))] 44 | print(' 【Random subset size】 is ', int(subset_size * len(train_data))) 45 | weights = np.ones(int(subset_size * len(train_data)), dtype=np.float) 46 | print(f'--------------- run number: {itr}, rand: {rand}, ' 47 | f'subset: {subset_size}, subset size: {len(order)}') 48 | 49 | best_test_f = 0 50 | best_test_MAE = 0 51 | best_test_MSE = 0 52 | best_test_MSLE = 0 53 | 54 | print("g_range is ", g_range) 55 | print("b_range is ", b_range) 56 | for gamma in g_range: 57 | for b in b_range: 58 | dim = len(train_data[0]) 59 | 60 | model = LinearRegression(dim) 61 | lr = gamma * np.power(b, np.arange(ne)) if exp_decay else gamma / (1 + b * np.arange(ne)) 62 | 63 | st_time = time.time() 64 | x_s, t_s = Optimizer().optimize( 65 | method, model, train_data[order, :], train_target[order], weights, ne, shuffle, lr, reg) 66 | en_time = time.time() 67 | print("Train time is ", en_time - st_time) 68 | train_time_list.append(en_time - st_time) 69 | 70 | f_s = model.loss(val_data, val_target, l2_reg=reg) 71 | 72 | print(f'data: {data}, method: {method}, run: {itr}, exp_decay: {exp_decay}, size: {subset_size} {rand} ' 73 | f'--> f: {f_s}, b: {b}, g: {gamma}') 74 | 75 | if f_s < f_best: 76 | x_a, g_a, b_a, t_a = x_s, gamma, b, t_s 77 | 78 | f_best = f_s 79 | 80 | best_test_f = model.loss(test_data, test_target) 81 | best_test_MAE, best_test_MSE, best_test_MSLE = model.MASLE(test_data, test_target) 82 | print("Current best f is ", f_best) 83 | print("Current best MAE is ", best_test_MAE) 84 | print("Current best MSE is ", best_test_MSE) 85 | print("Current best MSLE is ", best_test_MSLE) 86 | 87 | 88 | print(f'Best solution is => f: {f_best}, a: {acc_best}, b_f: {b_f}, g_f: {g_f}, b_a: {b_a}, g_a: {g_a}') 89 | 90 | 91 | best_f_list.append(f_best) 92 | best_MAE_list.append(best_test_MAE) 93 | best_MSE_list.append(best_test_MSE) 94 | best_MSLE_list.append(best_test_MSLE) 95 | 96 | print(" Current best f_list") 97 | print(best_f_list) 98 | print("Mean ", np.mean(best_f_list), "Max ", np.max(best_f_list), "Min ", np.min(best_f_list), 99 | "Median ", np.median(best_f_list)) 100 | 101 | print(" Current best MAE_list") 102 | print(best_MAE_list) 103 | print("Mean ", np.mean(best_MAE_list), "Max ", np.max(best_MAE_list), "Min ", np.min(best_MAE_list), 104 | "Median ", np.median(best_MAE_list)) 105 | 106 | 107 | print(" Current best MSE_list") 108 | print(best_MSE_list) 109 | print("Mean ", np.mean(best_MSE_list), "Max ", np.max(best_MSE_list), "Min ", np.min(best_MSE_list), 110 | "Median ", np.median(best_MSE_list)) 111 | 112 | print(" Current best MSLE_list") 113 | print(best_MSLE_list) 114 | print("Mean ", np.mean(best_MSLE_list), "Max ", np.max(best_MSLE_list), "Min ", np.min(best_MSLE_list), 115 | "Median ", np.median(best_MSLE_list)) 116 | 117 | 118 | print("Train time list(one hyper-param)") 119 | print(train_time_list) 120 | print("Mean ", np.mean(train_time_list), "Max ", np.max(train_time_list), "Min ", np.min(train_time_list), "Median ", np.median(train_time_list)) 121 | print('Finish') 122 | return best_MSE_list, train_time_list, best_f_list 123 | 
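# --- Illustrative sketch (editor's addition, not part of the original script) ---
# test() above grid-searches two learning-rate schedules: with exp_decay=1 the rate at
# epoch t is gamma * b**t, otherwise gamma / (1 + b * t), matching the `lr = ...` line
# inside the loop over (gamma, b). A minimal standalone helper, assuming only numpy:
import numpy as np

def make_lr_schedule(gamma, b, num_epochs, exp_decay=True):
    """Per-epoch learning rates for the exponential or inverse-decay schedule."""
    t = np.arange(num_epochs)
    return gamma * np.power(b, t) if exp_decay else gamma / (1 + b * t)

# e.g. make_lr_schedule(0.1, 0.95, 3) is roughly [0.1, 0.095, 0.09025]; the inverse
# schedule with the same parameters gives 0.1, 0.1/(1 + 0.95), 0.1/(1 + 1.9), ...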
124 | 125 | if __name__ == '__main__': 126 | p = argparse.ArgumentParser(description='Faster Training.') 127 | p.add_argument('--data', type=str, required=False, default='IMDB', 128 | choices=['IMDBCLinear','IMDBLargeCLinear','stackLinear', 'taxi', 'stackn'], help='name of dataset') 129 | p.add_argument('--greedy', type=int, required=False, default=1, 130 | help='greedy ordering') 131 | p.add_argument('--reg', type=float, required=False, default=1e-5, 132 | help='L2 regularization constant') 133 | p.add_argument('--method', type=str, required=False, default='sgd', 134 | choices=['sgd', 'svrg', 'saga', 'BGD'], help='sgd, svrg, saga, BGD') 135 | p.add_argument('--subset_size', '-s', type=float, required=False, 136 | help='size of the subset') 137 | p.add_argument('--shuffle', type=int, default=2, 138 | choices=[0, 1, 2, 3], 139 | help='0: not shuffling, 1: random permutation, 2: with replacement, 3: fixed permutation') 140 | p.add_argument('--exp_decay', type=int, required=False, default=1, 141 | choices=[0, 1], help='exponentially decaying learning rate') 142 | p.add_argument('--num_runs', type=int, required=False, default=10, 143 | help='number of runs') 144 | p.add_argument('--metric', type=str, required=False, default='l2', 145 | help='distance metric') 146 | p.add_argument('--b', type=float, required=False, default=-1, 147 | help='learning rate parameter b') 148 | p.add_argument('--g', type=float, required=False, default=-1, 149 | help='learning rate parameter g') 150 | p.add_argument('--ne', type=int, required=False, default=-1, 151 | help='number of epochs') 152 | p.add_argument('--grad_diff', type=int, required=False, default=0, 153 | help='number of epochs') 154 | p.add_argument('--from_all', type=int, required=False, default=0) 155 | p.add_argument('--coreset_from', type=str, required=False, default='diskOurs', 156 | choices=['diskOurs'], help='Where to load coreset') 157 | args = p.parse_args() 158 | 159 | if args.greedy == 0: 160 | rand = 'rand_nw' 161 | elif args.greedy == 1 and args.shuffle == 1: 162 | rand = 'grd_shuff' 163 | elif args.greedy == 1 and args.shuffle == 2: 164 | rand = 'grd_rand' 165 | elif args.greedy == 1 and args.shuffle == 0: 166 | rand = 'grd_ord' 167 | elif args.greedy == 1 and args.shuffle > 2: 168 | rand = 'grd_fix_perm' 169 | else: 170 | rand = '' 171 | 172 | print("Start test time", time.asctime( time.localtime(time.time()) )) 173 | test(method=args.method, data=args.data, exp_decay=args.exp_decay, subset_size=args.subset_size, 174 | greedy=args.greedy, shuffle=args.shuffle, b_cnt=args.b, g_cnt=args.g, num_runs=args.num_runs, 175 | metric=args.metric, rand=rand, ne=-1, from_all=args.from_all, 176 | coreset_from=args.coreset_from, reg=args.reg, batch=0) 177 | print("Finished test time", time.asctime(time.localtime(time.time()) )) 178 | 179 | -------------------------------------------------------------------------------- /logistic-universal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import time 4 | from os import path 5 | from warnings import simplefilter 6 | import warnings 7 | simplefilter(action='ignore', category=FutureWarning) 8 | warnings.filterwarnings('ignore') 9 | np.seterr(all='ignore') 10 | import random 11 | from MLModel.optimizer import * 12 | from MLModel.LoadData import * 13 | from MLModel.MLmodel.logisticRegression import * 14 | from MLModel.paramRange import * 15 | from MLModel.LoadCoreset import * 16 | from MLModel.hidden import * 17 | 18 | 19 | def 
test(method='sgd', data='movieLen1M', exp_decay=1, subset_size=1., greedy=1, shuffle=0, g_cnt=-1., 20 | b_cnt=-1., num_runs=10, metric='', reg=1e-5, rand='', ne=-1, from_all=0, 21 | coreset_from='scratch', 22 | batch=0, sampleSize=0): 23 | train_data, train_target, val_data, val_target, test_data, test_target = load_dataset(data) 24 | print("Dataset Loaded") 25 | print(np.unique(train_target)) 26 | if data in ['IMDBC5', 'IMDBLargeC5', 'Brazilnew']: 27 | num_class = 5 28 | print("num class is ", num_class) 29 | print(np.unique(test_target)) 30 | print("Class number is [{}]".format(num_class)) 31 | g_range, b_range = get_param_range(subset_size, exp_decay, method, data) 32 | 33 | 34 | x_runs_a = [[]] * num_runs 35 | if ne == -1: 36 | ne = 20 + int(np.ceil((1. / subset_size) * 5)) + 5 if subset_size < 1 else 20 37 | else: 38 | rand += f'_e{ne}' 39 | if ne > 100: 40 | ne = 100 41 | f_runs_a = np.zeros((num_runs, ne)) 42 | ft_runs_a = np.zeros((num_runs, ne)) 43 | acc_runs_a = np.zeros((num_runs, ne)) 44 | t_runs_a = np.zeros((num_runs, ne)) 45 | 46 | 47 | precision_runs_a = np.zeros((num_runs, ne)) 48 | recall_runs_a = np.zeros((num_runs, ne)) 49 | 50 | best_f1_list = [] 51 | best_f_list = [] 52 | best_acc_list = [] 53 | best_precision_list = [] 54 | best_recall_list = [] 55 | train_time_list = [] 56 | best_MAE_list = [] 57 | best_MSE_list = [] 58 | best_MSLE_list = [] 59 | 60 | for itr in range(num_runs): 61 | f_best, acc_best, b_f, g_f, b_a, g_a = 1e10, 0, 0, 0, 0, 0 62 | 63 | 64 | print("Cur itr is ", itr) 65 | # assert greedy == 1 66 | if greedy == 1: 67 | order, weights, total_ordering_time = LoadCoreset(coreset_from, data, subset_size, batch=batch, sampleSize=sampleSize) 68 | else: 69 | print('Selecting a random subset') 70 | 71 | order = np.arange(0, len(train_data)) 72 | random.shuffle(order) 73 | order = order[:int(subset_size * len(train_data))] 74 | print(' 【Random subset size】 is ', int(subset_size * len(train_data))) 75 | weights = np.ones(int(subset_size * len(train_data)), dtype=np.float) 76 | 77 | print(f'--------------- run number: {itr}, rand: {rand}, ' 78 | f'subset: {subset_size}, subset size: {len(order)}') 79 | 80 | best_test_f1 = 0 81 | best_test_acc = 0 82 | best_test_recall = 0 83 | best_test_precision = 0 84 | 85 | best_test_MAE = 0 86 | best_test_MSE = 0 87 | best_test_MSLE = 0 88 | 89 | print("g_range is ", g_range) 90 | print("b_range is ", b_range) 91 | for gamma in g_range: 92 | for b in b_range: 93 | 94 | 95 | dim = len(train_data[0]) 96 | 97 | model = LogisticRegression(dim, num_class) 98 | lr = gamma * np.power(b, np.arange(ne)) if exp_decay else gamma / (1 + b * np.arange(ne)) 99 | 100 | st_time = time.time() 101 | x_s, t_s = Optimizer().optimize( 102 | method, model, train_data[order, :], train_target[order], weights, ne, shuffle, lr, reg) 103 | en_time = time.time() 104 | print("Train time is ", en_time - st_time) 105 | train_time_list.append(en_time - st_time) 106 | 107 | f_s = model.loss(train_data, train_target, l2_reg=reg) 108 | acc_s = model.accuracy(val_data, val_target) 109 | print('acc_s is ',acc_s) 110 | f1_s = model.f1(val_data, val_target) 111 | 112 | print(f'data: {data}, method: {method}, run: {itr}, exp_decay: {exp_decay}, size: {subset_size} {rand} ' 113 | f'--> f: {f_s}, acc: {acc_s}, f1: {f1_s} b: {b}, g: {gamma}') 114 | # if f1_s > f1_best: 115 | if acc_s > acc_best: 116 | 117 | acc_best, x_a, g_a, b_a, t_a = acc_s, x_s, gamma, b, t_s 118 | 119 | f1_best = f1_s 120 | f_best = f_s 121 | 122 | x_runs_a[itr] = x_a 123 | t_runs_a[itr, :] = t_a 
124 | best_test_f1 = model.f1(test_data, test_target) 125 | best_test_precision = model.precision(test_data, test_target) 126 | best_test_recall = model.recall(test_data, test_target) 127 | best_test_acc = model.acc(test_data, test_target) 128 | 129 | best_test_MAE, best_test_MSE, best_test_MSLE = model.MASLE(test_data, test_target) 130 | print("### New best MAE is ", best_test_MAE) 131 | print("### New best MSE is ", best_test_MSE) 132 | print("### New best MSLE is ", best_test_MSLE) 133 | 134 | print("### New best f1 is [{}]".format(best_test_f1)) 135 | print("### New best precision is [{}]".format(best_test_precision)) 136 | print("### New best recall is [{}]".format(best_test_recall)) 137 | print("### New best acc is [{}]".format(best_test_acc)) 138 | 139 | 140 | print(f'Best solution is => f: {f_best}, a: {acc_best}, b_f: {b_f}, g_f: {g_f}, b_a: {b_a}, g_a: {g_a}') 141 | 142 | 143 | best_f1_list.append(best_test_f1) 144 | best_f_list.append(f_best) 145 | best_acc_list.append(best_test_acc) 146 | best_precision_list.append(best_test_precision) 147 | best_recall_list.append(best_test_recall) 148 | 149 | best_MAE_list.append(best_test_MAE) 150 | best_MSE_list.append(best_test_MSE) 151 | best_MSLE_list.append(best_test_MSLE) 152 | 153 | print(" Current best_f1_list") 154 | print(best_f1_list) 155 | print("Mean ", np.mean(best_f1_list), "Max ", np.max(best_f1_list), "Min ", np.min(best_f1_list), 156 | "Median ", np.median(best_f1_list)) 157 | 158 | print(" Current best f_list") 159 | print(best_f_list) 160 | print("Mean ", np.mean(best_f_list), "Max ", np.max(best_f_list), "Min ", np.min(best_f_list), 161 | "Median ", np.median(best_f_list)) 162 | print(" Current best acc_list") 163 | print(best_acc_list) 164 | print(" Current best recall_list") 165 | print(best_recall_list) 166 | print(" Current best precision_list") 167 | print(best_precision_list) 168 | 169 | print(" Current best MAE_list") 170 | print(best_MAE_list) 171 | print("Mean ", np.mean(best_MAE_list), "Max ", np.max(best_MAE_list), "Min ", np.min(best_MAE_list), 172 | "Median ", np.median(best_MAE_list)) 173 | 174 | print(" Current best MSE_list") 175 | print(best_MSE_list) 176 | print("Mean ", np.mean(best_MSE_list), "Max ", np.max(best_MSE_list), "Min ", np.min(best_MSE_list), 177 | "Median ", np.median(best_MSE_list)) 178 | 179 | print(" Current best MSLE_list") 180 | print(best_MSLE_list) 181 | print("Mean ", np.mean(best_MSLE_list), "Max ", np.max(best_MSLE_list), "Min ", np.min(best_MSLE_list), 182 | "Median ", np.median(best_MSLE_list)) 183 | 184 | print("Train time list(one hyper-param)") 185 | print(train_time_list) 186 | print("Mean ", np.mean(train_time_list), "Max ", np.max(train_time_list), "Min ", np.min(train_time_list), "Median ", np.median(train_time_list)) 187 | print('Finish') 188 | 189 | return best_acc_list,best_MSE_list,train_time_list,best_f_list 190 | 191 | 192 | if __name__ == '__main__': 193 | 194 | p = argparse.ArgumentParser(description='Faster Training.') 195 | p.add_argument('--exp_decay', type=int, required=False, default=1, 196 | choices=[0, 1], help='exponentially decaying learning rate') 197 | p.add_argument('--greedy', type=int, required=False, default=1, 198 | help='greedy ordering') 199 | p.add_argument('--reg', type=float, required=False, default=1e-5, 200 | help='L2 regularization constant') 201 | p.add_argument('--method', type=str, required=False, default='sgd', 202 | choices=['sgd', 'svrg', 'saga', 'BGD'], help='sgd, svrg, saga, BGD') 203 | p.add_argument('--subset_size', '-s', 
type=float, required=False, 204 | help='size of the subset') 205 | p.add_argument('--shuffle', type=int, default=2, 206 | choices=[0, 1, 2, 3], 207 | help='0: not shuffling, 1: random permutation, 2: with replacement, 3: fixed permutation') 208 | p.add_argument('--num_runs', type=int, required=False, default=10, 209 | help='number of runs') 210 | p.add_argument('--data', type=str, required=False, default='IMDB', 211 | choices=['IMDBC5','IMDBLargeC5', 'Brazilnew'], help='name of dataset') 212 | p.add_argument('--metric', type=str, required=False, default='l2', 213 | help='distance metric') 214 | p.add_argument('--b', type=float, required=False, default=-1, 215 | help='learning rate parameter b') 216 | p.add_argument('--g', type=float, required=False, default=-1, 217 | help='learning rate parameter g') 218 | p.add_argument('--grad_diff', type=int, required=False, default=0, 219 | help='number of epochs') 220 | p.add_argument('--from_all', type=int, required=False, default=0) 221 | p.add_argument('--coreset_from', type=str, required=False, default='diskOurs', 222 | choices=['diskOurs'], help='Where to load coreset') 223 | p.add_argument('--batch', type=int, required=False, default=0) 224 | 225 | args = p.parse_args() 226 | 227 | if args.greedy == 0: 228 | rand = 'rand_nw' 229 | elif args.greedy == 1 and args.shuffle == 1: 230 | rand = 'grd_shuff' 231 | elif args.greedy == 1 and args.shuffle == 2: 232 | rand = 'grd_rand' 233 | elif args.greedy == 1 and args.shuffle == 0: 234 | rand = 'grd_ord' 235 | elif args.greedy == 1 and args.shuffle > 2: 236 | rand = 'grd_fix_perm' 237 | else: 238 | rand = '' 239 | 240 | print("Start test time", time.asctime( time.localtime(time.time()) )) 241 | test(method=args.method, data=args.data, exp_decay=args.exp_decay, subset_size=args.subset_size, 242 | greedy=args.greedy, shuffle=args.shuffle, b_cnt=args.b, g_cnt=args.g, num_runs=args.num_runs, 243 | metric=args.metric, rand=rand, ne=-1, from_all=args.from_all, 244 | coreset_from=args.coreset_from, reg=args.reg,batch=0) 245 | print("Finished test time", time.asctime(time.localtime(time.time()) )) 246 | 247 | 248 | 249 | -------------------------------------------------------------------------------- /preprocess/Brazil.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import metrics 5 | import os 6 | 7 | X_train = [] 8 | X_test = [] 9 | y_train = [] 10 | y_test = [] 11 | 12 | import datetime 13 | def parseDatetime(s): 14 | pre, suf = s.split(' ') 15 | 16 | year_s, mon_s, day_s = pre.split('-') 17 | hour_s, minute_s, second_s = suf.split(':') 18 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 19 | 20 | def timeDelta(arrLike, col1, col2): 21 | purchase = parseDatetime(arrLike[col1]) 22 | approve = parseDatetime(arrLike[col2]) 23 | delta = approve - purchase 24 | return delta.total_seconds() 25 | 26 | DIR = '/home/jiayi/disk/C-craig/dataset/Brazil/' 27 | 28 | file = 'olist_order_reviews_dataset' 29 | review = pd.read_csv(DIR + file + '.csv') 30 | 31 | file = 'olist_orders_dataset.csv' 32 | order = pd.read_csv(DIR + file) 33 | 34 | file = 'olist_order_items_dataset.csv' 35 | orderItem = pd.read_csv(DIR + file) 36 | 37 | file = 'olist_products_dataset.csv' 38 | product = pd.read_csv(DIR + file) 39 | 40 | review = review[['review_id', 'order_id', 'review_score','review_creation_date', 
'review_answer_timestamp']].copy() 41 | order = order[['order_id', 'order_status', 'order_purchase_timestamp', 42 | 'order_approved_at', 'order_delivered_carrier_date', 43 | 'order_delivered_customer_date', 'order_estimated_delivery_date']].copy() 44 | orderItem = orderItem[['order_id', 'product_id', 45 | 'price', 'freight_value']].copy() 46 | product = product[['product_id', 'product_photos_qty']].copy() 47 | 48 | review.review_score = review.review_score - 1 49 | 50 | tmp = pd.merge(review, order) 51 | tmp = pd.merge(tmp, orderItem) 52 | tmp = pd.merge(tmp, product) 53 | 54 | print(tmp.shape) 55 | tmp.dropna(inplace=True) 56 | print(tmp.shape) 57 | 58 | tmp['approve'] = tmp.apply(timeDelta, axis=1, args=('order_purchase_timestamp', 59 | 'order_approved_at')) 60 | tmp['approve'] /= tmp['approve'].max() 61 | tmp['deliver'] = tmp.apply(timeDelta, axis=1, args=('order_approved_at', 62 | 'order_delivered_carrier_date')) 63 | tmp['deliver'] /= tmp['deliver'].max() 64 | tmp['arrive'] = tmp.apply(timeDelta, axis=1, args=('order_delivered_carrier_date', 65 | 'order_delivered_customer_date')) 66 | tmp['arrive'] /= tmp['arrive'].max() 67 | tmp['review'] = tmp.apply(timeDelta, axis=1, args=('review_creation_date', 68 | 'review_answer_timestamp')) 69 | tmp['review'] /= tmp['review'].max() 70 | 71 | tmp['faster'] = tmp.apply(timeDelta, axis=1, args=('order_delivered_customer_date', 72 | 'order_estimated_delivery_date')) 73 | tmp['faster'] /= tmp['faster'].max() 74 | isDelivered_idx = tmp[tmp['order_status'] == 'delivered'].index 75 | isCanceled_idx = tmp[tmp['order_status'] == 'canceled'].index 76 | tmp.loc[isDelivered_idx, 'order_status'] = 0 77 | tmp.loc[isCanceled_idx, 'order_status'] = 1 78 | col_list = [ 79 | 'review_score', 80 | 'order_status', 81 | 'approve', 82 | 'deliver', 83 | 'arrive', 84 | 'faster', 85 | 'review' 86 | ] 87 | 88 | tmp.drop([ 89 | 'review_creation_date','review_answer_timestamp', 90 | 'order_purchase_timestamp', 'order_approved_at', 91 | 'order_delivered_carrier_date', 'order_delivered_customer_date', 92 | 'order_estimated_delivery_date', 93 | ], axis=1, inplace=True) 94 | 95 | print(tmp.columns) 96 | print(tmp.shape) 97 | 98 | for col in tmp.columns: 99 | if col not in [ 100 | 'review_score', 101 | 'order_status', 102 | 'approve', 103 | 'deliver', 104 | 'arrive', 105 | 'faster', 106 | 'review', 107 | 'product_id','review_id', 'order_id', 108 | ]: 109 | print(col) 110 | tmp[col] = (tmp[col] - tmp[col].min()) / (tmp[col].max() - tmp[col].min()) 111 | 112 | tmp.drop_duplicates(keep='first',inplace=True) 113 | print(tmp.shape) 114 | 115 | print(tmp.columns) 116 | 117 | review = tmp[['review_id', 'order_id', 'review_score','review']].copy() 118 | 119 | order = tmp[['order_id', 'order_status', 'approve', 'deliver','arrive', 'faster']].copy() 120 | 121 | orderItem = tmp[['order_id', 'product_id', 122 | 'price', 'freight_value']].copy() 123 | 124 | product = tmp[['product_id', 'product_photos_qty']].copy() 125 | 126 | review.drop_duplicates(['order_id'],keep='first', inplace=True) 127 | review.drop_duplicates(['review_id'],keep='first', inplace=True) 128 | 129 | order.drop_duplicates(keep='first', inplace=True) 130 | orderItem.drop_duplicates(keep='first', inplace=True) 131 | product.drop_duplicates(keep='first', inplace=True) 132 | 133 | from sklearn.utils import shuffle 134 | rng=np.random.RandomState(123) 135 | review = shuffle(review, random_state=rng) 136 | 137 | print("All base data shape is ") 138 | print(review.shape) 139 | 140 | TrainProp = 0.5 141 | ValProp = 0.25 142 | 
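# --- Editor's note (illustrative, not part of the original script) ---
# The 50% / 25% / 25% split below is taken over the deduplicated, shuffled `review`
# table and only afterwards re-joined with order, orderItem and product, so each
# review_id can land in exactly one of train/val/test. A hypothetical disjointness
# check that could be run after the merges further down:
#
#   assert set(trainSet.review_id).isdisjoint(valSet.review_id)
#   assert set(trainSet.review_id).isdisjoint(testSet.review_id)
#   assert set(valSet.review_id).isdisjoint(testSet.review_id)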
TrainEnd = int(TrainProp * review.shape[0]) 143 | ValEnd = TrainEnd + int(ValProp * review.shape[0]) 144 | 145 | print(TrainEnd) 146 | print(ValEnd) 147 | 148 | trainReview = review[:TrainEnd].copy() 149 | valReview = review[TrainEnd:ValEnd].copy() 150 | testReview = review[ValEnd:].copy() 151 | 152 | trainSet = pd.merge(trainReview, order) 153 | trainSet = pd.merge(trainSet, orderItem) 154 | trainSet = pd.merge(trainSet, product) 155 | 156 | valSet = pd.merge(valReview, order) 157 | valSet = pd.merge(valSet, orderItem) 158 | valSet = pd.merge(valSet, product) 159 | 160 | testSet = pd.merge(testReview, order) 161 | testSet = pd.merge(testSet, orderItem) 162 | testSet = pd.merge(testSet, product) 163 | 164 | DIR = "/home/jiayi/disk/C-craig/dataset/" 165 | dataName = "Brazilnew" 166 | 167 | trainSet.to_csv(DIR + "{}-train.csv".format(dataName), index=False) 168 | valSet.to_csv(DIR + "{}-val.csv".format(dataName), index=False) 169 | testSet.to_csv(DIR + "{}-test.csv".format(dataName), index=False) 170 | 171 | y_train = trainSet.review_score.values 172 | y_val = valSet.review_score.values 173 | y_test = testSet.review_score.values 174 | 175 | trainSet.drop(['review_id', 'order_id','review_score','product_id' ], axis=1, inplace=True) 176 | valSet.drop(['review_id', 'order_id', 'review_score', 'product_id' ], axis=1, inplace=True) 177 | testSet.drop(['review_id', 'order_id','review_score', 'product_id' ], axis=1, inplace=True) 178 | 179 | X_train = np.ascontiguousarray(trainSet.astype(np.float64)) 180 | X_val = np.ascontiguousarray(valSet.astype(np.float64)) 181 | X_test = np.ascontiguousarray(testSet.astype(np.float64)) 182 | 183 | print(trainSet.shape) 184 | print(trainSet.columns) 185 | 186 | DIR = "/home/jiayi/disk/C-craig/dataset/" 187 | dataName = "Brazilnew" 188 | np.save(DIR + "{}-train-X.npy".format(dataName), X_train) 189 | np.save(DIR + "{}-test-X.npy".format(dataName), X_test) 190 | np.save(DIR + "{}-val-X.npy".format(dataName), X_val) 191 | 192 | np.save(DIR + "{}-train-y.npy".format(dataName), y_train) 193 | np.save(DIR + "{}-test-y.npy".format(dataName), y_test) 194 | np.save(DIR + "{}-val-y.npy".format(dataName), y_val) 195 | 196 | DIR = "/home/jiayi/disk/C-craig/dataset/" 197 | dataName = "Brazilnew" 198 | 199 | tmp = pd.read_csv(DIR + "{}-train.csv".format(dataName)) 200 | tmp['rowID'] = np.arange(tmp.shape[0]) 201 | 202 | dataName = "Brazilnew" 203 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 204 | 205 | tmp = tmp[['review_id', 'order_id', 'product_id', 'rowID', 206 | 'review_score', 'review', 'order_status', 207 | 'approve', 'deliver', 'arrive', 'faster', 'price', 208 | 'freight_value', 'product_photos_qty']].copy() 209 | 210 | for cate in range(5): 211 | train = tmp[tmp.review_score==cate].copy() 212 | 213 | le = preprocessing.LabelEncoder() 214 | le.fit(train.review_id) 215 | train.review_id = le.transform(train.review_id) 216 | 217 | le = preprocessing.LabelEncoder() 218 | le.fit(train.order_id) 219 | train.order_id = le.transform(train.order_id) 220 | 221 | le = preprocessing.LabelEncoder() 222 | le.fit(train.product_id) 223 | train.product_id = le.transform(train.product_id) 224 | 225 | 226 | train.to_csv(mycsDIR + "train-cate-{}-joined.csv".format(cate), index=False) 227 | tmp_ = np.ascontiguousarray(train.values.astype(np.float64)) 228 | np.save(mycsDIR + "train-cate-{}-joined.npy".format(cate), tmp_) 229 | 230 | 231 | review = train[['review_id', 'order_id', 'review_score','review']].copy() 232 | review.sort_values(by='review_id') 233 | 
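# Editor's note: sort_values() here and on the order/orderItem copies below returns a
# new DataFrame; without reassignment (or inplace=True) the frames written out keep
# their original row order. If sorted output is intended, e.g.:
#   review = review.sort_values(by='review_id')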
234 | order = train[['order_id', 'order_status', 'approve', 'deliver','arrive', 'faster']].copy() 235 | order.sort_values(by='order_id') 236 | 237 | orderItem = train[['order_id', 'rowID','product_id', 238 | 'price', 'freight_value']].copy() 239 | orderItem.sort_values(by='order_id') 240 | 241 | product = train[['product_id', 'product_photos_qty']].copy() 242 | 243 | 244 | review.drop_duplicates(keep='first', inplace=True) 245 | order.drop_duplicates(keep='first', inplace=True) 246 | orderItem.drop_duplicates(keep='first', inplace=True) 247 | product.drop_duplicates(keep='first', inplace=True) 248 | 249 | 250 | 251 | np.save(mycsDIR + 'train-cate-{}-review.npy'.format(cate), np.ascontiguousarray(review.values.astype(np.float64))) 252 | np.save(mycsDIR + 'train-cate-{}-order.npy'.format(cate), np.ascontiguousarray(order.values.astype(np.float64))) 253 | np.save(mycsDIR + 'train-cate-{}-orderItem.npy'.format(cate), np.ascontiguousarray(orderItem.values.astype(np.float64))) 254 | np.save(mycsDIR + 'train-cate-{}-product.npy'.format(cate), np.ascontiguousarray(product.values.astype(np.float64))) 255 | 256 | 257 | review.to_csv(mycsDIR + 'train-cate-{}-review.csv'.format(cate),index=False) 258 | order.to_csv(mycsDIR + 'train-cate-{}-order.csv'.format(cate), index=False) 259 | orderItem.to_csv(mycsDIR + 'train-cate-{}-orderItem.csv'.format(cate), index=False) 260 | product.to_csv(mycsDIR + 'train-cate-{}-product.csv'.format(cate), index=False) 261 | 262 | -------------------------------------------------------------------------------- /preprocess/IMDBC-5.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn import metrics 6 | import os 7 | from sklearn import preprocessing 8 | 9 | X_train = [] 10 | X_test = [] 11 | y_train = [] 12 | y_test = [] 13 | 14 | import datetime 15 | def parseDatetime(s): 16 | # print('s is ',s) 17 | pre, suf = s.split(' ') 18 | 19 | year_s, mon_s, day_s = pre.split('-') 20 | hour_s, minute_s, second_s = suf.split(':') 21 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 22 | 23 | def timeDelta(arrLike, col1, col2): 24 | purchase = parseDatetime(arrLike[col1]) 25 | approve = parseDatetime(arrLike[col2]) 26 | delta = approve - purchase 27 | return delta.total_seconds() 28 | 29 | from scipy import sparse 30 | def transMultihot(df, rowName, colName, IDName, onehotName='s'): 31 | 32 | tmp = df[colName].factorize() 33 | df.drop(colName,axis=1, inplace=True) 34 | df.insert(df.shape[1],colName,tmp[0]) 35 | 36 | values = np.ones(df.shape[0]) 37 | rows = df[rowName].values 38 | cols = df[colName].values 39 | 40 | sparse_matrix = sparse.coo_matrix((values, (rows,cols))) 41 | ar = sparse_matrix.toarray() 42 | sm = ar.sum(axis=1) 43 | 44 | idxs = sm>0 45 | IDs = np.arange(ar.shape[0]) 46 | 47 | IDs = IDs[idxs] 48 | ARs = ar[idxs] 49 | 50 | col_name_list = ['{}{}'.format(onehotName, i) for i in range(ARs.shape[1])] 51 | col_name_list = [IDName] + col_name_list 52 | 53 | assert IDs.shape[0] == ARs.shape[0] 54 | IDs = IDs.reshape(-1,1) 55 | 56 | z = np.concatenate((IDs,ARs),axis=1) 57 | 58 | 59 | ret = pd.DataFrame(z, columns=col_name_list) 60 | ret[IDName] = ret[IDName].astype(np.int64) 61 | return ret 62 | 63 | DIR = '/home/jiayi/disk/neurocard/datasets/job/' 64 | 65 | file = 'title.csv' 66 | title = pd.read_csv(DIR+file) 67 | 68 | file = 'info_type.csv' 69 | it = 
pd.read_csv(DIR+file) 70 | 71 | file = 'movie_info.csv' 72 | mi = pd.read_csv(DIR+file) 73 | 74 | file = 'movie_info_idx.csv' 75 | mix = pd.read_csv(DIR+file) 76 | 77 | file = 'name.csv' 78 | name = pd.read_csv(DIR+file) 79 | 80 | file = 'cast_info.csv' 81 | ci = pd.read_csv(DIR+file) 82 | 83 | file = 'movie_companies.csv' 84 | mc = pd.read_csv(DIR+file) 85 | 86 | file = 'company_name.csv' 87 | cn = pd.read_csv(DIR+file) 88 | 89 | def changeToFloor(arrLike, col): 90 | 91 | # def timeDelta(arrLike, col1, col2): 92 | colValue = arrLike[col] 93 | # colValue = np.floor(colValue) 94 | colValue = np.around(arrLike[col],0) 95 | 96 | colValue = np.floor(colValue/2) 97 | return colValue 98 | # purchase = parseDatetime(arrLike[col1]) 99 | # approve = parseDatetime(arrLike[col2]) 100 | # delta = approve - purchase 101 | # return delta.total_seconds() 102 | 103 | def LoadIMDBC(Large=0,dataName="", saveCSV=False): 104 | global X_train, X_test, X_val, y_val, y_train, y_test 105 | 106 | z = mix.copy() 107 | # print(z.groupby('id')) 108 | votes = z[z['info_type_id']==100].copy() 109 | rating = z[z['info_type_id']==101].copy() 110 | 111 | votes['info'] = votes['info'].astype(int) 112 | useVotes = votes[votes['info']>100].copy() 113 | 114 | useVotes.rename(columns={'info':'votes'},inplace=True) 115 | useVotes = useVotes[['movie_id', 'votes']] 116 | 117 | MAX = useVotes.votes.max() 118 | MIN = useVotes.votes.min() 119 | useVotes.votes = (useVotes.votes - MIN)/(MAX - MIN) 120 | 121 | rating['info'] = rating['info'].astype(np.double) 122 | useRating = rating.copy() 123 | 124 | useRating.rename(columns={'info':'rating'},inplace=True) 125 | 126 | useRating = useRating[['movie_id', 'rating']] 127 | useRating['rating'] = useRating['rating'].astype(np.double) 128 | 129 | 130 | 131 | 132 | 133 | useRating['rating'] = useRating.apply(changeToFloor, axis=1, args=['rating']) 134 | useRating.rating-=1 135 | 136 | # midLE = preprocessing.LabelEncoder() 137 | # midLE.fit(useRating.rating) 138 | # useRating['rating'] = midLE.transform(useRating.rating) 139 | 140 | # useRating['rating']=useRating['rating'].astype(int) 141 | 142 | useMIX = pd.merge(useVotes, useRating) 143 | print(useMIX.shape) 144 | print(useMIX.columns) 145 | 146 | useMI = mi.copy() 147 | color = useMI[useMI['info_type_id']==2].copy() 148 | genres = useMI[useMI['info_type_id']==3].copy() 149 | 150 | color.rename(columns={'info':'color'},inplace=True) 151 | 152 | color = color[['movie_id', 'color']] 153 | 154 | BWIndex = color[color['color']=='Black and White'].index 155 | ColorIndex = color[color['color']=='Color'].index 156 | color.loc[BWIndex,'color'] = 0 157 | color.loc[ColorIndex,'color'] = 1 158 | 159 | genres.rename(columns={'info':'genres'},inplace=True) 160 | genres = genres[['movie_id', 'genres']] 161 | genres.drop_duplicates(inplace=True) 162 | 163 | genres = transMultihot(genres, 'movie_id', 'genres', IDName='movie_id', onehotName='s') 164 | 165 | useMI = pd.merge(color, genres) 166 | 167 | print(useMI.shape) 168 | print(useMI.columns) 169 | 170 | if Large==0: 171 | useCI = ci[ci['role_id']==4].copy() 172 | else: 173 | useCI= ci.copy() 174 | # useCI = ci[ci['role_id']<=4].copy() 175 | 176 | useCI = useCI[['person_id', 'movie_id']] 177 | print(useCI.shape) 178 | print(useCI.columns) 179 | 180 | 181 | useNAME = name.copy() 182 | 183 | mIndex = useNAME[useNAME['gender']=='m'].index 184 | fIndex = useNAME[useNAME['gender']=='f'].index 185 | 186 | useNAME.loc[mIndex,'gender'] = 1 187 | useNAME.loc[fIndex,'gender'] = 0 188 | 189 | genderNA = 
~useNAME['gender'].isna() 190 | # purchaseNA = ~tmp['order_purchase_timestamp'].isna() 191 | 192 | useNAME = useNAME[genderNA] 193 | 194 | useNAME.rename(columns={'id':'person_id'},inplace=True) 195 | useNAME = useNAME[['person_id', 'gender']] 196 | print(useNAME.shape) 197 | print(useNAME.columns) 198 | 199 | 200 | useTITLE = title.copy() 201 | useTITLE.rename(columns={'id':'movie_id'},inplace=True) 202 | yearNA = ~useTITLE.production_year.isna() 203 | kindNA = ~useTITLE.kind_id.isna() 204 | yearNA = yearNA & kindNA 205 | useTITLE = useTITLE[yearNA] 206 | 207 | useTITLE = useTITLE[['movie_id', 'production_year','kind_id']].copy() 208 | MIN = useTITLE.production_year.min() 209 | MAX = useTITLE.production_year.max() 210 | 211 | useTITLE['production_year'] = (useTITLE['production_year'] - MIN)/(MAX - MIN) 212 | useTITLE = useTITLE.join(pd.get_dummies(useTITLE.kind_id)) 213 | useTITLE.rename(columns={1:'k1',2:'k2',3:'k3',4:'k4',6:'k6',7:'k7'},inplace=True) 214 | 215 | useTITLE.drop(['kind_id'],axis=1, inplace=True) 216 | print(useTITLE.shape) 217 | print(useTITLE.columns) 218 | 219 | 220 | 221 | 222 | useMC = mc.copy() 223 | useCN = cn.copy() 224 | useCN.rename(columns={'id':'company_id'},inplace=True) 225 | le = preprocessing.LabelEncoder() 226 | le.fit(useCN.country_code) 227 | useCN['country_code'] = le.transform(useCN.country_code) 228 | 229 | joinedMC = pd.merge(useMC, useCN) 230 | 231 | tMC = joinedMC[['company_id', 'country_code','movie_id']].copy() 232 | 233 | 234 | MAX = tMC.country_code.max() 235 | tMC['country_code'] = (tMC['country_code']/MAX) 236 | 237 | 238 | 239 | useTITLE.drop_duplicates(useTITLE.columns,inplace=True) 240 | useMIX.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 241 | useMIX.drop_duplicates(useMIX.columns,inplace=True) 242 | useCI.drop_duplicates(useCI.columns,inplace=True) 243 | useNAME.drop_duplicates(useNAME.columns,inplace=True) 244 | useMI.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 245 | useMI.drop_duplicates(useMI.columns,inplace=True) 246 | 247 | 248 | 249 | useTITLE.drop_duplicates(inplace=True) 250 | useCI.drop_duplicates(inplace=True) 251 | useNAME.drop_duplicates(inplace=True) 252 | useMI.drop_duplicates(inplace=True) 253 | useMIX.drop_duplicates(inplace=True) 254 | tMC.drop_duplicates(inplace=True) 255 | 256 | 257 | 258 | 259 | 260 | z = pd.merge(useTITLE, useMIX) 261 | print(z.shape) 262 | z = pd.merge(z, useCI) 263 | print(z.shape) 264 | z = pd.merge(z, useNAME) 265 | print(z.shape) 266 | z = pd.merge(z, useMI) 267 | print(z.shape) 268 | 269 | 270 | 271 | print(z.columns) 272 | print(z.shape) 273 | 274 | print("##### Join company ") 275 | z = pd.merge(z, tMC) 276 | print(z.columns) 277 | print(z.shape) 278 | 279 | 280 | from sklearn.utils import shuffle 281 | z = shuffle(z, random_state=123) 282 | 283 | # trainSize = int(0.5 * z.shape[0]) 284 | # valSize = trainSize + int(0.25 * z.shape[0]) 285 | 286 | 287 | 288 | 289 | movieIDs= title.id.unique() 290 | movieIDs = shuffle(movieIDs, random_state=123) 291 | 292 | 293 | trainSize = int(0.5 * movieIDs.shape[0]) 294 | valSize = trainSize + int(0.25 * movieIDs.shape[0]) 295 | 296 | 297 | trainMovies = movieIDs[:trainSize] 298 | trainTMP = pd.DataFrame(trainMovies.reshape(-1,1), columns=["movie_id"]) 299 | trainData = pd.merge(z, trainTMP) 300 | 301 | valMovies = movieIDs[trainSize:valSize] 302 | valTMP = pd.DataFrame(valMovies.reshape(-1,1), columns=["movie_id"]) 303 | valData = pd.merge(z, valTMP) 304 | 305 | testMovies = movieIDs[valSize:] 306 | testTMP = 
pd.DataFrame(testMovies.reshape(-1,1), columns=["movie_id"]) 307 | testData = pd.merge(z, testTMP) 308 | 309 | 310 | 311 | 312 | 313 | y_train = trainData.rating.values 314 | y_val = valData.rating.values 315 | y_test = testData.rating.values 316 | 317 | 318 | if saveCSV: 319 | trainData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName), index=False) 320 | valData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-val.csv'.format(dataName), index=False) 321 | testData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-test.csv'.format(dataName), index=False) 322 | 323 | 324 | trainData.drop(['rating'], axis=1, inplace=True) 325 | valData.drop(['rating'], axis=1, inplace=True) 326 | testData.drop(['rating'], axis=1, inplace=True) 327 | 328 | trainData.drop(['person_id','movie_id', 'company_id'], axis=1, inplace=True) 329 | valData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 330 | testData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 331 | 332 | 333 | print("Train Data shape ", trainData.shape) 334 | print("Test Data shape ", testData.shape) 335 | print("Val Data shape ", valData.shape) 336 | 337 | print(trainData.columns) 338 | X_train = np.ascontiguousarray(trainData.values.astype(np.float64)) 339 | X_val = np.ascontiguousarray(valData.values.astype(np.float64)) 340 | X_test = np.ascontiguousarray(testData.values.astype(np.float64)) 341 | 342 | print(X_train.shape) 343 | print(y_train.shape) 344 | 345 | return z 346 | 347 | dataNameList = ["IMDBC5", "IMDBLargeC5"] 348 | parameterList = [0, 1] 349 | 350 | for dataName, param in zip(dataNameList, parameterList): 351 | LoadIMDBC(param, dataName, saveCSV=True) 352 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-X.npy".format(dataName),X_train) 353 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-y.npy".format(dataName),y_train) 354 | 355 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-X.npy".format(dataName),X_val) 356 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-y.npy".format(dataName),y_val) 357 | 358 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-X.npy".format(dataName),X_test) 359 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-y.npy".format(dataName),y_test) 360 | 361 | # dataName = "IMDBC" 362 | dataName = "IMDBLargeC5" 363 | # dataName = "IMDBC5" 364 | 365 | df = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName)) 366 | 367 | print(df.columns) 368 | print(df.shape) 369 | 370 | print(np.unique(df.rating)) 371 | 372 | midLE = preprocessing.LabelEncoder() 373 | midLE.fit(df.movie_id) 374 | df['movie_id'] = midLE.transform(df.movie_id) 375 | 376 | pidLE = preprocessing.LabelEncoder() 377 | pidLE.fit(df.person_id) 378 | df['person_id'] = pidLE.transform(df.person_id) 379 | 380 | cidLE = preprocessing.LabelEncoder() 381 | cidLE.fit(df.company_id) 382 | df['company_id'] = cidLE.transform(df.company_id) 383 | 384 | PROP = 1 385 | trainData = df.values 386 | print(trainData.shape) 387 | print(trainData[:5,:]) 388 | np.save('/home/jiayi/disk/C-craig/dataset/{}-joined-prop-{}.npy'.format(dataName, PROP), np.ascontiguousarray(trainData.astype(np.float64))) 389 | 390 | print(midLE.classes_.shape) 391 | print(pidLE.classes_.shape) 392 | print(cidLE.classes_.shape) 393 | num = midLE.classes_.shape[0] 394 | num = num * pidLE.classes_.shape[0] 395 | num = num * cidLE.classes_.shape[0] 396 | print(num) 397 | assert num< 4* (10**18) 398 | print("\n【 Passed 】") 399 | 400 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 401 | print(uni.shape) 
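# Note: the "assert num < 4 * (10**18)" check above appears to be a sanity guard that a
# single signed 64-bit key has enough range to give every
# (movie_id, person_id, company_id) combination a distinct id
# (int64 tops out near 9.22 * 10**18). The loop further below packs the three
# label-encoded ids into such a key as (m+1) + (p+1)*10**5 + (c+1)*10**11.
# A hypothetical decoder for that layout (illustrative only, never called in
# this script), assuming m+1 < 10**5 and p+1 < 10**6 so the decimal fields
# cannot collide -- bounds that are not verified here:
def decode_row_key(key):
    c = key // 10**11 - 1
    p = (key % 10**11) // 10**5 - 1
    m = key % 10**5 - 1
    return m, p, c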
402 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 403 | print(uni.shape) 404 | 405 | print(df.shape) 406 | 407 | uni = df[['movie_id', 'person_id', 'company_id']].copy() 408 | # uni = np.unique(df[['movie_id', 'person_id', 'company_id']], axis=0) 409 | print(uni.shape) 410 | print(uni) 411 | rowNumMap = np.zeros((uni.shape[0],2), np.int64) 412 | i = 0 413 | # for row in uni.values: 414 | # print(row) 415 | for row in uni.values: 416 | # if row.sha 417 | # print(row) 418 | # print(row.shape) 419 | x,y,z = row 420 | x = np.int64(x) 421 | y = np.int64(y) 422 | z = np.int64(z) 423 | 424 | rowNumMap[i,0] = (x+1) + (y+1)*(10**5) + (z+1) * (10**11) 425 | assert 0 <= rowNumMap[i,0] and rowNumMap[i,0] < (4*(10**18)) 426 | # print(rowNumMap[i,0]) 427 | rowNumMap[i,1] = i 428 | i = i + 1 429 | 430 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 431 | print(mycsDIR) 432 | np.save(mycsDIR + 'idMap.npy', np.ascontiguousarray(rowNumMap)) 433 | 434 | print(rowNumMap) 435 | 436 | CATE = 10 437 | 438 | Databackup = df.copy() 439 | 440 | for cate in range(CATE): 441 | print("#"*10 ,' '*5, '【cate】 ', cate, ' '*10, '#'*10) 442 | trainData = Databackup[Databackup['rating'] == cate] 443 | 444 | 445 | mixColumns = ['movie_id', 'votes', 'rating'] 446 | mixNotUniqued = trainData[mixColumns].copy() 447 | mixUniqued = mixNotUniqued.drop_duplicates(mixNotUniqued.columns).copy() 448 | mixUniqued.sort_values(['movie_id'], inplace=True) 449 | print('【Movie_info_idx】') 450 | print(mixUniqued.shape) 451 | print(len(np.unique(mixUniqued.movie_id))) 452 | 453 | 454 | miColumns = ['movie_id', 'color', 's0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 455 | 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 456 | 's18', 's19', 's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27', 457 | 's28', 's29'] 458 | miNotUniqued = trainData[miColumns].copy() 459 | miUniqued = miNotUniqued.drop_duplicates(miNotUniqued.columns).copy() 460 | miUniqued.sort_values(['movie_id'], inplace=True) 461 | print('【Movie_info】') 462 | print(miUniqued.shape) 463 | print(len(np.unique(miUniqued.movie_id))) 464 | 465 | 466 | 467 | ciColumns = ['person_id', 'movie_id'] 468 | ciNotUniqued = trainData[ciColumns].copy() 469 | ciUniqued = ciNotUniqued.drop_duplicates(ciNotUniqued.columns).copy() 470 | print('【Cast_info】') 471 | print(ciUniqued.shape) 472 | print('in cast_info movie_id unique ', len(np.unique(ciUniqued.movie_id))) 473 | print('in cast_info person_id unique ', len(np.unique(ciUniqued.person_id))) 474 | 475 | 476 | nameColumns = ['person_id', 'gender'] 477 | nameNotUniqued = trainData[nameColumns].copy() 478 | nameUniqued = nameNotUniqued.drop_duplicates(nameNotUniqued.columns).copy() 479 | nameUniqued.sort_values(['person_id'], inplace=True) 480 | print('【Name】') 481 | print(nameUniqued.shape) 482 | print(len(np.unique(nameUniqued.person_id))) 483 | 484 | 485 | titleColumns = ['movie_id', 'production_year', 'k1', 'k2', 'k3', 'k4', 'k6', 'k7'] 486 | titleNotUniqued = trainData[titleColumns].copy() 487 | titleUniqued = titleNotUniqued.drop_duplicates(titleNotUniqued.columns).copy() 488 | titleUniqued.sort_values(['movie_id'], inplace=True) 489 | print('【Title】') 490 | print(titleUniqued.shape) 491 | print(len(np.unique(titleUniqued.movie_id))) 492 | 493 | 494 | mcColumns = ['movie_id', 'company_id', 'country_code'] 495 | mcNotUniqued = trainData[mcColumns].copy() 496 | mcUniqued = mcNotUniqued.drop_duplicates(mcNotUniqued.columns).copy() 497 | mcUniqued.sort_values(['movie_id'], 
inplace=True) 498 | print('【Movie Company】') 499 | print(mcUniqued.shape) 500 | print(len(np.unique(mcUniqued.movie_id))) 501 | 502 | 503 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 504 | np.save(mycsDIR + 'train-cate-{}-mix.npy'.format(cate), np.ascontiguousarray(mixUniqued.values.astype(np.float64))) 505 | np.save(mycsDIR + 'train-cate-{}-mi.npy'.format(cate), np.ascontiguousarray(miUniqued.values.astype(np.float64))) 506 | np.save(mycsDIR + 'train-cate-{}-ci.npy'.format(cate), np.ascontiguousarray(ciUniqued.values.astype(np.float64))) 507 | np.save(mycsDIR + 'train-cate-{}-name.npy'.format(cate), np.ascontiguousarray(nameUniqued.values.astype(np.float64))) 508 | np.save(mycsDIR + 'train-cate-{}-title.npy'.format(cate), np.ascontiguousarray(titleUniqued.values.astype(np.float64))) 509 | np.save(mycsDIR + 'train-cate-{}-mc.npy'.format(cate), np.ascontiguousarray(mcUniqued.values.astype(np.float64))) 510 | 511 | 512 | mixUniqued.to_csv(mycsDIR + 'train-cate-{}-mix.csv'.format(cate),index=False) 513 | miUniqued.to_csv(mycsDIR + 'train-cate-{}-mi.csv'.format(cate), index=False) 514 | ciUniqued.to_csv(mycsDIR + 'train-cate-{}-ci.csv'.format(cate), index=False) 515 | nameUniqued.to_csv(mycsDIR + 'train-cate-{}-name.csv'.format(cate), index=False) 516 | titleUniqued.to_csv(mycsDIR + 'train-cate-{}-title.csv'.format(cate), index=False) 517 | mcUniqued.to_csv(mycsDIR + 'train-cate-{}-mc.csv'.format(cate), index=False) 518 | 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /preprocess/IMDBC-Linear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn import metrics 8 | import os 9 | from sklearn import preprocessing 10 | 11 | X_train = [] 12 | X_test = [] 13 | y_train = [] 14 | y_test = [] 15 | 16 | from scipy import sparse 17 | def transMultihot(df, rowName, colName, IDName, onehotName='s'): 18 | tmp = df[colName].factorize() 19 | df.drop(colName,axis=1, inplace=True) 20 | df.insert(df.shape[1],colName,tmp[0]) 21 | 22 | values = np.ones(df.shape[0]) 23 | rows = df[rowName].values 24 | cols = df[colName].values 25 | 26 | sparse_matrix = sparse.coo_matrix((values, (rows,cols))) 27 | ar = sparse_matrix.toarray() 28 | sm = ar.sum(axis=1) 29 | 30 | idxs = sm>0 31 | IDs = np.arange(ar.shape[0]) 32 | 33 | IDs = IDs[idxs] 34 | ARs = ar[idxs] 35 | 36 | col_name_list = ['{}{}'.format(onehotName, i) for i in range(ARs.shape[1])] 37 | col_name_list = [IDName] + col_name_list 38 | 39 | assert IDs.shape[0] == ARs.shape[0] 40 | IDs = IDs.reshape(-1,1) 41 | 42 | z = np.concatenate((IDs,ARs),axis=1) 43 | 44 | 45 | ret = pd.DataFrame(z, columns=col_name_list) 46 | ret[IDName] = ret[IDName].astype(np.int64) 47 | return ret 48 | 49 | get_ipython().run_cell_magic('time', '', "\nDIR = '/home/jiayi/disk/neurocard/datasets/job/'\n\nfile = 'title.csv'\ntitle = pd.read_csv(DIR+file)\n\nfile = 'info_type.csv'\nit = pd.read_csv(DIR+file)\n\nfile = 'movie_info.csv'\nmi = pd.read_csv(DIR+file)\n\nfile = 'movie_info_idx.csv'\nmix = pd.read_csv(DIR+file)\n\nfile = 'name.csv'\nname = pd.read_csv(DIR+file)\n\nfile = 'cast_info.csv'\nci = pd.read_csv(DIR+file)\n\nfile = 'movie_companies.csv'\nmc = pd.read_csv(DIR+file)\n\nfile = 'company_name.csv'\ncn = pd.read_csv(DIR+file)") 50 | 51 | def changeToFloor(arrLike, col): 52 | 53 | colValue = 
arrLike[col] 54 | colValue = np.around(arrLike[col],1) 55 | 56 | return colValue 57 | 58 | def LoadIMDBC(Large=0,dataName="", saveCSV=False, useFor='test'): 59 | global X_train, X_test, X_val, y_val, y_train, y_test 60 | 61 | z = mix.copy() 62 | 63 | votes = z[z['info_type_id']==100].copy() 64 | rating = z[z['info_type_id']==101].copy() 65 | 66 | votes['info'] = votes['info'].astype(int) 67 | useVotes = votes[votes['info']>100].copy() 68 | 69 | useVotes.rename(columns={'info':'votes'},inplace=True) 70 | useVotes = useVotes[['movie_id', 'votes']] 71 | 72 | MAX = useVotes.votes.max() 73 | MIN = useVotes.votes.min() 74 | useVotes.votes = (useVotes.votes - MIN)/(MAX - MIN) 75 | 76 | rating['info'] = rating['info'].astype(np.double) 77 | useRating = rating.copy() 78 | 79 | useRating.rename(columns={'info':'rating'},inplace=True) 80 | 81 | useRating = useRating[['movie_id', 'rating']] 82 | useRating['rating'] = useRating['rating'].astype(np.double) 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | useRating['rating'] = useRating.apply(changeToFloor, axis=1, args=['rating']) 91 | 92 | if useFor == 'train': 93 | midLE = preprocessing.LabelEncoder() 94 | midLE.fit(useRating.rating) 95 | useRating['rating'] = midLE.transform(useRating.rating) 96 | 97 | useRating['rating']=useRating['rating'].astype(int) 98 | useRating.rating -=1 99 | 100 | useMIX = pd.merge(useVotes, useRating) 101 | print(useMIX.shape) 102 | print(useMIX.columns) 103 | 104 | useMI = mi.copy() 105 | color = useMI[useMI['info_type_id']==2].copy() 106 | genres = useMI[useMI['info_type_id']==3].copy() 107 | 108 | color.rename(columns={'info':'color'},inplace=True) 109 | 110 | color = color[['movie_id', 'color']] 111 | 112 | BWIndex = color[color['color']=='Black and White'].index 113 | ColorIndex = color[color['color']=='Color'].index 114 | color.loc[BWIndex,'color'] = 0 115 | color.loc[ColorIndex,'color'] = 1 116 | 117 | genres.rename(columns={'info':'genres'},inplace=True) 118 | genres = genres[['movie_id', 'genres']] 119 | genres.drop_duplicates(inplace=True) 120 | 121 | genres = transMultihot(genres, 'movie_id', 'genres', IDName='movie_id', onehotName='s') 122 | 123 | useMI = pd.merge(color, genres) 124 | 125 | print(useMI.shape) 126 | print(useMI.columns) 127 | 128 | if Large==0: 129 | useCI = ci[ci['role_id']==4].copy() 130 | else: 131 | useCI= ci.copy() 132 | 133 | useCI = useCI[['person_id', 'movie_id']] 134 | print(useCI.shape) 135 | print(useCI.columns) 136 | 137 | 138 | useNAME = name.copy() 139 | 140 | mIndex = useNAME[useNAME['gender']=='m'].index 141 | fIndex = useNAME[useNAME['gender']=='f'].index 142 | 143 | useNAME.loc[mIndex,'gender'] = 1 144 | useNAME.loc[fIndex,'gender'] = 0 145 | 146 | genderNA = ~useNAME['gender'].isna() 147 | # purchaseNA = ~tmp['order_purchase_timestamp'].isna() 148 | 149 | useNAME = useNAME[genderNA] 150 | 151 | useNAME.rename(columns={'id':'person_id'},inplace=True) 152 | useNAME = useNAME[['person_id', 'gender']] 153 | print(useNAME.shape) 154 | print(useNAME.columns) 155 | 156 | useTITLE = title.copy() 157 | useTITLE.rename(columns={'id':'movie_id'},inplace=True) 158 | yearNA = ~useTITLE.production_year.isna() 159 | kindNA = ~useTITLE.kind_id.isna() 160 | yearNA = yearNA & kindNA 161 | useTITLE = useTITLE[yearNA] 162 | 163 | useTITLE = useTITLE[['movie_id', 'production_year','kind_id']].copy() 164 | MIN = useTITLE.production_year.min() 165 | MAX = useTITLE.production_year.max() 166 | 167 | useTITLE['production_year'] = (useTITLE['production_year'] - MIN)/(MAX - MIN) 168 | useTITLE = 
useTITLE.join(pd.get_dummies(useTITLE.kind_id)) 169 | useTITLE.rename(columns={1:'k1',2:'k2',3:'k3',4:'k4',6:'k6',7:'k7'},inplace=True) 170 | 171 | useTITLE.drop(['kind_id'],axis=1, inplace=True) 172 | print(useTITLE.shape) 173 | print(useTITLE.columns) 174 | 175 | 176 | 177 | useMC = mc.copy() 178 | useCN = cn.copy() 179 | useCN.rename(columns={'id':'company_id'},inplace=True) 180 | le = preprocessing.LabelEncoder() 181 | le.fit(useCN.country_code) 182 | useCN['country_code'] = le.transform(useCN.country_code) 183 | 184 | joinedMC = pd.merge(useMC, useCN) 185 | 186 | tMC = joinedMC[['company_id', 'country_code','movie_id']].copy() 187 | 188 | 189 | MAX = tMC.country_code.max() 190 | tMC['country_code'] = (tMC['country_code']/MAX) 191 | 192 | 193 | useTITLE.drop_duplicates(useTITLE.columns,inplace=True) 194 | useMIX.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 195 | useMIX.drop_duplicates(useMIX.columns,inplace=True) 196 | useCI.drop_duplicates(useCI.columns,inplace=True) 197 | useNAME.drop_duplicates(useNAME.columns,inplace=True) 198 | useMI.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 199 | useMI.drop_duplicates(useMI.columns,inplace=True) 200 | 201 | 202 | 203 | 204 | useTITLE.drop_duplicates(inplace=True) 205 | useCI.drop_duplicates(inplace=True) 206 | useNAME.drop_duplicates(inplace=True) 207 | useMI.drop_duplicates(inplace=True) 208 | useMIX.drop_duplicates(inplace=True) 209 | tMC.drop_duplicates(inplace=True) 210 | 211 | 212 | 213 | 214 | 215 | z = pd.merge(useTITLE, useMIX) 216 | print(z.shape) 217 | z = pd.merge(z, useCI) 218 | print(z.shape) 219 | z = pd.merge(z, useNAME) 220 | print(z.shape) 221 | z = pd.merge(z, useMI) 222 | print(z.shape) 223 | 224 | 225 | print(z.columns) 226 | print(z.shape) 227 | 228 | z = pd.merge(z, tMC) 229 | print(z.columns) 230 | print(z.shape) 231 | 232 | 233 | from sklearn.utils import shuffle 234 | z = shuffle(z, random_state=123) 235 | 236 | 237 | movieIDs= title.id.unique() 238 | movieIDs = shuffle(movieIDs, random_state=123) 239 | 240 | 241 | trainSize = int(0.5 * movieIDs.shape[0]) 242 | valSize = trainSize + int(0.25 * movieIDs.shape[0]) 243 | 244 | 245 | trainMovies = movieIDs[:trainSize] 246 | trainTMP = pd.DataFrame(trainMovies.reshape(-1,1), columns=["movie_id"]) 247 | trainData = pd.merge(z, trainTMP) 248 | 249 | valMovies = movieIDs[trainSize:valSize] 250 | valTMP = pd.DataFrame(valMovies.reshape(-1,1), columns=["movie_id"]) 251 | valData = pd.merge(z, valTMP) 252 | 253 | testMovies = movieIDs[valSize:] 254 | testTMP = pd.DataFrame(testMovies.reshape(-1,1), columns=["movie_id"]) 255 | testData = pd.merge(z, testTMP) 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | y_train = trainData.rating.values 264 | y_val = valData.rating.values 265 | y_test = testData.rating.values 266 | 267 | 268 | 269 | if saveCSV: 270 | trainData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName), index=False) 271 | valData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-val.csv'.format(dataName), index=False) 272 | testData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-test.csv'.format(dataName), index=False) 273 | 274 | 275 | trainData.drop(['rating'], axis=1, inplace=True) 276 | valData.drop(['rating'], axis=1, inplace=True) 277 | testData.drop(['rating'], axis=1, inplace=True) 278 | 279 | trainData.drop(['person_id','movie_id', 'company_id'], axis=1, inplace=True) 280 | valData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 281 | testData.drop(['person_id', 'movie_id', 'company_id'], 
axis=1, inplace=True) 282 | 283 | 284 | print("Train Data shape ", trainData.shape) 285 | print("Test Data shape ", testData.shape) 286 | print("Val Data shape ", valData.shape) 287 | 288 | print(trainData.columns) 289 | X_train = np.ascontiguousarray(trainData.values.astype(np.float64)) 290 | X_val = np.ascontiguousarray(valData.values.astype(np.float64)) 291 | X_test = np.ascontiguousarray(testData.values.astype(np.float64)) 292 | 293 | print(X_train.shape) 294 | print(y_train.shape) 295 | 296 | return z 297 | 298 | dataNameList = ["IMDBCLinear","IMDBCLinearC++" ] 299 | parameterList = [0,0] 300 | 301 | # dataNameList = ["IMDBLargeCLinear","IMDBLargeCLinearC++" ] 302 | # parameterList = [1,1] 303 | 304 | useForList = ["test", "train"] 305 | 306 | for dataName, param,useFor in zip(dataNameList, parameterList, useForList): 307 | LoadIMDBC(param, dataName, saveCSV=True,useFor=useFor) 308 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-X.npy".format(dataName),X_train) 309 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-y.npy".format(dataName),y_train) 310 | 311 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-X.npy".format(dataName),X_val) 312 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-y.npy".format(dataName),y_val) 313 | 314 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-X.npy".format(dataName),X_test) 315 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-y.npy".format(dataName),y_test) 316 | 317 | # dataName = "IMDBCLinearC++" 318 | # dataName = "IMDBCLinear" 319 | dataName = "IMDBLargeCLinearC++" 320 | 321 | df = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName)) 322 | 323 | # dataName = "IMDBCLinearC++" 324 | dataName2 = "IMDBLargeCLinear" 325 | 326 | df2 = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName2)) 327 | 328 | print(df.iloc[:3]) 329 | print(df2.iloc[:3]) 330 | 331 | print(df.columns) 332 | print(df.shape) 333 | 334 | print(np.unique(df.rating)) 335 | 336 | midLE = preprocessing.LabelEncoder() 337 | midLE.fit(df.movie_id) 338 | df['movie_id'] = midLE.transform(df.movie_id) 339 | 340 | pidLE = preprocessing.LabelEncoder() 341 | pidLE.fit(df.person_id) 342 | df['person_id'] = pidLE.transform(df.person_id) 343 | 344 | cidLE = preprocessing.LabelEncoder() 345 | cidLE.fit(df.company_id) 346 | df['company_id'] = cidLE.transform(df.company_id) 347 | 348 | PROP = 1 349 | trainData = df.values 350 | print(trainData.shape) 351 | print(trainData[:5,:]) 352 | np.save('/home/jiayi/disk/C-craig/dataset/{}-joined-prop-{}.npy'.format(dataName, PROP), np.ascontiguousarray(trainData.astype(np.float64))) 353 | 354 | print(midLE.classes_.shape) 355 | print(pidLE.classes_.shape) 356 | print(cidLE.classes_.shape) 357 | num = midLE.classes_.shape[0] 358 | num = num * pidLE.classes_.shape[0] 359 | num = num * cidLE.classes_.shape[0] 360 | print(num) 361 | assert num< 4* (10**18) 362 | print("\n【 Passed 】") 363 | 364 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 365 | print(uni.shape) 366 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 367 | print(uni.shape) 368 | 369 | print(df.shape) 370 | 371 | uni = df[['movie_id', 'person_id', 'company_id']].copy() 372 | # uni = np.unique(df[['movie_id', 'person_id', 'company_id']], axis=0) 373 | print(uni.shape) 374 | print(uni) 375 | rowNumMap = np.zeros((uni.shape[0],2), np.int64) 376 | i = 0 377 | # for row in uni.values: 378 | # print(row) 379 | for row in uni.values: 380 | # if row.sha 381 | # print(row) 382 | # print(row.shape) 383 | x,y,z = row 384 | x = 
np.int64(x) 385 | y = np.int64(y) 386 | z = np.int64(z) 387 | 388 | rowNumMap[i,0] = (x+1) + (y+1)*(10**5) + (z+1) * (10**11) 389 | assert 0 <= rowNumMap[i,0] and rowNumMap[i,0] < (4*(10**18)) 390 | # print(rowNumMap[i,0]) 391 | rowNumMap[i,1] = i 392 | i = i + 1 393 | 394 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 395 | np.save(mycsDIR + 'idMap.npy', np.ascontiguousarray(rowNumMap)) 396 | 397 | print(rowNumMap) 398 | 399 | CATE = len(np.unique(df.rating)) 400 | print("Cate num is ",CATE) 401 | 402 | Databackup = df.copy() 403 | 404 | for cate in range(CATE + 1): 405 | print("#"*10 ,' '*5, '【cate】 ', cate, ' '*10, '#'*10) 406 | trainData = Databackup[Databackup['rating'] == cate] 407 | 408 | 409 | mixColumns = ['movie_id', 'votes', 'rating'] 410 | mixNotUniqued = trainData[mixColumns].copy() 411 | mixUniqued = mixNotUniqued.drop_duplicates(mixNotUniqued.columns).copy() 412 | mixUniqued.sort_values(['movie_id'], inplace=True) 413 | print('【Movie_info_idx】') 414 | print(mixUniqued.shape) 415 | print(len(np.unique(mixUniqued.movie_id))) 416 | 417 | 418 | 419 | miColumns = ['movie_id', 'color', 's0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 420 | 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 421 | 's18', 's19', 's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27', 422 | 's28', 's29'] 423 | miNotUniqued = trainData[miColumns].copy() 424 | miUniqued = miNotUniqued.drop_duplicates(miNotUniqued.columns).copy() 425 | miUniqued.sort_values(['movie_id'], inplace=True) 426 | print('【Movie_info】') 427 | print(miUniqued.shape) 428 | print(len(np.unique(miUniqued.movie_id))) 429 | 430 | 431 | 432 | ciColumns = ['person_id', 'movie_id'] 433 | ciNotUniqued = trainData[ciColumns].copy() 434 | ciUniqued = ciNotUniqued.drop_duplicates(ciNotUniqued.columns).copy() 435 | print('【Cast_info】') 436 | print(ciUniqued.shape) 437 | print('in cast_info movie_id unique ', len(np.unique(ciUniqued.movie_id))) 438 | print('in cast_info person_id unique ', len(np.unique(ciUniqued.person_id))) 439 | 440 | 441 | 442 | nameColumns = ['person_id', 'gender'] 443 | nameNotUniqued = trainData[nameColumns].copy() 444 | nameUniqued = nameNotUniqued.drop_duplicates(nameNotUniqued.columns).copy() 445 | nameUniqued.sort_values(['person_id'], inplace=True) 446 | print('【Name】') 447 | print(nameUniqued.shape) 448 | print(len(np.unique(nameUniqued.person_id))) 449 | 450 | titleColumns = ['movie_id', 'production_year', 'k1', 'k2', 'k3', 'k4', 'k6', 'k7'] 451 | titleNotUniqued = trainData[titleColumns].copy() 452 | titleUniqued = titleNotUniqued.drop_duplicates(titleNotUniqued.columns).copy() 453 | titleUniqued.sort_values(['movie_id'], inplace=True) 454 | print('【Title】') 455 | print(titleUniqued.shape) 456 | print(len(np.unique(titleUniqued.movie_id))) 457 | 458 | 459 | 460 | mcColumns = ['movie_id', 'company_id', 'country_code'] 461 | mcNotUniqued = trainData[mcColumns].copy() 462 | mcUniqued = mcNotUniqued.drop_duplicates(mcNotUniqued.columns).copy() 463 | mcUniqued.sort_values(['movie_id'], inplace=True) 464 | print('【Movie Company】') 465 | print(mcUniqued.shape) 466 | print(len(np.unique(mcUniqued.movie_id))) 467 | 468 | 469 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 470 | np.save(mycsDIR + 'train-cate-{}-mix.npy'.format(cate), np.ascontiguousarray(mixUniqued.values.astype(np.float64))) 471 | np.save(mycsDIR + 'train-cate-{}-mi.npy'.format(cate), np.ascontiguousarray(miUniqued.values.astype(np.float64))) 472 | np.save(mycsDIR + 
'train-cate-{}-ci.npy'.format(cate), np.ascontiguousarray(ciUniqued.values.astype(np.float64))) 473 | np.save(mycsDIR + 'train-cate-{}-name.npy'.format(cate), np.ascontiguousarray(nameUniqued.values.astype(np.float64))) 474 | np.save(mycsDIR + 'train-cate-{}-title.npy'.format(cate), np.ascontiguousarray(titleUniqued.values.astype(np.float64))) 475 | np.save(mycsDIR + 'train-cate-{}-mc.npy'.format(cate), np.ascontiguousarray(mcUniqued.values.astype(np.float64))) 476 | 477 | 478 | 479 | mixUniqued.to_csv(mycsDIR + 'train-cate-{}-mix.csv'.format(cate),index=False) 480 | miUniqued.to_csv(mycsDIR + 'train-cate-{}-mi.csv'.format(cate), index=False) 481 | ciUniqued.to_csv(mycsDIR + 'train-cate-{}-ci.csv'.format(cate), index=False) 482 | nameUniqued.to_csv(mycsDIR + 'train-cate-{}-name.csv'.format(cate), index=False) 483 | titleUniqued.to_csv(mycsDIR + 'train-cate-{}-title.csv'.format(cate), index=False) 484 | mcUniqued.to_csv(mycsDIR + 'train-cate-{}-mc.csv'.format(cate), index=False) 485 | 486 | 487 | 488 | 489 | print(np.unique(df.rating)) 490 | 491 | z = np.unique(df.rating) 492 | print(z[85]) 493 | # print(len(np.unique())) 494 | 495 | -------------------------------------------------------------------------------- /preprocess/stack.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import metrics 5 | import os 6 | from sklearn import preprocessing 7 | 8 | DIR = '/home/jiayi/disk/stackData/' 9 | 10 | user = pd.read_csv(DIR + 'user' + '.csv',usecols=['id','site_id', 'reputation', 'upvotes', 'downvotes'] ) 11 | question = pd.read_csv(DIR + 'question' + '.csv', usecols=['id', 'site_id', 'score','view_count', 'favorite_count']) 12 | answer = pd.read_csv(DIR + 'answer' + '.csv', usecols=['id', 'site_id', 'question_id','owner_user_id','score']) 13 | 14 | useUser= user.copy() 15 | useAnswer = answer.copy() 16 | useQuestion = question.copy() 17 | 18 | useUser.rename(columns={'id':'user_id'},inplace=True) 19 | useAnswer.rename(columns={'owner_user_id':'user_id', 'score':'Y'},inplace=True) 20 | useQuestion.rename(columns={'id':'question_id'},inplace=True) 21 | 22 | z = pd.merge(useAnswer, useUser) 23 | z = pd.merge(z, useQuestion) 24 | inUser = z[['user_id', 'site_id', 'reputation', 'upvotes', 'downvotes']].copy() 25 | inAnswer = z[['id', 'site_id', 'question_id', 'Y', 'user_id']].copy() 26 | inQuestion = z[['question_id', 'site_id', 'score', 'view_count', 'favorite_count']].copy() 27 | inUser.drop_duplicates(inplace=True) 28 | inAnswer.drop_duplicates(inplace=True) 29 | inQuestion.drop_duplicates(inplace=True) 30 | 31 | print(inUser.shape) 32 | print(inAnswer.shape) 33 | print(inQuestion.shape) 34 | 35 | print(inUser.site_id.min(), inUser.site_id.max()) 36 | print(inUser.user_id.min(), inUser.user_id.max()) 37 | inUser['Uid'] = 1000 * inUser.user_id + inUser.site_id 38 | inAnswer['Uid'] = 1000 * inAnswer.user_id + inAnswer.site_id 39 | 40 | inQuestion['Qid'] = 1000 * inQuestion.question_id + inQuestion.site_id 41 | inAnswer['Qid'] = 1000 * inAnswer.question_id + inAnswer.site_id 42 | 43 | le = preprocessing.LabelEncoder() 44 | le.fit(inAnswer.Uid) 45 | inAnswer.Uid = le.transform(inAnswer.Uid) 46 | inUser.Uid = le.transform(inUser.Uid) 47 | print(inAnswer.Uid.min(), inAnswer.Uid.max()) 48 | print(inUser.Uid.min(), inUser.Uid.max()) 49 | 50 | le = preprocessing.LabelEncoder() 51 | le.fit(inAnswer.Qid) 52 | inAnswer.Qid = le.transform(inAnswer.Qid) 53 
| inQuestion.Qid = le.transform(inQuestion.Qid) 54 | print(inAnswer.Qid.min(), inAnswer.Qid.max()) 55 | print(inQuestion.Qid.min(), inQuestion.Qid.max()) 56 | 57 | print(inUser.iloc[:3,:]) 58 | tu = inUser[['reputation', 'upvotes' ,'downvotes']].copy() 59 | tu.drop_duplicates(inplace=True) 60 | print(tu.shape) 61 | 62 | tu = inUser[['reputation', 'upvotes' ,'downvotes','site_id']].copy() 63 | tu.drop_duplicates(inplace=True) 64 | print(tu.shape) 65 | 66 | tu['newUid'] = np.arange(tu.shape[0]) 67 | newU = pd.merge(inUser, tu) 68 | print(newU.shape) 69 | print(newU.columns) 70 | print(newU.iloc[:3,:]) 71 | 72 | print(inQuestion.iloc[:3,:]) 73 | tq = inQuestion[['score', 'view_count']].copy() 74 | tq.drop_duplicates(inplace=True) 75 | print(tq.shape) 76 | 77 | tq['newQid'] = np.arange(tq.shape[0]) 78 | newQ = pd.merge(inQuestion, tq) 79 | print(newQ.shape) 80 | print(newQ.columns) 81 | print(newQ.iloc[:3,:]) 82 | 83 | z = pd.merge(inAnswer, newQ) 84 | z = pd.merge(z, newU) 85 | print(z.columns) 86 | print(z.shape) 87 | 88 | doAnswer = z[['id','newUid','newQid','Y']].copy().drop_duplicates() 89 | doQuestion = z[['newQid','score', 'view_count']].copy().drop_duplicates() 90 | doUser = z[['newUid','site_id', 'reputation','upvotes','downvotes']].copy().drop_duplicates() 91 | 92 | print(doAnswer.shape) 93 | print(doQuestion.shape) 94 | print(doUser.shape) 95 | 96 | doJoin = pd.merge(doAnswer, doQuestion) 97 | doJoin = pd.merge(doJoin, doUser) 98 | 99 | STD = doJoin.reputation.std() 100 | 101 | doUserBackup = doUser.copy() 102 | 103 | print(doUser.iloc[:3,:]) 104 | doUser = doUser.join(pd.get_dummies(doUser.site_id, prefix='st')) 105 | doUser.upvotes = (doUser.upvotes - doUser.upvotes.min()) / (doUser.upvotes.max() - doUser.upvotes.min()) 106 | doUser.downvotes = (doUser.downvotes - doUser.downvotes.min()) / (doUser.downvotes.max() - doUser.downvotes.min()) 107 | 108 | # doUser.reputation /= doUser.reputation.std() 109 | # doUser.reputation /= STD 110 | 111 | print(doUser.iloc[:3,:]) 112 | 113 | doAnswerBackup = doAnswer.copy() 114 | doAnswer.Y = (doAnswer.Y - doAnswer.Y.min()) / (doAnswer.Y.max() - doAnswer.Y.min()) 115 | 116 | print(doAnswer.columns) 117 | print(doAnswer.iloc[:3,:]) 118 | 119 | doQuestionBackup = doQuestion.copy() 120 | 121 | print(doQuestion.columns) 122 | print(doQuestion.iloc[:3,:]) 123 | 124 | doQuestion.score = (doQuestion.score - doQuestion.score.min()) / (doQuestion.score.max() - doQuestion.score.min()) 125 | doQuestion.view_count = (doQuestion.view_count - doQuestion.view_count.min()) / (doQuestion.view_count.max() - doQuestion.view_count.min()) 126 | 127 | print(doQuestion.iloc[:3,:]) 128 | 129 | doingUser = doUser.copy() 130 | 131 | rng = np.random.RandomState(123) 132 | from sklearn.utils import shuffle 133 | doingUser = shuffle(doingUser, random_state=rng) 134 | 135 | TrainProp = 0.5 136 | ValProp = 0.25 137 | TrainEnd = int(TrainProp * doingUser.shape[0]) 138 | ValEnd = TrainEnd + int(ValProp * doingUser.shape[0]) 139 | 140 | print(doingUser.columns) 141 | 142 | doingUser.reputation /= doingUser.reputation.std() 143 | 144 | trainUser = doingUser[:TrainEnd].copy() 145 | valUser = doingUser[TrainEnd:ValEnd].copy() 146 | testUser = doingUser[ValEnd:].copy() 147 | 148 | print(doingUser.columns) 149 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 150 | dataset = 'stackn-single' 151 | y_train = trainUser.reputation 152 | y_val = valUser.reputation 153 | y_test = testUser.reputation 154 | 155 | trainUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 156 
| valUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 157 | testUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 158 | 159 | X_train = np.ascontiguousarray(trainUser.values.astype(np.float64)) 160 | X_val = np.ascontiguousarray(valUser.values.astype(np.float64)) 161 | X_test = np.ascontiguousarray(testUser.values.astype(np.float64)) 162 | 163 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 164 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 165 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 166 | 167 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 168 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 169 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 170 | 171 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 172 | dataset = 'stackn' 173 | 174 | doingUser = doUser.copy() 175 | doingUser.reputation/=STD 176 | 177 | rng = np.random.RandomState(123) 178 | from sklearn.utils import shuffle 179 | doingUser = shuffle(doingUser, random_state=rng) 180 | 181 | TrainProp = 0.5 182 | ValProp = 0.25 183 | TrainEnd = int(TrainProp * doingUser.shape[0]) 184 | ValEnd = TrainEnd + int(ValProp * doingUser.shape[0]) 185 | 186 | print(doingUser.columns) 187 | 188 | # User 189 | trainUser = doingUser[:TrainEnd].copy() 190 | valUser = doingUser[TrainEnd:ValEnd].copy() 191 | testUser = doingUser[ValEnd:].copy() 192 | 193 | # join 194 | trainSet = pd.merge(trainUser, doAnswer) 195 | trainSet = pd.merge(trainSet, doQuestion) 196 | 197 | valSet = pd.merge(valUser, doAnswer) 198 | valSet = pd.merge(valSet, doQuestion) 199 | 200 | testSet = pd.merge(testUser, doAnswer) 201 | testSet = pd.merge(testSet, doQuestion) 202 | 203 | y_train = trainSet.reputation 204 | y_val = valSet.reputation 205 | y_test = testSet.reputation 206 | 207 | trainSet.to_csv(DATASET_DIR + "{}-train-X.csv".format(dataset), index=False) 208 | valSet.to_csv(DATASET_DIR + "{}-val-X.csv".format(dataset), index=False) 209 | testSet.to_csv(DATASET_DIR + "{}-test-X.csv".format(dataset), index=False) 210 | 211 | trainSet.drop(['newUid', 'newQid', 'id', 'site_id', 'reputation'], axis=1, inplace=True) 212 | valSet.drop(['newUid', 'newQid', 'id', 'newUid', 'site_id', 'reputation'], axis=1, inplace=True) 213 | testSet.drop(['newUid', 'newQid', 'id', 'newUid', 'site_id', 'reputation'], axis=1, inplace=True) 214 | 215 | print(trainSet.shape) 216 | print(trainSet.columns) 217 | X_train = np.ascontiguousarray(trainSet.values.astype(np.float64)) 218 | X_val = np.ascontiguousarray(valSet.values.astype(np.float64)) 219 | X_test = np.ascontiguousarray(testSet.values.astype(np.float64)) 220 | 221 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 222 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 223 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 224 | 225 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 226 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 227 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 228 | 229 | # doAnswer.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doAnswer.csv', index=False) 230 | # doUser.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doUser.csv', index=False) 231 | # doQuestion.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doQuestion.csv', index=False) 232 | 233 | doAnswer = pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doAnswer.csv') 234 | doUser = 
pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doUser.csv') 235 | doQuestion = pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doQuestion.csv') 236 | 237 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 238 | dataset = 'stackn' 239 | 240 | dfBackup = pd.read_csv(DATASET_DIR + "{}-train-X.csv".format(dataset)) 241 | print(dfBackup.shape) 242 | 243 | df = dfBackup.copy() 244 | print(df.columns) 245 | 246 | df.rename(columns={'reputation':'target'},inplace=True) 247 | doUser.rename(columns={'reputation':'target'},inplace=True) 248 | doUser.drop(['site_id','target'],axis=1,inplace=True) 249 | 250 | df['rowID'] = np.arange(df.shape[0]) 251 | 252 | le = preprocessing.LabelEncoder() 253 | le.fit(df.target) 254 | df.target = le.transform(df.target) 255 | 256 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 257 | dataset = 'stacknC++' 258 | 259 | y_train = df.target 260 | y_train = np.ascontiguousarray(y_train.values.astype(np.int64)) 261 | dfC = df.drop(['rowID','newUid', 'newQid', 'id', 'site_id', 'target'], axis=1) 262 | X_train = np.ascontiguousarray(dfC.values.astype(np.float64)) 263 | 264 | testy = np.load(DATASET_DIR + "{}-train-y.npy".format(dataset)) 265 | print(np.unique(testy)) 266 | print(len(np.unique(testy))) 267 | print(np.min(testy), np.max(testy)) 268 | 269 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 270 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 271 | 272 | print(dataset) 273 | 274 | print(dfC.columns) 275 | 276 | print(np.sort(df.target.unique())) 277 | print(len(df.target.unique())) 278 | print(df.shape) 279 | 280 | print(np.unique(y_train)) 281 | print(len(np.unique(y_train))) 282 | print(np.min(y_train), np.max(y_train)) 283 | 284 | print(df.target.value_counts()) 285 | 286 | uni = np.sort(df.target.unique()) 287 | print(uni.shape) 288 | print(uni) 289 | 290 | df = df [['newUid', 'newQid', 'id', 'rowID','site_id', 'target', 'upvotes', 'downvotes', 'st_0', 'st_1', 'st_2', 'st_3', 'st_4', 'st_5', 'st_6', 'st_7', 'st_8', 'st_9', 'st_10', 'st_11', 'st_12', 'st_13', 'st_14', 'st_15', 'st_16', 'st_17', 'st_18', 'st_19', 'st_20', 'st_21', 'st_22', 'st_23', 'st_24', 'st_25', 'st_26', 'st_27', 'st_28', 'st_29', 'st_30', 'st_31', 'st_32', 'st_33', 'st_34', 'st_35', 'st_36', 'st_37', 'st_38', 'st_39', 'st_40', 'st_41', 'st_42', 'st_43', 'st_44', 'st_45', 'st_46', 'st_47', 'st_48', 'st_49', 'st_50', 'st_51', 'st_52', 'st_53', 'st_54', 'st_55', 'st_56', 'st_57', 'st_58', 'st_59', 'st_60', 'st_61', 'st_62', 'st_63', 'st_64', 'st_65', 'st_66', 'st_67', 'st_68', 'st_69', 'st_70', 'st_71', 'st_72', 'st_73', 'st_74', 'st_75', 'st_76', 'st_77', 'st_78', 'st_79', 'st_80', 'st_81', 'st_82', 'st_83', 'st_84', 'st_85', 'st_86', 'st_87', 'st_88', 'st_89', 'st_90', 'st_91', 'st_92', 'st_93', 'st_94', 'st_95', 'st_96', 'st_97', 'st_98', 'st_99', 'st_100', 'st_101', 'st_102', 'st_103', 'st_104', 'st_105', 'st_106', 'st_107', 'st_108', 'st_109', 'st_110', 'st_111', 'st_112', 'st_113', 'st_114', 'st_115', 'st_116', 'st_117', 'st_118', 'st_119', 'st_120', 'st_121', 'st_122', 'st_123', 'st_124', 'st_125', 'st_126', 'st_127', 'st_128', 'st_129', 'st_130', 'st_131', 'st_132', 'st_133', 'st_134', 'st_135', 'st_136', 'st_137', 'st_138', 'st_139', 'st_140', 'st_141', 'st_142', 'st_143', 'st_144', 'st_145', 'st_146', 'st_147', 'st_148', 'st_149', 'st_150', 'st_151', 'st_152', 'st_153', 'st_154', 'st_155', 'st_156', 'st_157', 'st_158', 'st_159', 'st_160', 'st_161', 'st_162', 'st_163', 'st_164', 'st_165', 'st_166', 'st_167', 'st_168', 
'st_169', 'st_170', 'st_171', 'st_172', 'Y', 'score', 'view_count']].copy() 291 | 292 | dataset = 'stackn' 293 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataset) 294 | 295 | for cate in uni: 296 | print("#"*20, " "*10, cate, " "*10, "#"*20) 297 | tmpDF = df[df['target'] == cate].copy() 298 | 299 | 300 | le = preprocessing.LabelEncoder() 301 | le.fit(tmpDF.newUid) 302 | tmpDF.newUid = le.transform(tmpDF.newUid) 303 | 304 | 305 | le = preprocessing.LabelEncoder() 306 | le.fit(tmpDF.newQid) 307 | tmpDF.newQid = le.transform(tmpDF.newQid) 308 | 309 | le = preprocessing.LabelEncoder() 310 | le.fit(tmpDF.id) 311 | tmpDF.id = le.transform(tmpDF.id) 312 | 313 | 314 | 315 | tmpDF.sort_values("id",inplace=True) 316 | tmpDF.to_csv(DATASET_DIR + "train-{}-joined.csv".format(cate), index=False) 317 | tmp_ = np.ascontiguousarray(tmpDF.values.astype(np.float64)) 318 | np.save(DATASET_DIR + "train-{}-joined.npy".format(cate), tmp_) 319 | 320 | 321 | tmpUser = tmpDF[doUser.columns].copy() 322 | tmpUser.drop_duplicates(inplace=True) 323 | tmpUser.sort_values("newUid",inplace=True) 324 | tmpUser.to_csv(DATASET_DIR + "train-{}-user.csv".format(cate), index=False) 325 | tmpUser = np.ascontiguousarray(tmpUser.values.astype(np.float64)) 326 | np.save(DATASET_DIR + "train-{}-user.npy".format(cate), tmpUser) 327 | 328 | 329 | tmpQuestion = tmpDF[doQuestion.columns].copy() 330 | tmpQuestion.drop_duplicates(inplace=True) 331 | tmpQuestion.sort_values("newQid",inplace=True) 332 | tmpQuestion.to_csv(DATASET_DIR + "train-{}-question.csv".format(cate), index=False) 333 | tmpQuestion = np.ascontiguousarray(tmpQuestion.values.astype(np.float64)) 334 | np.save(DATASET_DIR + "train-{}-question.npy".format(cate), tmpQuestion) 335 | print(tmpQuestion.shape) 336 | 337 | 338 | tmpAnswer = tmpDF[doAnswer.columns].copy() 339 | tmpAnswer.drop_duplicates(inplace=True) 340 | tmpAnswer.sort_values("id",inplace=True) 341 | tmpAnswer.to_csv(DATASET_DIR + "train-{}-answer.csv".format(cate), index=False) 342 | tmpAnswer = np.ascontiguousarray(tmpAnswer.values.astype(np.float64)) 343 | np.save(DATASET_DIR + "train-{}-answer.npy".format(cate), tmpAnswer) 344 | 345 | 346 | -------------------------------------------------------------------------------- /preprocess/taxi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn import metrics 8 | import os 9 | from sklearn import preprocessing 10 | 11 | import datetime 12 | def parseDatetime(s): 13 | # print('s is ',s) 14 | pre, suf = s.split(' ') 15 | 16 | year_s, mon_s, day_s = pre.split('-') 17 | hour_s, minute_s, second_s = suf.split(':') 18 | # retuabsrn datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 19 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)).date() 20 | 21 | def parseYMD(arrLike, col1): 22 | YMD = parseDatetime(arrLike[col1]) 23 | return str(YMD) 24 | 25 | def timeDelta(arrLike, col1, col2): 26 | purchase = parseDatetime(arrLike[col1]) 27 | approve = parseDatetime(arrLike[col2]) 28 | delta = approve - purchase 29 | return delta.total_seconds() 30 | 31 | X_train = [] 32 | X_test = [] 33 | y_train = [] 34 | y_test = [] 35 | 36 | DIR= "/home/jiayi/disk/gits/craig/datasets/taxi/data/" 37 | 38 | taxi = pd.read_csv(DIR+"taxi.csv") 39 | 40 | def readDF(ID): 
41 | df = pd.read_csv(DIR+"tbl_{}.csv".format(ID)) 42 | return df 43 | 44 | t16 = readDF(16) 45 | t16['f642'] = t16.apply(parseYMD, axis=1, args=['f405']) 46 | 47 | t5 = readDF(5) 48 | t20 = readDF(20) 49 | #t23 = readDF(23) 50 | 51 | #t24 = readDF(24) 52 | t14 = readDF(14) 53 | t11 = readDF(11) 54 | #t6 = readDF(6) 55 | 56 | t5.rename({'f188':'f642'},axis=1,inplace=True) 57 | t20.rename({'f520':'f642'},axis=1,inplace=True) 58 | # t23.rename({'f607':'f642'},axis=1,inplace=True) 59 | 60 | # t24.rename({'f634':'f642'},axis=1,inplace=True) 61 | t14.rename({'f373':'f642'},axis=1,inplace=True) 62 | t11.rename({'f299':'f642'},axis=1,inplace=True) 63 | # t6.rename({'f195':'f642'},axis=1,inplace=True) 64 | 65 | t11 = t11[['f642', 'f294','f298', 'f300','f302', 'f306']].copy() 66 | print(t11.shape) 67 | print(t11.columns) 68 | 69 | """ t5""" 70 | # print(t5.shape) 71 | # print(t5.columns) 72 | # print(t5.f189.min(), t5.f189.max()) 73 | 74 | """ t20""" 75 | # print(t20.shape) 76 | # print(t20.columns) 77 | # print(t20) 78 | # print(t20.f189.min(), t5.f189.max()) 79 | 80 | """ t16""" 81 | t16.drop(['f405'],axis=1,inplace=True) 82 | t16 = t16[['f642','f406','f407','f408']].copy() 83 | # print(t16.shape) 84 | # print(t16.columns) 85 | # print(t16) 86 | # print(t20.f189.min(), t5.f189.max()) 87 | 88 | z = pd.merge(taxi, t11,left_on='f642', right_on='f642') 89 | print(z.shape) 90 | 91 | z = pd.merge(z, t5) 92 | print(z.shape) 93 | 94 | z = pd.merge(z, t20) 95 | print(z.shape) 96 | 97 | z = pd.merge(z, t16) 98 | print(z.shape) 99 | 100 | le = preprocessing.LabelEncoder() 101 | le.fit(z.f642) 102 | z.f642 = le.transform(z.f642) 103 | # aUser.user_id = le.transform(aUser.user_id) 104 | 105 | taxi = z[taxi.columns].copy().drop_duplicates() 106 | print(taxi.shape) 107 | 108 | t11 = z[t11.columns].copy().drop_duplicates() 109 | print(t11.shape) 110 | 111 | t5 = z[t5.columns].copy().drop_duplicates() 112 | print(t5.shape) 113 | 114 | t20 = z[t20.columns].copy().drop_duplicates() 115 | print(t20.shape) 116 | 117 | t16 = z[t16.columns].copy().drop_duplicates() 118 | print(t16.shape) 119 | 120 | taxi.dropna(inplace=True) 121 | taxi = taxi[['f642','f643','target']].copy() 122 | print(taxi.f643.min(), taxi.f643.max()) 123 | print(taxi.target.min(), taxi.target.max()) 124 | 125 | taxi.f643 = (taxi.f643 - (taxi.f643.min())) / (taxi.f643.max()-taxi.f643.min()) 126 | std = taxi.target.std() 127 | print(std) 128 | taxi.target/=std 129 | print(taxi) 130 | 131 | t11.dropna(inplace=True) 132 | print(t11) 133 | 134 | cols = ['f300','f302','f306'] 135 | for col in cols: 136 | True_idx = t11[col].map(lambda x: x==True) 137 | False_idx = t11[col].map(lambda x: x==False) 138 | t11.loc[True_idx, col] = 1 139 | t11.loc[False_idx, col] = 0 140 | 141 | print(t11) 142 | t11 = t11.join(pd.get_dummies(t11.f294)) 143 | t11.drop(['f294'], axis=1,inplace=True) 144 | t11.f298 = (t11.f298 - t11.f298.min())/(t11.f298.max()-t11.f298.min()) 145 | t11["ID11"] = np.arange(t11.shape[0]) 146 | print(t11.shape) 147 | 148 | t5.dropna(inplace=True) 149 | print(t5) 150 | print(t5.f189.min(), t5.f189.max()) 151 | t5.f189 = (t5.f189 - t5.f189.min())/(t5.f189.max()-t5.f189.min()) 152 | 153 | t5["ID5"] = np.arange(t5.shape[0]) 154 | 155 | t20.dropna(inplace=True) 156 | print(t20) 157 | print(t20.f521.min(), t20.f521.max()) 158 | print(t20.f522.min(), t20.f522.max()) 159 | print(t20.f523.min(), t20.f523.max()) 160 | t20.f521 = (t20.f521 - t20.f521.min())/(t20.f521.max()-t20.f521.min()) 161 | t20.f522 = (t20.f522 - 
t20.f522.min())/(t20.f522.max()-t20.f522.min()) 162 | t20.f523 = (t20.f523 - t20.f523.min())/(t20.f523.max()-t20.f523.min()) 163 | 164 | t20["ID20"] = np.arange(t20.shape[0]) 165 | 166 | t16.dropna(inplace=True) 167 | t16.f406 = (t16.f406 - t16.f406.min())/(t16.f406.max()-t16.f406.min()) 168 | t16.f407 = (t16.f407 - t16.f407.min())/(t16.f407.max()-t16.f407.min()) 169 | t16.f408 = (t16.f408 - t16.f408.min())/(t16.f408.max()-t16.f408.min()) 170 | t16["ID16"] = np.arange(t16.shape[0]) 171 | 172 | print(taxi.shape) 173 | 174 | rng = np.random.RandomState(123) 175 | from sklearn.utils import shuffle 176 | taxi = shuffle(taxi, random_state=rng) 177 | 178 | TrainProp = 0.5 179 | ValProp = 0.25 180 | TrainEnd = int(TrainProp * taxi.shape[0]) 181 | ValEnd = TrainEnd + int(ValProp * taxi.shape[0]) 182 | 183 | trainTaxi = taxi[:TrainEnd] 184 | valTaxi = taxi[TrainEnd:ValEnd] 185 | testTaxi = taxi[ValEnd:] 186 | 187 | print(trainTaxi.shape) 188 | print(valTaxi.shape) 189 | print(testTaxi.shape) 190 | 191 | print(trainTaxi.columns) 192 | 193 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 194 | dataset = 'taxi-single' 195 | y_train = trainTaxi.target 196 | y_val = valTaxi.target 197 | y_test = testTaxi.target 198 | 199 | X_train = np.ascontiguousarray(trainTaxi[['f643']].copy().values.astype(np.float64)) 200 | X_val = np.ascontiguousarray(valTaxi[['f643']].copy().values.astype(np.float64)) 201 | X_test = np.ascontiguousarray(testTaxi[['f643']].copy().values.astype(np.float64)) 202 | 203 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 204 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 205 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 206 | 207 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 208 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 209 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 210 | 211 | trainSet = pd.merge(trainTaxi, t11) 212 | trainSet = pd.merge(trainSet, t5) 213 | trainSet = pd.merge(trainSet, t20) 214 | trainSet = pd.merge(trainSet, t16) 215 | print(trainSet.shape) 216 | 217 | valSet = pd.merge(valTaxi, t11) 218 | valSet = pd.merge(valSet, t5) 219 | valSet = pd.merge(valSet, t20) 220 | valSet = pd.merge(valSet, t16) 221 | print(valSet.shape) 222 | 223 | testSet = pd.merge(testTaxi, t11) 224 | testSet = pd.merge(testSet, t5) 225 | testSet = pd.merge(testSet, t20) 226 | testSet = pd.merge(testSet, t16) 227 | print(testSet.shape) 228 | 229 | z = 0.9 230 | print(z **20) 231 | 232 | z = 0.8 233 | print(z ** 20) 234 | 235 | z = 0.7 236 | print(z ** 20) 237 | 238 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 239 | dataset = 'taxi' 240 | 241 | y_train = trainSet.target.copy() 242 | y_val = valSet.target.copy() 243 | y_test = testSet.target.copy() 244 | 245 | trainSet.to_csv(DATASET_DIR + "{}-train.csv".format(dataset), index=False) 246 | valSet.to_csv(DATASET_DIR + "{}-val.csv".format(dataset), index=False) 247 | testSet.to_csv(DATASET_DIR + "{}-test.csv".format(dataset), index=False) 248 | 249 | trainSet.drop(['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 250 | valSet.drop( ['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 251 | testSet.drop(['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 252 | 253 | X_train = np.ascontiguousarray(trainSet.values.astype(np.float64)) 254 | X_val = np.ascontiguousarray(valSet.values.astype(np.float64)) 255 | X_test = np.ascontiguousarray(testSet.values.astype(np.float64)) 256 | 257 | np.save( 
DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 258 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 259 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 260 | 261 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 262 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 263 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 264 | 265 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 266 | dataset = 'taxi' 267 | 268 | df = pd.read_csv(DATASET_DIR + "{}-train-X.csv".format(dataset)) 269 | print(df.shape) 270 | 271 | print(df.columns) 272 | 273 | print(df.target.value_counts()) 274 | 275 | df['rowID'] = np.arange(df.shape[0]) 276 | 277 | le = preprocessing.LabelEncoder() 278 | le.fit(df.target) 279 | df.target = le.transform(df.target) 280 | 281 | print(len(df.target.unique())) 282 | cate_list = df.target.unique() 283 | 284 | le = preprocessing.LabelEncoder() 285 | le.fit(df.ID5) 286 | df.ID5 = le.transform(df.ID5) 287 | 288 | le = preprocessing.LabelEncoder() 289 | le.fit(df.ID11) 290 | df.ID11 = le.transform(df.ID11) 291 | 292 | le = preprocessing.LabelEncoder() 293 | le.fit(df.ID16) 294 | df.ID16 = le.transform(df.ID16) 295 | 296 | le = preprocessing.LabelEncoder() 297 | le.fit(df.ID20) 298 | df.ID20 = le.transform(df.ID20) 299 | 300 | le = preprocessing.LabelEncoder() 301 | le.fit(df.f642) 302 | df.f642 = le.transform(df.f642) 303 | 304 | taxi = df[taxi.columns].copy().drop_duplicates() 305 | print(taxi.shape) 306 | 307 | t11 = df[t11.columns].copy().drop_duplicates() 308 | print(t11.shape) 309 | 310 | t5 = df[t5.columns].copy().drop_duplicates() 311 | print(t5.shape) 312 | 313 | t20 = df[t20.columns].copy().drop_duplicates() 314 | print(t20.shape) 315 | 316 | t16 = df[t16.columns].copy().drop_duplicates() 317 | print(t16.shape) 318 | 319 | taxi.sort_values("f642",inplace=True) 320 | t5.sort_values("ID5",inplace=True) 321 | t20.sort_values("ID20",inplace=True) 322 | t16.sort_values("ID16",inplace=True) 323 | t11.sort_values("ID11",inplace=True) 324 | 325 | print(taxi.columns) 326 | t5 = t5[['ID5', 'f642', 'f189']].copy() 327 | t20 = t20[['ID20', 'f642', 'f521', 'f522', 'f523']].copy() 328 | t16 = t16[['ID16', 'f642', 'f406', 'f407', 'f408']].copy() 329 | t11 = t11[['ID11', 'f642', 'f298', 'f300', 'f302', 'f306', 'Booted in Error', 330 | 'Duplicate Case', 'Executed', 'NJS Released', 'Other', 331 | 'Paid in the Field', 'Redeemed', 'Reduced', 'Salvage History', 332 | 'Salvage and Total Loss', 'Salvage/Total Loss/Export', 'Sold', 333 | 'Sold Abandoned', 'Stolen Vehicle', 'Total Loss', 'Towed in Error', 334 | 'Vehicle Not Towed', 'Zero Released']].copy() 335 | print(t5.columns) 336 | print(t20.columns) 337 | print(t16.columns) 338 | print(t11.columns) 339 | 340 | dataset = 'taxi' 341 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataset) 342 | 343 | taxi.to_csv(DATASET_DIR + "train-taxi.csv", index=False) 344 | t5.to_csv(DATASET_DIR + "train-t5.csv", index=False) 345 | t20.to_csv(DATASET_DIR + "train-t20.csv", index=False) 346 | t16.to_csv(DATASET_DIR + "train-t16.csv", index=False) 347 | 348 | taxi_ = np.ascontiguousarray(taxi.values.astype(np.float64)) 349 | t5_ = np.ascontiguousarray(t5.values.astype(np.float64)) 350 | t20_ = np.ascontiguousarray(t20.values.astype(np.float64)) 351 | t16_ = np.ascontiguousarray(t16.values.astype(np.float64)) 352 | 353 | np.save(DATASET_DIR + "train-taxi.npy", taxi_) 354 | np.save(DATASET_DIR + "train-t5.npy", t5_) 355 | np.save(DATASET_DIR + "train-t20.npy", 
t20_) 356 | np.save(DATASET_DIR + "train-t16.npy", t16_) 357 | 358 | uni = np.sort(df.target.unique()) 359 | print(uni) 360 | # print(df.target.unique()) 361 | 362 | print(df.columns) 363 | 364 | df = df[['f642', 'ID5', 'ID11', 'ID16', 'ID20', 'f643', 'target', 'f298', 'f300', 'f302', 'f306', 365 | 'Booted in Error', 'Duplicate Case', 'Executed', 'NJS Released', 366 | 'Other', 'Paid in the Field', 'Redeemed', 'Reduced', 'Salvage History', 367 | 'Salvage and Total Loss', 'Salvage/Total Loss/Export', 'Sold', 368 | 'Sold Abandoned', 'Stolen Vehicle', 'Total Loss', 'Towed in Error', 369 | 'Vehicle Not Towed', 'Zero Released', 'f189', 'ID5', 'f521', 370 | 'f522', 'f523', 'f406', 'f407', 'f408', 'rowID']].copy() 371 | 372 | for cate in uni: 373 | tmpDF = df[df['target'] == cate].copy() 374 | 375 | le = preprocessing.LabelEncoder() 376 | le.fit(tmpDF.ID11) 377 | tmpDF.ID11 = le.transform(tmpDF.ID11) 378 | 379 | # print(tmpDF.shape) 380 | tmpDF.to_csv(DATASET_DIR + "train-{}-joined.csv".format(cate), index=False) 381 | tmp_ = np.ascontiguousarray(tmpDF.values.astype(np.float64)) 382 | np.save(DATASET_DIR + "train-{}-joined.npy".format(cate), tmp_) 383 | 384 | tmpt11 = tmpDF[t11.columns].copy() 385 | tmpt11.drop_duplicates(inplace=True) 386 | tmpt11.to_csv(DATASET_DIR + "train-{}-t11.csv".format(cate), index=False) 387 | tmpt11_ = np.ascontiguousarray(tmpt11.values.astype(np.float64)) 388 | np.save(DATASET_DIR + "train-{}-t11.npy".format(cate), tmpt11_) 389 | print(tmpt11_.shape) 390 | 391 | print(len(t11.f642.unique())) 392 | print(len(t5.f642.unique())) 393 | print(len(t20.f642.unique())) 394 | print(len(t16.f642.unique())) 395 | # print(len(t11.f642.unique())) 396 | 397 | --------------------------------------------------------------------------------
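The preprocessing scripts above share a common final step: slice the joined training table to one label value, project and de-duplicate the columns contributed by each source table, then write matching .csv files and contiguous float64 .npy arrays into a per-dataset "-formycs" directory (presumably read by the C++ code under RECON). The sketch below is a generalized version of that step; the helper name export_per_category and its arguments are illustrative and not part of the repository, and the per-table sort_values calls used by the individual scripts are omitted for brevity.

import os
import numpy as np

def export_per_category(joined, tables, label_col, out_dir):
    # Hypothetical helper (not repository code) capturing the shared pattern:
    # for each label value, slice the joined training frame, de-duplicate the
    # columns each source table contributes, and save .csv plus contiguous
    # float64 .npy copies, mirroring the save calls in the scripts above.
    # tables maps a table name to the list of columns belonging to it.
    for cate in np.sort(joined[label_col].unique()):
        part = joined[joined[label_col] == cate]
        for name, cols in tables.items():
            tbl = part[cols].drop_duplicates().copy()
            tbl.to_csv(os.path.join(out_dir, 'train-cate-{}-{}.csv'.format(cate, name)),
                       index=False)
            np.save(os.path.join(out_dir, 'train-cate-{}-{}.npy'.format(cate, name)),
                    np.ascontiguousarray(tbl.values.astype(np.float64)))

# Example call shaped after the IMDB scripts (frames and paths as defined there):
# export_per_category(Databackup,
#                     {'mix': mixColumns, 'mi': miColumns, 'ci': ciColumns,
#                      'name': nameColumns, 'title': titleColumns, 'mc': mcColumns},
#                     'rating', mycsDIR)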