├── .gitignore ├── LICENSE.md ├── MLModel ├── Global.py ├── LoadCoreset.py ├── LoadData.py ├── MLmodel │ ├── linearRegression.py │ └── logisticRegression.py ├── hidden.py ├── optimizer.py └── paramRange.py ├── README.md ├── RECON ├── CMakeLists.txt ├── data.h ├── global.h ├── main.cpp ├── mycsBrazil.h ├── mycsIMDBC.h ├── mycsStackn.h ├── mycsTaxi.h ├── type.h └── util.h ├── linear-universal.py ├── logistic-universal.py └── preprocess ├── Brazil.py ├── IMDBC-5.py ├── IMDBC-Linear.py ├── stack.py └── taxi.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.xml 3 | *.iml 4 | MLModel/.DS_Store 5 | .DS_Store 6 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MLModel/Global.py: -------------------------------------------------------------------------------- 1 | DATAPATH = "/home/jiayi/disk/C-craig/dataset/" 2 | CSPATH = "/home/jiayi/disk/C-craig/" -------------------------------------------------------------------------------- /MLModel/LoadCoreset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from MLModel.Global import * 3 | 4 | def LoadCoreset(coreset_from, data, subset_size, batch=0, sampleSize=0): 5 | assert coreset_from == 'diskOurs' 6 | if coreset_from == 'diskOurs': 7 | assert batch==0 8 | if batch==0: 9 | if subset_size == 0.00001: 10 | file_name = CSPATH+"inuse/{}-0.00001-ours.npz".format(data) 11 | else: 12 | file_name = CSPATH+'inuse/{}-{}-ours.npz'.format(data, str(subset_size)) 13 | print("【Load file path】 is ", file_name) 14 | 15 | 16 | if file_name != '': 17 | print(f'reading from {file_name}') 18 | dataset = np.load(f'{file_name}') 19 | order, weights, total_ordering_time = dataset['order'], dataset['weight'], dataset['order_time'] 20 | print(" 【Coreset size】 is ", order.shape) 21 | return order, weights, total_ordering_time -------------------------------------------------------------------------------- /MLModel/LoadData.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from MLModel.Global import * 3 | def load_dataset(dataset, prop=0.1, regression=False): 4 | assert dataset in ['IMDBCLinear', 'IMDBLargeCLinear', 'Brazilnew', 'IMDBC5', 'IMDBLargeC5', 'taxi', 'stackn'] 5 | 6 | X_train = np.load(DATAPATH + "{}-train-X.npy".format(dataset)) 7 | X_val = np.load(DATAPATH + "{}-val-X.npy".format(dataset)) 8 | X_test = np.load(DATAPATH + "{}-test-X.npy".format(dataset)) 9 | y_train = np.load(DATAPATH + "{}-train-y.npy".format(dataset)) 10 | y_val = np.load(DATAPATH + "{}-val-y.npy".format(dataset)) 11 | y_test = np.load(DATAPATH + "{}-test-y.npy".format(dataset)) 12 | 13 | if regression == False: 14 | assert dataset in ['IMDBC5','IMDBLargeC5', 'Brazilnew'] 15 | print("Is Multi class") 16 | if dataset in ['IMDBC5', 'IMDBLargeC5', 'Brazilnew']: 17 | num_class = 5 18 | print("Num class ", num_class) 19 | if dataset in ['Brazil5']: 20 | y_train-=1 21 | y_val-=1 22 | y_test-=1 23 | print(np.unique(y_train)) 24 | print(np.unique(y_val)) 25 | print(np.unique(y_test)) 26 | y_train = y_train.astype(np.int32) 27 | y_val = y_val.astype(np.int32) 28 | y_test = y_test.astype(np.int32) 29 | y_train = np.eye(num_class)[y_train] 30 | y_val = np.eye(num_class)[y_val] 31 | y_test = np.eye(num_class)[y_test] 32 | elif not regression: 33 | y_train = np.reshape(y_train, (-1, 1)) 34 | y_val = np.reshape(y_val, (-1, 1)) 35 | y_test = np.reshape(y_test, (-1, 1)) 36 | print(f'Training size: {len(y_train)}, Test size: {len(y_test)}') 37 | return X_train, y_train, X_val, y_val, X_test, y_test 38 | 39 | -------------------------------------------------------------------------------- /MLModel/MLmodel/linearRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | class LinearRegression(object): 4 | def __init__(self, dim): 5 | self.W = np.zeros(dim) 6 | self.params = self.W 7 | 8 | def activation(self, X, params=None): 9 | pred_ys = X.dot(self.W) 10 | return pred_ys 11 | 12 | def loss(self, X,y, l2_reg=0.00, ): 13 | num_of_samples = X.shape[0] 14 | f_mat = 
X.dot(self.W) 15 | diff = f_mat - y 16 | loss = 1.0 * np.sum(diff * diff) / num_of_samples 17 | 18 | return loss + l2_reg * np.linalg.norm(self.W) ** 2 / 2 19 | 20 | def gradient(self, X, y, l2_reg=0.00, params=None, cnt=0): 21 | num_of_samples = X.shape[0] 22 | f_mat = X.dot(self.W) 23 | diff = f_mat - y 24 | if type(diff)==np.array and diff.shape[0]==1: 25 | gradient = (diff[0]*(X)).T - l2_reg * self.W 26 | return gradient 27 | else: 28 | if type(diff) ==np.float64: 29 | gradient = (diff *X).T - l2_reg * self.W 30 | else: 31 | gradient = ((diff.T).dot(X)).T - l2_reg * self.W 32 | return gradient 33 | 34 | 35 | def MASLE(self, X,y): 36 | predict_y = self.activation(X) 37 | MAE = metrics.mean_absolute_error(y, predict_y) 38 | MSE = metrics.mean_squared_error(y, predict_y) 39 | MSLE=0 40 | return MAE, MSE, MSLE -------------------------------------------------------------------------------- /MLModel/MLmodel/logisticRegression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | 4 | def sigmoid(x): 5 | return 1. / (1 + np.exp(-x)) 6 | 7 | 8 | def softmax(x): 9 | if x.ndim == 1: 10 | e = np.exp(x - np.max(x)) 11 | else: 12 | e = np.exp(x - np.max(x, axis=1, keepdims=True)) 13 | 14 | if e.ndim == 1: 15 | return e / np.sum(e, axis=0) 16 | else: 17 | return e / np.array([np.sum(e, axis=1)]).T 18 | 19 | 20 | class LogisticRegression(object): 21 | def __init__(self, dim, num_class): 22 | self.binary = num_class == 1 23 | self.W = np.zeros((dim, num_class)) 24 | self.b = np.zeros(num_class) 25 | self.params = np.array([self.W, self.b]) 26 | 27 | def activation(self, input, params=None): 28 | W, b = params if params is not None else self.params 29 | if self.binary: 30 | return sigmoid(np.dot(input, W) + b) 31 | else: 32 | return softmax(np.dot(input, W) + b) 33 | 34 | def loss(self, input, label, l2_reg=0.00, params=None): 35 | sigmoid_activation = self.activation(input, params) 36 | 37 | cross_entropy = - np.mean(np.sum(label * np.log(sigmoid_activation) + 38 | (1 - label) * np.log(1 - sigmoid_activation), axis=1)) 39 | 40 | return cross_entropy + l2_reg * np.linalg.norm(self.W) ** 2 / 2 41 | 42 | def f1(self, input, label, params=None): 43 | if self.binary: 44 | return metrics.f1_score(label, np.rint(self.predict(input, params)), average = 'weighted') 45 | else: 46 | return metrics.f1_score(np.argmax(label, axis=1), np.argmax(self.predict(input, params), axis=1), 47 | average='weighted') 48 | def recall(self, input, label, params=None): 49 | if self.binary: 50 | return metrics.recall_score(label, np.rint(self.predict(input, params)), average = 'weighted') 51 | else: 52 | return metrics.recall_score(np.argmax(label,axis=1), np.argmax(np.rint(self.predict(input, params)), axis=1), average = 'weighted') 53 | def precision(self, input, label, params=None): 54 | if self.binary: 55 | return metrics.precision_score(label, np.rint(self.predict(input, params)), average = 'weighted') 56 | else: 57 | return metrics.precision_score(np.argmax(label, axis=1), np.argmax(self.predict(input, params),axis=1), average = 'weighted') 58 | 59 | def acc(self, input, label, params=None): 60 | if self.binary: 61 | return metrics.accuracy_score(label, np.rint(self.predict(input, params))) 62 | else: 63 | if len(label.shape)>1: 64 | 65 | label = np.argmax(label, axis=1) 66 | pred = self.predict(input, params) 67 | if len(pred.shape)>1: 68 | pred = np.argmax(pred, axis=1) 69 | return metrics.accuracy_score(label,pred) 70 | 71 | 72 | 
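    # predict() returns raw probabilities (sigmoid for binary, softmax rows for
    # multi-class); metrics such as acc()/f1() above turn them into hard labels
    # with np.rint or np.argmax, so no thresholding happens here.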
def predict(self, input, params=None): 73 | return self.activation(input, params) 74 | 75 | def accuracy(self, input, label, params=None): 76 | if self.binary: 77 | return np.mean(np.isclose(np.rint(self.predict(input, params)), label)) 78 | else: 79 | if len(label.shape)>1: 80 | label = np.argmax(label, axis=1) 81 | pred = self.predict(input, params) 82 | if len(pred.shape)>1: 83 | pred = np.argmax(pred, axis=1) 84 | return metrics.accuracy_score(label, 85 | pred) 86 | def gradient(self, input, label, l2_reg=0.00, params=None,cnt=1): 87 | p_y_given_x = self.activation(input, params) 88 | d_y = label - p_y_given_x 89 | d_W = -np.dot(np.reshape(input, (cnt, -1)).T, np.reshape(d_y.T, (cnt, -1))) - l2_reg * self.W 90 | d_b = -np.mean(d_y, axis=0) 91 | return np.array([d_W, d_b]) 92 | 93 | def gradientVec(self, input, label, cnt, l2_reg=0.00, params=None): 94 | p_y_given_x = self.activation(input, params) 95 | d_y = label - p_y_given_x 96 | d_W = -np.dot(np.reshape(input, (cnt, -1)).T, np.reshape(d_y.T, (cnt, -1))) - l2_reg * self.W 97 | d_b = -np.mean(d_y, axis=0) 98 | return np.array([d_W, d_b]) 99 | 100 | def MASLE(self, X,y): 101 | predict_y = self.activation(X) 102 | if len(predict_y.shape)>0: 103 | predict_y = np.argmax(predict_y, axis=1) 104 | 105 | if len(y.shape) > 0: 106 | y = np.argmax(y, axis=1) 107 | MAE = metrics.mean_absolute_error(y, predict_y) 108 | MSE = metrics.mean_squared_error(y, predict_y) 109 | if np.any(y<0): 110 | MSLE=0 111 | else: 112 | MSLE = metrics.mean_squared_log_error(y, predict_y) 113 | 114 | return MAE, MSE, MSLE -------------------------------------------------------------------------------- /MLModel/hidden.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | class HiddenPrints: 5 | def __init__(self, activated=True): 6 | self.activated = activated 7 | self.original_stdout = None 8 | 9 | def open(self): 10 | sys.stdout.close() 11 | sys.stdout = self.original_stdout 12 | 13 | def close(self): 14 | self.original_stdout = sys.stdout 15 | sys.stdout = open(os.devnull, 'w') 16 | 17 | def __enter__(self): 18 | if self.activated: 19 | self.close() 20 | 21 | def __exit__(self, exc_type, exc_val, exc_tb): 22 | if self.activated: 23 | self.open() -------------------------------------------------------------------------------- /MLModel/optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from torch.utils.tensorboard import SummaryWriter 4 | from MLModel.Global import * 5 | class Optimizer(object): 6 | 7 | @staticmethod 8 | def order_elements(shuffle, n, seed=1234): 9 | if shuffle == 0: 10 | indices = np.arange(n) 11 | elif shuffle == 1: 12 | indices = np.random.permutation(n) 13 | elif shuffle == 2: 14 | indices = np.random.randint(0, n, n) 15 | else: # fixed permutation 16 | np.random.seed(seed) 17 | indices = np.random.permutation(n) 18 | return indices 19 | 20 | def optimize(self, method, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 21 | if method == 'sgd': 22 | return self.sgd(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 23 | elif method == 'saga': 24 | return self.saga(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 25 | elif method == 'svrg': 26 | return self.svrg(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 27 | elif method =='BGD': 28 | return self.BGD(model, data, labels, weights, num_epochs, shuffle, lr, l2_reg) 29 | else: 30 | 
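            # Any unrecognized method name falls through to this branch: only a
            # message is printed and the call implicitly returns None, so the
            # method string should be validated by the caller.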
print('Optimizer is not defined!') 31 | 32 | def sgd(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 33 | n = len(data) 34 | W = [[]] * num_epochs 35 | T = np.empty(num_epochs) 36 | 37 | time.sleep(.1) 38 | start_epoch = time.process_time() 39 | writer = SummaryWriter(CSPATH+'/tensorboard/') 40 | for epoch in range(num_epochs): 41 | indices = self.order_elements(shuffle, n) 42 | for i in indices: 43 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 44 | 45 | model.params -= lr[epoch] * grads 46 | W[epoch] = model.params.copy() 47 | T[epoch] = (time.process_time() - start_epoch) 48 | 49 | writer.add_scalar('loss', model.loss(data,labels), global_step=epoch) 50 | return W, T 51 | 52 | def BGD(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 53 | n = len(data) 54 | W = [[]] * num_epochs 55 | T = np.empty(num_epochs) 56 | 57 | time.sleep(.1) 58 | start_epoch = time.process_time() 59 | 60 | for epoch in range(num_epochs): 61 | indices = self.order_elements(shuffle, n) 62 | # grads_ = None 63 | 64 | grads_ = model.gradient(data, labels,l2_reg, cnt=n)/n 65 | # print('grads_ is ', grads_) 66 | # for i in indices: 67 | # if grads_ is None: 68 | # grads_ = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 69 | # # grads_ = np.dot(model.gradientVec(data, labels, n, l2_reg / n) , weights) 70 | # else: 71 | # grads_ += model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 72 | # # grads_ += np.dot(model.gradient(data, labels, n, l2_reg / n) , weights) 73 | model.params -= lr[epoch] * grads_ 74 | W[epoch] = model.params.copy() 75 | T[epoch] = (time.process_time() - start_epoch) 76 | return W, T 77 | 78 | def saga(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 79 | n = len(data) 80 | W = [[]] * num_epochs 81 | T = np.empty(num_epochs) 82 | 83 | time.sleep(.1) 84 | start_epoch = time.process_time() 85 | 86 | saved_grads = np.array([model.gradient(data[i], labels[i], l2_reg / n) * weights[i] for i in range(n)]) 87 | avg_saved_grads = saved_grads.mean(axis=0) 88 | 89 | for epoch in range(num_epochs): 90 | indices = self.order_elements(shuffle, n) 91 | for i in indices: 92 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 93 | model.params -= lr[epoch] * (grads - saved_grads[i] + avg_saved_grads) 94 | avg_saved_grads += (grads - saved_grads[i]) / n 95 | saved_grads[i] = grads 96 | 97 | W[epoch] = model.params.copy() 98 | T[epoch] = (time.process_time() - start_epoch) 99 | return W, T 100 | 101 | def svrg(self, model, data, labels, weights, num_epochs, shuffle, lr, l2_reg): 102 | n = len(data) 103 | W = [[]] * num_epochs 104 | T = np.empty(num_epochs) 105 | 106 | time.sleep(.1) 107 | start_epoch = time.process_time() 108 | 109 | for epoch in range(num_epochs): 110 | init_grads = np.array([model.gradient(data[i], labels[i], l2_reg / n) * weights[i] for i in range(n)]) 111 | avg_init_grads = np.mean(init_grads, axis=0) 112 | 113 | indices = self.order_elements(shuffle, n) 114 | for i in indices: 115 | grads = model.gradient(data[i], labels[i], l2_reg / n) * weights[i] 116 | model.params -= lr[epoch] * (grads - init_grads[i] + avg_init_grads) 117 | 118 | W[epoch] = model.params.copy() 119 | T[epoch] = (time.process_time() - start_epoch) 120 | return W, T 121 | -------------------------------------------------------------------------------- /MLModel/paramRange.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def 
get_param_range(subset_size, exp_decay, method, data): 3 | if method=='BGD': 4 | g_range = [0.001] 5 | b_range = np.arange(180, 200, 1) * .005 6 | return g_range, b_range 7 | if data=='IMDBLargeCLinear': 8 | g_range = [0.0002] 9 | b_range = np.arange(20, 40, 1) * 0.005 10 | elif data in [ 'IMDBCLinear','IMDBLargeC5']: 11 | g_range = [0.0001] 12 | b_range = np.arange(180, 200, 1) * .005 13 | elif data in ['IMDBC5']: 14 | g_range = [0.001] 15 | b_range = np.arange(180, 200, 1) * .005 16 | elif data in ['Brazilnew']: 17 | g_range = [0.01] 18 | b_range = np.arange(20, 40, 1) * .005 19 | elif data in ['stackn']: 20 | g_range = [0.0001] 21 | b_range = np.arange(180, 200, 1) * .005 22 | elif data in ['taxi']: 23 | g_range = [0.0001] 24 | b_range = np.arange(20, 40, 1) * .005 25 | else: 26 | g_range = [0.1, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.25, 0.3, 0.35] 27 | b_range = [0.7, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.9, 0.95] 28 | if subset_size < 1: 29 | g_range = [0.000035, 0.009, 0.01, 0.013, 0.015, 0.017, 0.018, 0.019, 0.02, 0.025, 0.03] 30 | b_range = np.arange(0, 19) * .01 31 | return g_range, b_range 32 | 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RECON 2 | This repo contains the code for the VLDB 2023 paper [_Coresets over multiple tables for feature-rich and data-efficient machine learning_](https://www.vldb.org/pvldb/vol16/p64-wang.pdf). 3 | 4 | 5 | # Quick Start 6 | 7 | ## Folder Structure 8 | 9 | . 10 | ├── preprocess # Data preprocessing code 11 | ├── RECON # RECON code for coreset construction 12 | ├── MLModel # Code for training ML models to evaluate RECON 13 | ├── linear-universal.py # Evaluation of regression models 14 | ├── logistic-universal.py # Evaluation of classification models 15 | └── README.md 16 | 17 | 18 | 19 | ## Requirements 20 | Before running the code, please make sure your compiler supports `C++14` or above. 21 | The cnpy library is also needed to save results in npz format. 22 | 23 | The dataset path is configured by the variable `DATAPATH` (line 9 in global.h), which should also be configured properly before running the code. 24 | The datasets can be downloaded from [dataset link](https://drive.google.com/drive/folders/1kOLJQRnJk-_87y3WVq8Dwu18JYylbQhb?usp=sharing). 25 | - `Python 3.7+` 26 | 27 | - `C++14` 28 | - `cnpy: a library to read/write .npy and .npz files in C/C++` [link](https://github.com/rogersce/cnpy) 29 | 30 | 31 | 32 | ## Usage 33 | 34 | ### RECON on IMDB / IMDB-Large: 35 | First build `./RECON` by: 36 | 37 | - `cd RECON` 38 | 39 | - `cmake .` 40 | 41 | - `make` 42 | 43 | 44 | and then perform RECON on different datasets by passing different arguments. 
45 | > parameter setting: 46 | >> [dataName] [proportion] [0:IMDB 1:IMDB-Large] [0:Classification 1:Regression] 47 | 48 | - `IMDB, p=0.0128 for classification: ./RECON IMDB 0.0128 0 0 ` 49 | - `IMDB, p=0.0032 for regression: ./RECON IMDB 0.0032 0 1` 50 | - `IMDB-Large, p=0.0016 for classification: ./RECON IMDB 0.0016 1 0` 51 | - `IMDB-Large, p=0.0016 for regression: ./RECON IMDB 0.0016 1 1` 52 | 53 | 54 | 55 | ### RECON on stack / Brazil / taxi: 56 | 57 | 58 | > parameter setting: 59 | >> [dataName] [proportion] 60 | - `stack, p=0.0032: ./RECON stack 0.0032` 61 | - `Brazil, p=0.0016: ./RECON Brazil 0.0016` 62 | - `taxi, p=0.0032: ./RECON taxi 0.0032` 63 | 64 | > Note: `-L/usr/local/lib/ -lcnpy -lz` may also need to be added to the linker arguments, depending on how cnpy was installed. 65 | 66 | **Note:** Before running RECON, make sure the variable `DATAPATH` (line 9 in global.h) is configured as the path of the dataset. 67 | Besides, make sure the variable `CSPATH` (line 10 in global.h) is configured as the location to save RECON's output, i.e., the coresets. 68 | 69 | 70 | ### Training Logistic Regression 71 | Run `logistic-universal.py` to train logistic regression models. 72 | 73 | - IMDB: `python logistic-universal.py --data IMDBC5 --method sgd -s 0.0128 ` 74 | 75 | - IMDB-Large: `python logistic-universal.py --data IMDBLargeC5 --method sgd -s 0.0016 ` 76 | 77 | 78 | - Brazil: `python logistic-universal.py --data Brazilnew --method sgd -s 0.0016 ` 79 | 80 | 81 | 82 | ### Training Linear Regression 83 | Run `linear-universal.py` to train linear regression models. 84 | 85 | - IMDB: `python linear-universal.py --data IMDBCLinear --method sgd -s 0.0032 ` 86 | 87 | - IMDB-Large: `python linear-universal.py --data IMDBLargeCLinear --method sgd -s 0.0016 ` 88 | 89 | - stack: `python linear-universal.py --data stackn --method sgd -s 0.0032` 90 | 91 | 92 | - taxi: `python linear-universal.py --data taxi --method sgd -s 0.0032` 93 | 94 | **Note:** Before training models, make sure the variable `DATAPATH` (line 1 in Global.py) is configured as the path of the datasets. 95 | Also make sure `CSPATH` (line 2 in Global.py) is configured as the path to RECON's output (the coresets). 96 | 97 | ### Other Baselines 98 | 99 | - **Sample-Join**: The argument `--greedy [0:Uniform Sampling 1:Coreset (default)]` specifies the subset for training. 100 | Sample-Join can be achieved by setting `--greedy 0`. 101 | For example, to train a logistic regression model on a uniform sample of IMDB, you may use: 102 | ```sh 103 | python logistic-universal.py --data IMDBC5 --method sgd -s 0.0128 --greedy 0 104 | ``` 105 | 106 | 107 | - **Full**: Training on the full data can be achieved by setting `-s` to `1` on top of Sample-Join. 108 | For example, to train a logistic regression model using the full data of IMDB, you may use: 109 | ```sh 110 | python logistic-universal.py --data IMDBC5 --method sgd -s 1 --greedy 0 111 | ``` 112 | 113 | - **Coreset-Join** and **Join-Coreset**: You can find their official implementations at [link](https://github.com/baharanm/craig). 114 | 115 | ### Data Preprocessing 116 | In general, our preprocessing of each dataset in the `preprocess` directory can be summarized as data cleaning, normalization, and partitioning by label. 117 | We provide the preprocessed data in [dataset link](https://drive.google.com/drive/folders/1kOLJQRnJk-_87y3WVq8Dwu18JYylbQhb?usp=sharing). 118 | The raw datasets can be found in their original sources. 119 | 120 | ## License 121 | 122 | The project is available under the [MIT](LICENSE.md) license. 
123 | 124 | ## Citation 125 | If our work is helpful to you, please cite our [paper](https://www.vldb.org/pvldb/vol16/p64-wang.pdf): 126 | ```bibtex 127 | @article{wang2022coresets, 128 | title={Coresets over multiple tables for feature-rich and data-efficient machine learning}, 129 | author={Wang, Jiayi and Chai, Chengliang and Tang, Nan and Liu, Jiabin and Li, Guoliang}, 130 | journal={Proceedings of the VLDB Endowment}, 131 | volume={16}, 132 | number={1}, 133 | pages={64--76}, 134 | year={2022}, 135 | publisher={VLDB Endowment} 136 | } 137 | 138 | ``` 139 | -------------------------------------------------------------------------------- /RECON/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(RECON) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | 6 | set(LINK_DIR /usr/local/lib/) 7 | set(INC_DIR /usr/local/include/) 8 | 9 | include_directories(${INC_DIR}) 10 | link_directories(${LINK_DIR}) 11 | link_libraries(cnpy) 12 | link_libraries(z) 13 | 14 | FIND_PACKAGE(OpenMP REQUIRED) 15 | if(OPENMP_FOUND) 16 | set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS}") 17 | set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}") 18 | endif() 19 | 20 | 21 | add_executable(RECON main.cpp) 22 | target_link_libraries(RECON cnpy z) -------------------------------------------------------------------------------- /RECON/data.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_DATA_H 2 | #define UNTITLED2_DATA_H 3 | #include "type.h" 4 | #include 5 | #include "type.h" 6 | #include "global.h" 7 | void loadData(char * data= nullptr){ 8 | if(!data) { 9 | assert(0); 10 | } 11 | std::stringstream ss; 12 | std::string dataName(data); 13 | ss.str(""); 14 | ss << DATAPATH << dataName << "-train-X.npy"; 15 | cnpy::NpyArray trainX = cnpy::npy_load(ss.str()); 16 | n = trainX.shape[0]; 17 | d = trainX.shape[1]; 18 | X = (dtype *) malloc(n * d * sizeof(dtype)); 19 | memcpy(X, trainX.data(), n * d * sizeof(dtype)); 20 | 21 | ss.str(""); 22 | ss << DATAPATH << dataName << "-train-y.npy"; 23 | cnpy::NpyArray trainY = cnpy::npy_load(ss.str()); 24 | assert(trainY.shape[0] == trainX.shape[0]); 25 | std::cout<<"word size is "<(), n * sizeof(labeltype)); 29 | 30 | } 31 | 32 | #endif //UNTITLED2_DATA_H 33 | 34 | 35 | -------------------------------------------------------------------------------- /RECON/global.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_GLOBAL_H 2 | #define UNTITLED2_GLOBAL_H 3 | #include 4 | #include 5 | #include 6 | #include "cnpy.h" 7 | 8 | 9 | const std::string DATAPATH ="/home/jiayi/disk/C-craig/dataset/"; 10 | const std::string CSPATH ="/home/jiayi/disk/C-craig/inuse/"; 11 | const int tc = 16; 12 | dtype *X; 13 | labeltype *Y; 14 | dtype *similarity; 15 | idtype n, d, N; 16 | idtype * Map; 17 | std::map cateNum; 18 | int cateCnt; 19 | dtype alpha = 1.; 20 | idtype target_coreset_size; 21 | idtype real_coreset_size; 22 | idtype* nn; 23 | dtype* maxSim; 24 | dtype* weight; 25 | dtype * lazy; 26 | idtype * idx; 27 | idtype * invidx; 28 | std::vector weight_vec; 29 | std::priority_queue > pq; 30 | std::vector coreset; 31 | std::vector coresetAll; 32 | dtype curSum; 33 | dtype f_norm; 34 | dtype norm; 35 | idtype cSize; 36 | 37 | void freeAll(){ 38 | free(Map); 39 | free(lazy); 40 | free(invidx); 41 | free(idx); 42 | free(similarity); 43 | free(nn); 44 | free(maxSim); 45 | free(weight); 46 | } 47 | #endif //UNTITLED2_GLOBAL_H 48 | 49 | 
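The paths above (`DATAPATH`, `CSPATH`) and the cnpy dependency are the usual sources of setup trouble. A minimal standalone sanity check, not part of the repository (the file name `check_cnpy.cpp`, the `IMDBC5` dataset name, and the float64 element type are assumptions), can confirm that cnpy links and that `DATAPATH` points at the downloaded arrays before RECON is run:

```cpp
// check_cnpy.cpp -- hypothetical sanity check, not part of RECON.
// Build it separately, e.g.:
//   g++ -std=c++14 check_cnpy.cpp -I/usr/local/include -L/usr/local/lib -lcnpy -lz -o check_cnpy
#include <iostream>
#include <string>
#include "cnpy.h"

int main() {
    // Same naming convention as data.h / LoadData.py: DATAPATH + "<dataset>-train-X.npy".
    const std::string DATAPATH = "/home/jiayi/disk/C-craig/dataset/";  // adjust to your setup
    cnpy::NpyArray arr = cnpy::npy_load(DATAPATH + "IMDBC5-train-X.npy");
    std::cout << "rows: " << arr.shape[0] << ", cols: " << arr.shape[1]
              << ", word size: " << arr.word_size << " bytes\n";
    // Assumes the arrays were saved as float64; switch to float if word_size prints 4.
    const double* x = arr.data<double>();
    std::cout << "first value: " << x[0] << "\n";
    return 0;
}
```

If this compiles with the same `-lcnpy -lz` flags mentioned in the README and prints a sensible shape, the RECON binary should be able to read the same files through `data.h`.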
-------------------------------------------------------------------------------- /RECON/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "type.h" 5 | #include "util.h" 6 | #include "data.h" 7 | #include "omp.h" 8 | #include "cnpy.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "mycsIMDBC.h" 16 | #include "mycsStackn.h" 17 | #include "mycsTaxi.h" 18 | #include "mycsBrazil.h" 19 | #include 20 | 21 | using namespace std::chrono; 22 | using std::chrono::system_clock; 23 | 24 | 25 | int main(int argc, char** argv) { 26 | 27 | omp_set_num_threads(tc); 28 | 29 | std::cout<> sim_time(0); 34 | 35 | if(dataName == "IMDB") 36 | sim_time = IMDBC::testIMDBC(std::stod(argv[2]), std::atol(argv[3]),0.01, std::atol(argv[4])); 37 | else if(dataName == "stack") 38 | sim_time = stackn::testStackn(std::stod(argv[2])); 39 | else if(dataName == "Brazil") 40 | sim_time = Brazil::testBrazil(std::stod(argv[2])); 41 | else if(dataName == "taxi") 42 | sim_time = taxi::testTaxi(std::stod(argv[2])); 43 | 44 | 45 | auto en = system_clock::now(); 46 | auto duration = duration_cast(en - st); 47 | std::cout << "### Find Coreset Spent " 48 | << double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 49 | std::cout << "### Find Coreset(except sim) Spent " 50 | << double((duration - sim_time).count()) * microseconds::period::num / microseconds::period::den 51 | << " seconds.\n"; 52 | } 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /RECON/mycsBrazil.h: -------------------------------------------------------------------------------- 1 | #ifndef UNTITLED2_MYCSBrazil_H 2 | #define UNTITLED2_MYCSBrazil_H 3 | 4 | 5 | #include "cnpy.h" 6 | #include "type.h" 7 | #include "util.h" 8 | #include "data.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace Brazil{ 16 | using std::chrono::system_clock; 17 | using std::chrono::duration_cast; 18 | using std::chrono::microseconds; 19 | std::random_device rd; 20 | std::mt19937 mt(rd()); 21 | 22 | 23 | cnpy::NpyArray reviewArr; 24 | cnpy::NpyArray orderArr; 25 | cnpy::NpyArray orderItemArr; 26 | cnpy::NpyArray productArr; 27 | cnpy::NpyArray joinArr; 28 | 29 | 30 | dtype *dp; 31 | dtype *review, *order, *orderItem, *product, *join; 32 | idtype reviewNum, reviewDim, orderNum, orderDim, orderItemNum, orderItemDim, productNum, productDim, joinNum, joinDim; 33 | dtype *reviewSim, *orderSim, *orderItemSim, *productSim; 34 | 35 | 36 | cnpy::NpyArray loadNpy(std::string fileDir); 37 | void readBrazilnewNpy(int cate); 38 | void mallocBrazilnewArray(); 39 | void loadToArr(int cate); 40 | 41 | 42 | void mallocBrazilnewSim(); 43 | void calBrazilnewSim(); 44 | void initWeight(); 45 | 46 | 47 | void sampleOneBrazilnew(idtype &uID, idtype &ID, idtype &qID, idtype &rowID, 48 | idtype &samplejoinID); 49 | void sampleBatchBrazilnew(int sampleSize, 50 | std::vector &uIDs, 51 | std::vector &IDs, 52 | std::vector &qIDs, 53 | std::vector &rowIDs, 54 | std::vector &joinIDs); 55 | 56 | void realAddOne(idtype joinID); 57 | 58 | 59 | dtype getBenefitBrazilnew(idtype uID, 60 | idtype ID, 61 | idtype qID, 62 | idtype rowID, 63 | idtype joinID, 64 | bool change, 65 | int verbose); 66 | 67 | std::chrono::duration> testBrazil(dtype PROP, 68 | dtype epsilon, 69 | int saveWhere, 70 | int verbose 71 | ); 72 | 73 | std::vector fullCS; 74 | std::vector 
fullCSWeight; 75 | dtype rW = 33. / 100, oW = 33. / 100, orderItemW = 33. / 100, pW= 1./100; 76 | 77 | 78 | void freeBrazilnew() { 79 | free(review); 80 | free(orderItem); 81 | free(order); 82 | free(product); 83 | free(join); 84 | free(reviewSim); 85 | free(orderItemSim); 86 | free(orderSim); 87 | free(productSim); 88 | free(dp); 89 | } 90 | 91 | 92 | cnpy::NpyArray loadNpy(std::string fileDir) { 93 | 94 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 95 | return arr; 96 | } 97 | 98 | 99 | void readBrazilnewNpy(int cate) { 100 | std::stringstream dir; 101 | dir.str(""); 102 | 103 | 104 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 105 | << cate << "-review.npy"; 106 | reviewArr = loadNpy(dir.str()); 107 | dir.str(""); 108 | 109 | 110 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 111 | << cate << "-orderItem.npy"; 112 | orderItemArr = loadNpy(dir.str()); 113 | dir.str(""); 114 | 115 | 116 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 117 | << cate << "-order.npy"; 118 | orderArr = loadNpy(dir.str()); 119 | dir.str(""); 120 | 121 | 122 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 123 | << cate << "-product.npy"; 124 | productArr = loadNpy(dir.str()); 125 | dir.str(""); 126 | 127 | 128 | dir << DATAPATH<< "Brazilnew-formycs/train-cate-" 129 | << cate << "-joined.npy"; 130 | joinArr = loadNpy(dir.str()); 131 | dir.str(""); 132 | } 133 | 134 | 135 | void mallocBrazilnewArray() { 136 | 137 | 138 | reviewNum = reviewArr.shape[0]; 139 | reviewDim = reviewArr.shape[1]; 140 | review = (dtype *) malloc(reviewNum * reviewDim * sizeof(dtype)); 141 | 142 | 143 | orderItemNum = orderItemArr.shape[0]; 144 | orderItemDim = orderItemArr.shape[1]; 145 | orderItem = (dtype *) malloc(orderItemNum * orderItemDim * sizeof(dtype)); 146 | 147 | 148 | orderNum = orderArr.shape[0]; 149 | orderDim = orderArr.shape[1]; 150 | order = (dtype *) malloc(orderNum * orderDim * sizeof(dtype)); 151 | 152 | productNum = productArr.shape[0]; 153 | productDim = productArr.shape[1]; 154 | product = (dtype *) malloc(productNum * productDim * sizeof(dtype)); 155 | 156 | 157 | joinNum = joinArr.shape[0]; 158 | joinDim = joinArr.shape[1]; 159 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 160 | } 161 | 162 | void loadToArr(int cate) { 163 | 164 | readBrazilnewNpy(cate); 165 | mallocBrazilnewArray(); 166 | 167 | memcpy(review, reviewArr.data(), 1LL * reviewNum * reviewDim * sizeof(dtype)); 168 | memcpy(orderItem, orderItemArr.data(), 1LL * orderItemNum * orderItemDim * sizeof(dtype)); 169 | memcpy(order, orderArr.data(), 1LL * orderNum * orderDim * sizeof(dtype)); 170 | memcpy(product, productArr.data(), 1LL * productNum * productDim * sizeof(dtype)); 171 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 172 | } 173 | 174 | void mallocBrazilnewSim() { 175 | 176 | 177 | reviewSim = (dtype *) malloc(reviewNum * reviewNum * sizeof(dtype)); 178 | orderItemSim = (dtype *) malloc(orderItemNum * orderItemNum * sizeof(dtype)); 179 | orderSim = (dtype *) malloc(orderNum * orderNum * sizeof(dtype)); 180 | productSim = (dtype *) malloc(productNum * productNum * sizeof(dtype)); 181 | } 182 | 183 | void calBrazilnewSim() { 184 | 185 | initSim(reviewSim, review, reviewNum, reviewDim, 3); 186 | 187 | initSim(orderSim, order, orderNum, orderDim, 1); 188 | 189 | initSim(orderItemSim, orderItem, orderItemNum, orderItemDim, 3); 190 | 191 | initSim(productSim, product, productNum, productDim, 1); 192 | 193 | 194 | 195 | } 196 | 197 | 198 | std::vector joinIDs; 199 | void initWeight(){ 200 | 
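        // Every pre-joined orderItem row is a candidate join result: joinIDs holds the
        // rows that have not been picked yet, and dp[i] caches the best weighted
        // similarity any selected point achieves for row i (0 before anything is
        // chosen), which is what getBenefitBrazilnew() compares against when it
        // scores a candidate.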
201 | joinIDs.clear(); 202 | joinIDs.reserve(orderItemNum); 203 | dp = (dtype *)malloc(orderItemNum * sizeof(dtype)); 204 | memset(dp,0, orderItemNum * sizeof(dtype)); 205 | for(int i= 0 ;i < orderItemNum; i++) 206 | joinIDs.emplace_back(i); 207 | } 208 | 209 | 210 | void sampleOneBrazilnew(idtype &rID, 211 | idtype &oID, 212 | idtype &pID, 213 | idtype &rowID, 214 | idtype &joinID){ 215 | 216 | int id = joinIDs[mt()% joinIDs.size()]; 217 | 218 | idtype idx_st = id * joinDim; 219 | rID = join[idx_st]; 220 | oID = join[idx_st + 1]; 221 | pID = join[idx_st + 2]; 222 | rowID = join[idx_st + 3]; 223 | joinID = id; 224 | } 225 | 226 | 227 | void sampleBatchBrazilnew(int sampleSize, 228 | std::vector& rIDs, 229 | std::vector& oIDs, 230 | std::vector& pIDs, 231 | std::vector& rowIDs, 232 | std::vector& joinIDs){ 233 | rIDs.resize(sampleSize); 234 | oIDs.resize(sampleSize); 235 | pIDs.resize(sampleSize); 236 | rowIDs.resize(sampleSize); 237 | joinIDs.resize(sampleSize); 238 | 239 | for(int i = 0; i < sampleSize; i ++) 240 | sampleOneBrazilnew(rIDs[i], 241 | oIDs[i], 242 | pIDs[i], 243 | rowIDs[i], 244 | joinIDs[i]); 245 | } 246 | 247 | void realAddOne(idtype joinID){ 248 | 249 | for(int i = 0 ;i < joinIDs.size();i++){ 250 | if(joinIDs[i] == joinID){ 251 | std::swap(joinIDs[joinIDs.size()-1 ], joinIDs[i]); 252 | joinIDs.pop_back(); 253 | break; 254 | } 255 | } 256 | } 257 | 258 | dtype getBenefitBrazilnew(idtype rID, 259 | idtype oID, 260 | idtype pID, 261 | idtype rowID, 262 | idtype joinID, 263 | bool change=false, 264 | int verbose=1){ 265 | 266 | dtype simSum = 0; 267 | dtype thisWeight = 0.; 268 | 269 | idtype sim_loc_review = oID * orderNum; 270 | idtype sim_loc_order = oID * orderNum; 271 | idtype sim_loc_orderItem = joinID * orderItemNum; 272 | idtype sim_loc_product = pID * productNum; 273 | 274 | 275 | 276 | idtype idx_loc = 0; 277 | for(int i = 0, jID=0 ; i < orderItemNum; i++, idx_loc+=orderItemDim, jID++){ 278 | idtype oid_ = orderItem[idx_loc]; 279 | idtype rowID_ = orderItem[idx_loc + 1]; 280 | idtype pID_ = orderItem[idx_loc + 2]; 281 | 282 | 283 | dtype tempDP = reviewSim[sim_loc_review + oid_] * rW; 284 | tempDP += orderSim[sim_loc_order + oid_] * oW; 285 | tempDP += orderItemSim[sim_loc_orderItem + jID] * orderItemW; 286 | tempDP += productSim[sim_loc_product + pID_] * pW; 287 | 288 | if(tempDP > dp[i] && change){ 289 | dp[i] = tempDP; 290 | if(cs.nn[i] !=-1){ 291 | cs.weight[cs.nn[i]] -= 1; 292 | } 293 | cs.nn[i] = cs.weight.size(); 294 | thisWeight += 1; 295 | } 296 | simSum += std::max(tempDP, dp[i]); 297 | } 298 | 299 | 300 | if(change) { 301 | cs.curSum = simSum; 302 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 303 | 304 | cs.add(rowID); 305 | cs.weight.emplace_back(thisWeight); 306 | if(verbose) 307 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 308 | 100. * cs.weight.size() / cs.siz); 309 | realAddOne(joinID); 310 | } 311 | 312 | return cs.norm * std::log(1. 
+ cs.f_norm * simSum) - cs.curSum; 313 | } 314 | 315 | 316 | 317 | std::chrono::duration > testBrazil(dtype PROP, 318 | dtype epsilon = 0.01, 319 | int saveWhere=0, 320 | int verbose=1 321 | ) { 322 | fullCS.clear(); 323 | fullCSWeight.clear(); 324 | std::chrono::duration> sim_time(0); 325 | 326 | std::vector rIDs; 327 | std::vector oDs; 328 | std::vector pIDs; 329 | std::vector rowIDs; 330 | std::vector samplejoinIDs; 331 | 332 | for (int cate = 0; cate <5; cate++) { 333 | auto st = system_clock::now(); 334 | if (verbose) 335 | std::cout << "############# Current category is " << cate << " ##########\n"; 336 | 337 | 338 | loadToArr(cate); 339 | initWeight(); 340 | 341 | mallocBrazilnewSim(); 342 | calBrazilnewSim(); 343 | 344 | 345 | assert(joinNum == orderItemNum); 346 | 347 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 348 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 349 | 350 | idtype csSize = (idtype) (PROP * joinNum + 0.5); 351 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 352 | 353 | 354 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 355 | 356 | 357 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 358 | if (ano < sampleEachStep) 359 | sampleEachStep = ano; 360 | 361 | std::cout<<"sample each step is "<(en - st); 368 | sim_time += duration; 369 | 370 | while (csSize--) { 371 | dtype curMaxBenefit = -1; 372 | idtype curMaxBenefitID = 0; 373 | 374 | 375 | std::vector rIDs; 376 | std::vector oIDs; 377 | std::vector pIDs; 378 | std::vector rowIDs; 379 | std::vector samplejoinIDs; 380 | 381 | sampleBatchBrazilnew(sampleEachStep, rIDs, oIDs, pIDs, rowIDs, samplejoinIDs); 382 | std::vector benefit_vec(sampleEachStep); 383 | 384 | #pragma omp parallel for schedule(static) 385 | for (int i = 0; i < sampleEachStep; i++) 386 | benefit_vec[i] = getBenefitBrazilnew(rIDs[i], oIDs[i], pIDs[i], rowIDs[i],samplejoinIDs[i], 0,0); 387 | idtype i = 0; 388 | for (auto val : benefit_vec) { 389 | if (val > curMaxBenefit) { 390 | curMaxBenefit = val; 391 | curMaxBenefitID = i; 392 | } 393 | ++i; 394 | } 395 | i = curMaxBenefitID; 396 | 397 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 398 | benefit_vec[i] = getBenefitBrazilnew(rIDs[i], oIDs[i], pIDs[i], rowIDs[i],samplejoinIDs[i], 1, 0); 399 | } 400 | 401 | freeAll(); 402 | freeBrazilnew(); 403 | 404 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 405 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 406 | 407 | 408 | if(verbose)std::cout << "Finished!\n"; 409 | } 410 | 411 | if(verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 412 | 413 | if(verbose)std::cout << "@### 【Similarity】 Spent " << double(sim_time.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 414 | 415 | 416 | assert(!saveWhere); 417 | if (!saveWhere) { 418 | std::stringstream dir; 419 | dir.str(""); 420 | dir< 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "time.h" 15 | #include "assert.h" 16 | 17 | namespace IMDBC { 18 | using std::chrono::system_clock; 19 | using std::chrono::duration_cast; 20 | using std::chrono::microseconds; 21 | std::random_device rd; 22 | std::mt19937 mt(rd()); 23 | 24 | cnpy::NpyArray miArr; 25 | cnpy::NpyArray mixArr; 26 | cnpy::NpyArray titleArr; 27 | cnpy::NpyArray nameArr; 28 | cnpy::NpyArray ciArr; 29 | cnpy::NpyArray mcArr; 30 | cnpy::NpyArray mapArr; 31 | 32 | dtype *genders, *countries; 33 | idtype jN; 34 | dtype *dp; 35 | dtype *mi, *mix, 
*title, *name, *ci, *mc; 36 | idtype maxMovieID; 37 | idtype miNum, miDim, mixNum, mixDim, titleNum, titleDim, nameNum, nameDim, ciNum, ciDim, mcNum, mcDim; 38 | idtype mapNum, mapDim; 39 | idtype *hashMapV; 40 | std::unordered_map hashMap; 41 | dtype *mvSim, *mixSim, *miSim, *titleSim; 42 | dtype *mRowMap; 43 | dtype mixWeight, miWeight, titleWeight, personWeight, companyWeight; 44 | std::discrete_distribution<> movieDis; 45 | std::vector movies; 46 | std::vector movieWeight; 47 | std::vector > moviePerson; 48 | std::vector > movieCompany; 49 | std::vector constmovieWeight; 50 | std::vector fullCS; 51 | std::vector fullCSWeight; 52 | 53 | 54 | cnpy::NpyArray loadNpy(std::string fileDir) { 55 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 56 | return arr; 57 | } 58 | 59 | void readIMDBCNpy(int cate, int Large = 0, int linear = 0, int cateNum = 10) { 60 | std::stringstream dir; 61 | dir.str(""); 62 | if (linear == 0) { 63 | if (cateNum == 10) 64 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 65 | << cate << "-" << "mi.npy"; 66 | else 67 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 68 | << cate << "-" << "mi.npy"; 69 | } else 70 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 71 | << "-" << "mi.npy"; 72 | miArr = loadNpy(dir.str()); 73 | dir.str(""); 74 | if (linear == 0) { 75 | if (cateNum == 10) 76 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 77 | << cate << "-" << "mix.npy"; 78 | else 79 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 80 | << cate << "-" << "mix.npy"; 81 | } else 82 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 83 | << "-" << "mix.npy"; 84 | mixArr = loadNpy(dir.str()); 85 | dir.str(""); 86 | if (linear == 0) { 87 | if (cateNum == 10) 88 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 89 | << cate << "-" << "title.npy"; 90 | else 91 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 92 | << cate << "-" << "title.npy"; 93 | } else 94 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 95 | << "-" << "title.npy"; 96 | titleArr = loadNpy(dir.str()); 97 | dir.str(""); 98 | if (linear == 0) { 99 | if (cateNum == 10) 100 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 101 | << cate << "-" << "name.npy"; 102 | else 103 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 104 | << cate << "-" << "name.npy"; 105 | } else 106 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 107 | << "-" << "name.npy"; 108 | nameArr = loadNpy(dir.str()); 109 | dir.str(""); 110 | if (linear == 0) { 111 | if (cateNum == 10) 112 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 113 | << cate << "-" << "ci.npy"; 114 | else 115 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 116 | << cate << "-" << "ci.npy"; 117 | } else 118 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 119 | << "-" << "ci.npy"; 120 | ciArr = loadNpy(dir.str()); 121 | dir.str(""); 122 | if (linear == 0) { 123 | if (cateNum == 10) 124 | dir << DATAPATH << (Large ? 
"IMDBLargeC10" : "IMDBC10") << "-formycs/train-cate-" 125 | << cate << "-" << "mc.npy"; 126 | else 127 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/train-cate-" 128 | << cate << "-" << "mc.npy"; 129 | } else 130 | dir << DATAPATH << (Large ? "IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/train-cate-" << cate 131 | << "-" << "mc.npy"; 132 | mcArr = loadNpy(dir.str()); 133 | } 134 | 135 | void mallocCArray() { 136 | mRowMap = (dtype *) malloc(100000 * sizeof(dtype)); 137 | genders = (dtype *) malloc(1000000 * sizeof(dtype)); 138 | countries = (dtype *) malloc(1000000 * sizeof(dtype)); 139 | 140 | miNum = miArr.shape[0]; 141 | miDim = miArr.shape[1]; 142 | mi = (dtype *) malloc(miNum * miDim * sizeof(dtype)); 143 | 144 | mixNum = mixArr.shape[0]; 145 | mixDim = mixArr.shape[1]; 146 | mix = (dtype *) malloc(mixNum * mixDim * sizeof(dtype)); 147 | titleNum = titleArr.shape[0]; 148 | titleDim = titleArr.shape[1]; 149 | title = (dtype *) malloc(titleNum * titleDim * sizeof(dtype)); 150 | nameNum = nameArr.shape[0]; 151 | nameDim = nameArr.shape[1]; 152 | name = (dtype *) malloc(nameNum * nameDim * sizeof(dtype)); 153 | ciNum = ciArr.shape[0]; 154 | ciDim = ciArr.shape[1]; 155 | ci = (dtype *) malloc(ciNum * ciDim * sizeof(dtype)); 156 | mcNum = mcArr.shape[0]; 157 | mcDim = mcArr.shape[1]; 158 | mc = (dtype *) malloc(mcNum * mcDim * sizeof(dtype)); 159 | } 160 | 161 | void loadToArr(int cate, int Large = 0, int linear = 0, int cateNum = 10) { 162 | readIMDBCNpy(cate, Large, linear, cateNum); 163 | mallocCArray(); 164 | memcpy(mi, miArr.data(), 1LL * miNum * miDim * sizeof(dtype)); 165 | memcpy(mix, mixArr.data(), 1LL * mixNum * mixDim * sizeof(dtype)); 166 | memcpy(title, titleArr.data(), 1LL * titleNum * titleDim * sizeof(dtype)); 167 | memcpy(name, nameArr.data(), 1LL * nameNum * nameDim * sizeof(dtype)); 168 | memcpy(ci, ciArr.data(), 1LL * ciNum * ciDim * sizeof(dtype)); 169 | memcpy(mc, mcArr.data(), 1LL * mcNum * mcDim * sizeof(dtype)); 170 | } 171 | 172 | void mallocIMDBCSim() { 173 | mvSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 174 | mixSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 175 | miSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 176 | titleSim = (dtype *) malloc(titleNum * titleNum * sizeof(dtype)); 177 | } 178 | 179 | void calIMDBCSim() { 180 | initSim(mixSim, mix, mixNum, mixDim, 1, mixDim - 1); 181 | initSim(miSim, mi, miNum, miDim, 1); 182 | initSim(titleSim, title, titleNum, titleDim, 1); 183 | mixWeight = 1.0 / 6; 184 | miWeight = 1.0 / 6; 185 | titleWeight = 1.0 / 6; 186 | personWeight = 1.0 / 2; 187 | idtype st_id = 0; 188 | 189 | for (idtype i = 0; i < miNum; i++, st_id += miNum) { 190 | #pragma omp parallel for schedule(static) 191 | for (idtype j = 0; j < miNum; j++) { 192 | mvSim[st_id + j] = mixSim[st_id + j] * mixWeight 193 | + miSim[st_id + j] * miWeight 194 | + titleSim[st_id + j] * titleWeight; 195 | } 196 | } 197 | } 198 | void initWeight(int verbose = 0) { 199 | maxMovieID = 0; 200 | jN = 0; 201 | movies.clear(); 202 | for (idtype i = 0; i < nameNum; i++) { 203 | idtype pid = name[i * nameDim]; 204 | genders[pid] = name[i * nameDim + 1]; 205 | } 206 | for (idtype i = 0; i < mcNum; i++) { 207 | idtype cid = mc[i * mcDim + 1]; 208 | countries[cid] = mc[i * mcDim + 2]; 209 | } 210 | for (idtype i = 0; i < titleNum; i++) { 211 | maxMovieID = std::max(maxMovieID, (idtype) title[i * titleDim]); 212 | mRowMap[(idtype) title[i * titleDim]] = i; 213 | movies.emplace_back(title[i * titleDim]); 
214 | } 215 | if (verbose)std::cout << "Max movie ID is " << maxMovieID << "!\n"; 216 | 217 | moviePerson.resize(maxMovieID + 1); 218 | movieCompany.resize(maxMovieID + 1); 219 | 220 | constmovieWeight.clear(); 221 | constmovieWeight.resize(2 * (maxMovieID + 1)); 222 | movieWeight.clear(); 223 | movieWeight.resize(maxMovieID + 1); 224 | 225 | for (idtype i = 0; i <= maxMovieID; i++) { 226 | moviePerson[i].clear(); 227 | movieCompany[i].clear(); 228 | } 229 | 230 | for (idtype i = 0; i < ciNum; i++) { 231 | idtype person_id = ci[i * ciDim + 0]; 232 | idtype movie_id = ci[i * ciDim + 1]; 233 | moviePerson[movie_id].emplace_back(person_id); 234 | } 235 | if (verbose)std::cout << "moviePerson Weight set finished!\n"; 236 | 237 | for (idtype i = 0; i < mcNum; i++) { 238 | idtype movie_id = mc[i * mcDim + 0]; 239 | idtype company_id = mc[i * mcDim + 1]; 240 | movieCompany[movie_id].emplace_back(company_id); 241 | } 242 | if (verbose)std::cout << "movieCompany Weight set finished!\n"; 243 | idtype sm = 0; 244 | for (idtype i = 0; i <= maxMovieID; i++) { 245 | movieWeight[i] = (idtype) moviePerson[i].size() * movieCompany[i].size(); 246 | idtype maleCnt = 0, femaleCnt = 0; 247 | for (auto p:moviePerson[i]) { 248 | if (genders[p] == 1)++maleCnt; 249 | else ++femaleCnt; 250 | } 251 | constmovieWeight[i << 1] = (idtype) femaleCnt * movieCompany[i].size(); 252 | constmovieWeight[i << 1 | 1] = (idtype) maleCnt * movieCompany[i].size(); 253 | jN += movieWeight[i]; 254 | sm += moviePerson[i].size(); 255 | } 256 | if (verbose)std::cout << "sm total is " << sm << "\n"; 257 | if (verbose)std::cout << "movie Weight set finished!\n"; 258 | movieDis = std::discrete_distribution<>(movieWeight.begin(), movieWeight.end()); 259 | dp = (dtype *) malloc(3 * (maxMovieID + 1) * sizeof(dtype)); 260 | } 261 | 262 | void sampleOneIMDBC(idtype & m, idtype & p, idtype & c) { 263 | m = movieDis(mt); 264 | std::uniform_int_distribution<> personDis = std::uniform_int_distribution<>(0, moviePerson[m].size() - 1); 265 | p = moviePerson[m][personDis(mt)]; 266 | std::uniform_int_distribution<> companyDis = std::uniform_int_distribution<>(0, movieCompany[m].size() - 1); 267 | c = movieCompany[m][companyDis(mt)]; 268 | } 269 | 270 | void sampleBatchIMDBC(int sampleSize, std::vector &ms, std::vector &ps, std::vector &cs) { 271 | for (int i = 0; i < sampleSize; i++) 272 | sampleOneIMDBC(ms[i], ps[i], cs[i]); 273 | } 274 | 275 | void realAddOne(idtype m, idtype p, idtype c) { 276 | --movieWeight[m]; 277 | movieDis = std::discrete_distribution<>(movieWeight.begin(), movieWeight.end()); 278 | } 279 | 280 | void initHashMap(int Large, int linear, int cateNum) { 281 | std::stringstream dir; 282 | dir.str(""); 283 | if (linear == 0) { 284 | if (cateNum == 10) 285 | dir << DATAPATH << (Large ? "IMDBLargeC10" : "IMDBC10") << "-formycs/idMap.npy"; 286 | else 287 | dir << DATAPATH << (Large ? "IMDBLargeC5" : "IMDBC5") << "-formycs/idMap.npy"; 288 | } else 289 | dir << DATAPATH << (Large ? 
"IMDBLargeCLinearC++" : "IMDBCLinearC++") << "-formycs/idMap.npy"; 290 | 291 | mapArr = loadNpy(dir.str()); 292 | 293 | mapNum = mapArr.shape[0]; 294 | mapDim = mapArr.shape[1]; 295 | hashMapV = (idtype *) malloc(mapNum * mapDim * sizeof(idtype)); 296 | 297 | memcpy(hashMapV, mapArr.data(), 1LL * mapNum * mapDim * sizeof(idtype)); 298 | hashMap.clear(); 299 | for (idtype i = 0; i < mapNum; i++) { 300 | idtype hashV = hashMapV[i * mapDim]; 301 | idtype ID = hashMapV[i * mapDim + 1]; 302 | hashMap[hashV] = ID; 303 | } 304 | 305 | } 306 | 307 | idtype idInJoin(idtype m, idtype p, idtype c) { 308 | idtype hashValue = (m + 1) + (p + 1) * 100000LL + (c + 1) * 100000000000LL; 309 | assert(hashMap.find(hashValue) != hashMap.end()); 310 | return hashMap[hashValue]; 311 | } 312 | 313 | 314 | dtype getBenefitIMDBC(idtype m, idtype p, idtype c, bool change = true, int verbose = 1) { 315 | 316 | dtype simSum = 0; 317 | dtype thisWeight = 0.; 318 | 319 | idtype gender = genders[p]; 320 | dtype country = countries[c]; 321 | 322 | idtype mRowID = mRowMap[m]; 323 | assert((idtype) title[mRowID * titleDim] == m); 324 | 325 | idtype mSt = mRowID * titleNum; 326 | for (idtype i: movies) { 327 | idtype iRowID = mRowMap[i]; 328 | assert((idtype) title[iRowID * titleDim] == i); 329 | 330 | dtype newSim = mvSim[mSt + iRowID]; 331 | bool addCompanyDiff = false; 332 | for (auto c_: movieCompany[m]) 333 | if (countries[c_] != country) { 334 | addCompanyDiff = true; 335 | break; 336 | } 337 | if (!addCompanyDiff) 338 | newSim += companyWeight; 339 | 340 | dtype maleSim = ((gender == 1) ? personWeight : 0) + newSim; 341 | dtype femaleSim = ((gender == 0) ? personWeight : 0) + newSim; 342 | 343 | simSum += std::max(dp[i << 1 | 1], maleSim) * constmovieWeight[i << 1 | 1]; 344 | simSum += std::max(dp[i << 1], femaleSim) * constmovieWeight[i << 1]; 345 | 346 | if (maleSim > dp[i << 1 | 1] && change) { 347 | dp[i << 1 | 1] = maleSim; 348 | if (cs.nn[i << 1 | 1] != -1) { 349 | cs.weight[cs.nn[i << 1 | 1]] -= constmovieWeight[i << 1 | 1]; // 350 | } 351 | cs.nn[i << 1 | 1] = cs.weight.size(); 352 | thisWeight += constmovieWeight[i << 1 | 1]; 353 | } 354 | 355 | if (femaleSim > dp[i << 1] && change) { 356 | dp[i << 1] = femaleSim; 357 | if (cs.nn[i << 1] != -1) { 358 | cs.weight[cs.nn[i << 1]] -= constmovieWeight[i << 1]; // 359 | } 360 | cs.nn[i << 1] = cs.weight.size(); 361 | thisWeight += constmovieWeight[i << 1]; 362 | } 363 | 364 | } 365 | if (change) { 366 | cs.curSum = simSum; 367 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 368 | cs.add(idInJoin(m, p, c)); 369 | cs.weight.emplace_back(thisWeight); 370 | if (verbose) 371 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 372 | 100. * cs.weight.size() / cs.siz); 373 | realAddOne(m, p, c); 374 | } 375 | return cs.norm * std::log(1. + cs.f_norm * simSum) - cs.curSum; 376 | } 377 | 378 | std::chrono::duration> testIMDBC(dtype PROP, 379 | idtype Large = 0, 380 | dtype epsilon = 0.01, 381 | int linear = 0, 382 | int cateNum = 5, 383 | int saveWhere = 0, 384 | int verbose = 1, 385 | int assignSampleSize = 0 386 | ) { 387 | fullCS.clear(); 388 | fullCSWeight.clear(); 389 | 390 | std::chrono::duration> sim_time(0); 391 | auto st = system_clock::now(); 392 | initHashMap(Large, linear, cateNum); 393 | auto en = system_clock::now(); 394 | auto duration = duration_cast(en - st); 395 | sim_time += duration; 396 | 397 | for (int cate = 0; cate < (linear == 0 ? 
cateNum : 87); cate++) { 398 | st = system_clock::now(); 399 | if (verbose)std::cout << "############# Current category is " << cate << " ##########\n"; 400 | 401 | loadToArr(cate, Large, linear, cateNum); 402 | if (verbose)std::cout << "title num is " << titleNum << "\n"; 403 | 404 | initWeight(verbose); 405 | mallocIMDBCSim(); 406 | calIMDBCSim(); 407 | 408 | if (verbose)std::cout << "join N is " << jN << "\n"; 409 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 410 | idtype csSize = (idtype) (PROP * jN); 411 | 412 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 413 | idtype sampleEachStep = 500; 414 | 415 | en = system_clock::now(); 416 | duration = duration_cast(en - st); 417 | sim_time += duration; 418 | std::vector Ms(sampleEachStep), Ps(sampleEachStep), Cs(sampleEachStep); 419 | 420 | 421 | cs.init(2 * (maxMovieID + 1), csSize); 422 | cs.f_norm = 1. / jN; 423 | 424 | if (verbose) 425 | std::cout << "company weight is " << companyWeight << " person weight is " << personWeight << "\n"; 426 | 427 | while (csSize--) { 428 | dtype curMaxBenefit = -1; 429 | idtype curMaxBenefitID = 0; 430 | 431 | sampleBatchIMDBC(sampleEachStep, Ms, Ps, Cs); 432 | 433 | std::vector benefit_vec(sampleEachStep); 434 | #pragma omp parallel for schedule(static) 435 | for (int i = 0; i < sampleEachStep; i++) 436 | benefit_vec[i] = getBenefitIMDBC(Ms[i], Ps[i], Cs[i], false); 437 | 438 | idtype i = 0; 439 | for (auto val : benefit_vec) { 440 | if (val > curMaxBenefit) { 441 | curMaxBenefit = val; 442 | curMaxBenefitID = i; 443 | } 444 | ++i; 445 | } 446 | i = curMaxBenefitID; 447 | if (verbose)std::cout << "Benefit is " << curMaxBenefit; 448 | getBenefitIMDBC(Ms[i], Ps[i], Cs[i], true, verbose); 449 | 450 | } 451 | 452 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 453 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 454 | 455 | } 456 | printf("Total coreset size 【%d】\n", fullCS.size()); 457 | 458 | std::cout << "@### 【Similarity】 Spent " 459 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den << " seconds.\n"; 460 | 461 | assert(saveWhere==0); 462 | if (!saveWhere) { 463 | std::stringstream dir; 464 | dir.str(""); 465 | if (linear == 0) { 466 | if (cateNum == 10) 467 | dir << CSPATH << (Large ? "IMDBLargeC10" : "IMDBC10"); 468 | else 469 | dir << CSPATH << (Large ? "IMDBLargeC5" : "IMDBC5"); 470 | } else 471 | dir << CSPATH << (Large ? "IMDBLargeCLinear" : "IMDBCLinear"); 472 | dir<< "-"< 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | namespace stackn { 16 | using std::chrono::system_clock; 17 | using std::chrono::duration_cast; 18 | using std::chrono::microseconds; 19 | std::random_device rd; 20 | std::mt19937 mt(rd()); 21 | 22 | 23 | dtype userW = 1. / 14, questionW = 2. / 14, answerW = 7. 
/ 14; 24 | 25 | cnpy::NpyArray userArr; 26 | cnpy::NpyArray questionArr; 27 | cnpy::NpyArray answerArr; 28 | cnpy::NpyArray joinArr; 29 | 30 | dtype *dp; 31 | dtype *user, *question, *answer, *join; 32 | idtype userNum, userDim, questionNum, questionDim, answerNum, answerDim, joinNum, joinDim; 33 | dtype *userSim, *questionSim, *answerSim; 34 | 35 | std::vector users; 36 | std::vector fullCS; 37 | std::vector fullCSWeight; 38 | 39 | 40 | void freeStackn() { 41 | free(user); 42 | free(answer); 43 | free(question); 44 | free(join); 45 | free(userSim); 46 | free(answerSim); 47 | free(questionSim); 48 | free(dp); 49 | } 50 | 51 | cnpy::NpyArray loadNpy(std::string fileDir) { 52 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 53 | return arr; 54 | } 55 | 56 | 57 | void readStacknNpy(int cate) { 58 | std::stringstream dir; 59 | dir.str(""); 60 | 61 | dir << DATAPATH + "stackn-formycs/train-" 62 | << cate << "-user.npy"; 63 | userArr = loadNpy(dir.str()); 64 | dir.str(""); 65 | 66 | dir << DATAPATH + "stackn-formycs/train-" 67 | << cate << "-answer.npy"; 68 | answerArr = loadNpy(dir.str()); 69 | dir.str(""); 70 | 71 | 72 | dir << DATAPATH + "stackn-formycs/train-" 73 | << cate << "-question.npy"; 74 | questionArr = loadNpy(dir.str()); 75 | dir.str(""); 76 | 77 | 78 | dir << DATAPATH + "stackn-formycs/train-" 79 | << cate << "-joined.npy"; 80 | joinArr = loadNpy(dir.str()); 81 | dir.str(""); 82 | } 83 | 84 | void mallocStacknArray() { 85 | 86 | 87 | userNum = userArr.shape[0]; 88 | userDim = joinArr.shape[1]; 89 | user = (dtype *) malloc(userNum * userDim * sizeof(dtype)); 90 | 91 | 92 | answerNum = answerArr.shape[0]; 93 | answerDim = answerArr.shape[1]; 94 | answer = (dtype *) malloc(answerNum * answerDim * sizeof(dtype)); 95 | 96 | 97 | questionNum = questionArr.shape[0]; 98 | questionDim = questionArr.shape[1]; 99 | question = (dtype *) malloc(questionNum * questionDim * sizeof(dtype)); 100 | 101 | 102 | joinNum = joinArr.shape[0]; 103 | joinDim = joinArr.shape[1]; 104 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 105 | } 106 | 107 | void loadToArr(int cate) { 108 | 109 | 110 | readStacknNpy(cate); 111 | mallocStacknArray(); 112 | 113 | memcpy(user, userArr.data(), 1LL * userNum * userDim * sizeof(dtype)); 114 | memcpy(answer, answerArr.data(), 1LL * answerNum * answerDim * sizeof(dtype)); 115 | memcpy(question, questionArr.data(), 1LL * questionNum * questionDim * sizeof(dtype)); 116 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 117 | 118 | 119 | 120 | for (int i = 0; i < joinNum; i++) 121 | join[i * joinDim + 5] = i; 122 | } 123 | 124 | void mallocStacknSim() { 125 | 126 | 127 | userSim = (dtype *) malloc(userNum * userNum * sizeof(dtype)); 128 | answerSim = (dtype *) malloc(answerNum * answerNum * sizeof(dtype)); 129 | questionSim = (dtype *) malloc(questionNum * questionNum * sizeof(dtype)); 130 | } 131 | 132 | void calStacknSim() { 133 | 134 | initSim(userSim, user, userNum, userDim, 1); 135 | 136 | initSim(answerSim, answer, answerNum, answerDim, 3); 137 | 138 | initSim(questionSim, question, questionNum, questionDim, 2); 139 | 140 | 141 | } 142 | 143 | std::vector joinIDs; 144 | 145 | void initWeight() { 146 | 147 | joinIDs.clear(); 148 | joinIDs.reserve(answerNum); 149 | dp = (dtype *) malloc(answerNum * sizeof(dtype)); 150 | memset(dp, 0, answerNum * sizeof(dtype)); 151 | for (int i = 0; i < answerNum; i++) 152 | joinIDs.emplace_back(i); 153 | } 154 | 155 | 156 | void sampleOneStackn(idtype & uID, 157 | idtype & ID, 158 | idtype & qID, 159 | 
idtype & rowID, 160 | idtype & samplejoinID) { 161 | 162 | int id = joinIDs[mt() % joinIDs.size()]; 163 | 164 | 165 | 166 | idtype idx_st = id * joinDim; 167 | 168 | uID = join[idx_st]; 169 | qID = join[idx_st + 1]; 170 | ID = join[idx_st + 2]; 171 | rowID = join[idx_st + 3]; 172 | samplejoinID = join[idx_st + 5]; 173 | } 174 | 175 | 176 | void sampleBatchStackn(int sampleSize, 177 | std::vector &uIDs, 178 | std::vector &IDs, 179 | std::vector &qIDs, 180 | std::vector &rowIDs, 181 | std::vector &joinIDs) { 182 | uIDs.resize(sampleSize); 183 | IDs.resize(sampleSize); 184 | qIDs.resize(sampleSize); 185 | rowIDs.resize(sampleSize); 186 | joinIDs.resize(sampleSize); 187 | 188 | for (int i = 0; i < sampleSize; i++) 189 | sampleOneStackn(uIDs[i], 190 | IDs[i], 191 | qIDs[i], 192 | rowIDs[i], 193 | joinIDs[i]); 194 | } 195 | 196 | void realAddOne(idtype joinID) { 197 | 198 | for (int i = 0; i < joinIDs.size(); i++) { 199 | if (joinIDs[i] == joinID) { 200 | std::swap(joinIDs[joinIDs.size() - 1], joinIDs[i]); 201 | joinIDs.pop_back(); 202 | break; 203 | } 204 | } 205 | } 206 | 207 | 208 | dtype getBenefitStackn(idtype uID, 209 | idtype ID, 210 | idtype qID, 211 | idtype rowID, 212 | idtype joinID, 213 | bool change = false, 214 | int verbose = 1) { 215 | 216 | 217 | dtype simSum = 0; 218 | dtype thisWeight = 0.; 219 | 220 | idtype sim_loc_user = uID * userNum; 221 | idtype sim_loc_answer = ID * answerNum; 222 | idtype sim_loc_question = qID * questionNum; 223 | 224 | 225 | 226 | idtype idx_loc = 0; 227 | for (int i = 0; i < answerNum; i++, idx_loc += answerDim) { 228 | idtype _id = answer[idx_loc]; 229 | idtype _uID = answer[idx_loc + 1]; 230 | idtype _qID = answer[idx_loc + 2]; 231 | 232 | dtype tempDP = answerSim[sim_loc_answer + _id] * answerW; 233 | tempDP += questionSim[sim_loc_question + _qID] * questionW; 234 | tempDP += userSim[sim_loc_user + _uID] * userW; 235 | 236 | 237 | if (tempDP > dp[i] && change) { 238 | dp[i] = tempDP; 239 | if (cs.nn[i] != -1) { 240 | cs.weight[cs.nn[i]] -= 1; 241 | } 242 | cs.nn[i] = cs.weight.size(); 243 | thisWeight += 1; 244 | } 245 | simSum += std::max(tempDP, dp[i]); 246 | } 247 | 248 | if (change) { 249 | cs.curSum = simSum; 250 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 251 | 252 | cs.add(rowID); 253 | cs.weight.emplace_back(thisWeight); 254 | if (verbose) 255 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 256 | 100. * cs.weight.size() / cs.siz); 257 | realAddOne(joinID); 258 | } 259 | 260 | return cs.norm * std::log(1. 
+ cs.f_norm * simSum) - cs.curSum; 261 | } 262 | 263 | 264 | std::chrono::duration> testStackn(dtype PROP, 265 | dtype epsilon = 0.01, 266 | int saveWhere = 0, 267 | int verbose = 1 268 | ) { 269 | fullCS.clear(); 270 | fullCSWeight.clear(); 271 | 272 | std::chrono::duration> sim_time(0); 273 | 274 | 275 | std::vector uIDs; 276 | std::vector IDs; 277 | std::vector qIDs; 278 | std::vector rowIDs; 279 | std::vector samplejoinIDs; 280 | 281 | 282 | for (int cate = 0; cate <= 18305; cate++) { 283 | auto st = system_clock::now(); 284 | if (verbose) 285 | std::cout << "############# Current category is " << cate << " ##########\n"; 286 | 287 | 288 | loadToArr(cate); 289 | initWeight(); 290 | 291 | mallocStacknSim(); 292 | calStacknSim(); 293 | 294 | 295 | assert(joinNum == answerNum); 296 | 297 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 298 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 299 | 300 | idtype csSize = (idtype) (PROP * joinNum + 0.5); 301 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 302 | 303 | 304 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 305 | 306 | 307 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 308 | if (ano < sampleEachStep) 309 | sampleEachStep = ano; 310 | 311 | cs.init(joinNum, csSize); 312 | cs.f_norm = 1. / joinNum; 313 | 314 | auto en = system_clock::now(); 315 | auto duration = duration_cast(en - st); 316 | sim_time += duration; 317 | 318 | while (csSize--) { 319 | dtype curMaxBenefit = -1; 320 | idtype curMaxBenefitID = 0; 321 | 322 | 323 | std::vector uIDs; 324 | std::vector IDs; 325 | std::vector qIDs; 326 | std::vector rowIDs; 327 | std::vector samplejoinIDs; 328 | 329 | sampleBatchStackn(sampleEachStep, uIDs, IDs, qIDs, rowIDs, samplejoinIDs); 330 | std::vector benefit_vec(sampleEachStep); 331 | 332 | #pragma omp parallel for schedule(static) 333 | for (int i = 0; i < sampleEachStep; i++) 334 | benefit_vec[i] = getBenefitStackn(uIDs[i], IDs[i], qIDs[i], rowIDs[i], samplejoinIDs[i], 0, 0); 335 | idtype i = 0; 336 | for (auto val : benefit_vec) { 337 | if (val > curMaxBenefit) { 338 | curMaxBenefit = val; 339 | curMaxBenefitID = i; 340 | } 341 | ++i; 342 | } 343 | i = curMaxBenefitID; 344 | 345 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 346 | benefit_vec[i] = getBenefitStackn(uIDs[i], IDs[i], qIDs[i], rowIDs[i], samplejoinIDs[i], 1, 0); 347 | } 348 | 349 | freeAll(); 350 | freeStackn(); 351 | 352 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 353 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 354 | 355 | 356 | if (verbose)std::cout << "Finished!\n"; 357 | } 358 | 359 | if (verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 360 | 361 | if (verbose) 362 | std::cout << "@### 【Similarity】 Spent " 363 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den 364 | << " seconds.\n"; 365 | 366 | assert(!saveWhere); 367 | if (!saveWhere) { 368 | std::stringstream dir; 369 | dir.str(""); 370 | dir< 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | namespace taxi { 17 | using std::chrono::system_clock; 18 | using std::chrono::duration_cast; 19 | using std::chrono::microseconds; 20 | std::random_device rd; 21 | std::mt19937 mt(rd()); 22 | 23 | dtype taxiW = 1. / 14, t5W = 2. / 14, t11W = 7. / 14, t16W = 2. / 14, t20W = 2. 
/ 14; 24 | 25 | 26 | cnpy::NpyArray taxiArr; 27 | cnpy::NpyArray t5Arr; 28 | cnpy::NpyArray t11Arr; 29 | cnpy::NpyArray t16Arr; 30 | cnpy::NpyArray t20Arr; 31 | cnpy::NpyArray joinArr; 32 | 33 | dtype *dp; 34 | 35 | 36 | dtype *taxi, *t5, *t11, *t16, *t20, *join; 37 | 38 | idtype taxiNum, taxiDim, t5Num, t5Dim, t11Num, t11Dim, t16Num, t16Dim, t20Num, t20Dim, joinNum, joinDim; 39 | 40 | dtype *taxiSim, *t5Sim, *t11Sim, *t16Sim, *t20Sim; 41 | 42 | 43 | 44 | std::vector movies; 45 | 46 | 47 | idtype *f642Weight; 48 | 49 | 50 | 51 | cnpy::NpyArray loadNpy(std::string fileDir); 52 | void 53 | readTaxiNpy(int cate); 54 | void 55 | mallocTaxiArray(); 56 | void 57 | loadToArr(int cate); 58 | 59 | void 60 | readTaxiNpyGlobal(); 61 | void 62 | mallocTaxiArrayGlobal(); 63 | void 64 | loadToArrGlobal(); 65 | 66 | 67 | 68 | void mallocTaxiSim(); 69 | void calTaxiSim(); 70 | 71 | void mallocTaxiSimGlobal(); 72 | void calTaxiSimGlobal(); 73 | 74 | 75 | void initWeight(); 76 | void initWeightGlobal(); 77 | 78 | 79 | 80 | void sampleOneTaxi(idtype &ID5, idtype &ID11, idtype &ID16, idtype &ID20, idtype &f642, idtype &rowID, 81 | idtype &joinID); 82 | void sampleBatchTaxi(int sampleSize, 83 | std::vector &ID5s, 84 | std::vector &ID11s, 85 | std::vector &ID16s, 86 | std::vector &ID20s, 87 | std::vector &f642s, 88 | std::vector &rowIDs, 89 | std::vector &joinIDs); 90 | 91 | void starDP(idtype ID5, 92 | idtype ID11, 93 | idtype ID16, 94 | idtype ID20, 95 | idtype f642); 96 | 97 | 98 | void realAddOne(idtype joinID); 99 | 100 | 101 | 102 | 103 | dtype getBenefitTaxi(idtype ID5, 104 | idtype ID11, 105 | idtype ID16, 106 | idtype ID20, 107 | idtype f642, 108 | idtype rowID, 109 | idtype joinID, 110 | bool change, 111 | int verbose); 112 | 113 | 114 | std::chrono::duration> testTaxi(dtype PROP, 115 | dtype epsilon, 116 | int saveWhere, 117 | int verbose 118 | ); 119 | 120 | std::vector fullCS; 121 | std::vector fullCSWeight; 122 | 123 | 124 | cnpy::NpyArray loadNpy(std::string fileDir) { 125 | 126 | cnpy::NpyArray arr = cnpy::npy_load(fileDir); 127 | return arr; 128 | } 129 | 130 | void readTaxiNpyGlobal() { 131 | std::stringstream dir; 132 | dir.str(""); 133 | 134 | dir << DATAPATH + "taxi-formycs/train-taxi.npy"; 135 | taxiArr = loadNpy(dir.str()); 136 | dir.str(""); 137 | 138 | dir << DATAPATH + "taxi-formycs/train-t5.npy"; 139 | t5Arr = loadNpy(dir.str()); 140 | dir.str(""); 141 | 142 | dir << DATAPATH + "taxi-formycs/train-t16.npy"; 143 | t16Arr = loadNpy(dir.str()); 144 | dir.str(""); 145 | 146 | dir << DATAPATH + "taxi-formycs/train-t20.npy"; 147 | t20Arr = loadNpy(dir.str()); 148 | dir.str(""); 149 | } 150 | 151 | void mallocTaxiArrayGlobal() { 152 | 153 | 154 | taxiNum = taxiArr.shape[0]; 155 | taxiDim = taxiArr.shape[1]; 156 | taxi = (dtype *) malloc(taxiNum * taxiDim * sizeof(dtype)); 157 | 158 | 159 | t5Num = t5Arr.shape[0]; 160 | t5Dim = t5Arr.shape[1]; 161 | t5 = (dtype *) malloc(t5Num * t5Dim * sizeof(dtype)); 162 | 163 | 164 | t16Num = t16Arr.shape[0]; 165 | t16Dim = t16Arr.shape[1]; 166 | t16 = (dtype *) malloc(t16Num * t16Dim * sizeof(dtype)); 167 | 168 | 169 | t20Num = t20Arr.shape[0]; 170 | t20Dim = t20Arr.shape[1]; 171 | t20 = (dtype *) malloc(t20Num * t20Dim * sizeof(dtype)); 172 | } 173 | 174 | void loadToArrGlobal() { 175 | 176 | readTaxiNpyGlobal(); 177 | mallocTaxiArrayGlobal(); 178 | memcpy(taxi, taxiArr.data(), 1LL * taxiNum * taxiDim * sizeof(dtype)); 179 | memcpy(t5, t5Arr.data(), 1LL * t5Num * t5Dim * sizeof(dtype)); 180 | memcpy(t16, t16Arr.data(), 1LL * t16Num * t16Dim * 
sizeof(dtype)); 181 | memcpy(t20, t20Arr.data(), 1LL * t20Num * t20Dim * sizeof(dtype)); 182 | } 183 | 184 | 185 | void readTaxiNpy(int cate) { 186 | std::stringstream dir; 187 | dir.str(""); 188 | 189 | 190 | dir << DATAPATH + "taxi-formycs/train-" 191 | << cate << "-joined.npy"; 192 | joinArr = loadNpy(dir.str()); 193 | dir.str(""); 194 | 195 | dir << DATAPATH + "taxi-formycs/train-" 196 | << cate << "-t11.npy"; 197 | t11Arr = loadNpy(dir.str()); 198 | dir.str(""); 199 | 200 | } 201 | 202 | void mallocTaxiArray() { 203 | 204 | 205 | joinNum = joinArr.shape[0]; 206 | joinDim = joinArr.shape[1]; 207 | join = (dtype *) malloc(joinNum * joinDim * sizeof(dtype)); 208 | 209 | 210 | t11Num = t11Arr.shape[0]; 211 | t11Dim = t11Arr.shape[1]; 212 | t11 = (dtype *) malloc(t11Num * t11Dim * sizeof(dtype)); 213 | } 214 | 215 | void loadToArr(int cate) { 216 | 217 | 218 | readTaxiNpy(cate); 219 | mallocTaxiArray(); 220 | memcpy(join, joinArr.data(), 1LL * joinNum * joinDim * sizeof(dtype)); 221 | memcpy(t11, t11Arr.data(), 1LL * t11Num * t11Dim * sizeof(dtype)); 222 | } 223 | 224 | 225 | void mallocTaxiSimGlobal() { 226 | 227 | 228 | taxiSim = (dtype *) malloc(taxiNum * taxiNum * sizeof(dtype)); 229 | t5Sim = (dtype *) malloc(t5Num * t5Num * sizeof(dtype)); 230 | t16Sim = (dtype *) malloc(t16Num * t16Num * sizeof(dtype)); 231 | t20Sim = (dtype *) malloc(t20Num * t20Num * sizeof(dtype)); 232 | } 233 | 234 | void calTaxiSimGlobal() { 235 | 236 | initSim(taxiSim, taxi, taxiNum, taxiDim, 1, taxiDim - 1); 237 | 238 | initSim(t5Sim, t5, t5Num, t5Dim, 2); 239 | 240 | initSim(t16Sim, t16, t16Num, t16Dim, 2); 241 | 242 | initSim(t20Sim, t20, t20Num, t20Dim, 2); 243 | } 244 | 245 | void mallocTaxiSim() { 246 | 247 | t11Sim = (dtype *) malloc(t11Num * t11Num * sizeof(dtype)); 248 | } 249 | 250 | void calTaxiSim() { 251 | 252 | initSim(t11Sim, t11, t11Num, t11Dim, 2); 253 | } 254 | 255 | dtype *tp, *tp2; 256 | std::vector f642s; 257 | 258 | void initWeightGlobal() { 259 | 260 | 261 | 262 | tp = (dtype *) malloc((taxiNum + 1) * sizeof(dtype)); 263 | tp2 = (dtype *) malloc((taxiNum + 1) * sizeof(dtype)); 264 | 265 | f642s.clear(); 266 | f642Weight = (idtype *) malloc(500 * sizeof(idtype)); 267 | 268 | 269 | 270 | for (int i = 0; i < taxiNum; i++) { 271 | idtype key = taxi[i * taxiDim]; 272 | 273 | f642s.emplace_back(key); 274 | f642Weight[key] = 1; 275 | int cnt = 0; 276 | for (int j = 0; j < t5Num; j++) { 277 | int loc = j * t5Dim + 1; 278 | if (key == t5[loc])++cnt; 279 | } 280 | f642Weight[key] *= cnt; 281 | 282 | cnt = 0; 283 | for (int j = 0; j < t16Num; j++) { 284 | int loc = j * t16Dim + 1; 285 | if (key == t16[loc])++cnt; 286 | } 287 | f642Weight[key] *= cnt; 288 | 289 | cnt = 0; 290 | for (int j = 0; j < t20Num; j++) { 291 | int loc = j * t20Dim + 1; 292 | if (key == t20[loc])++cnt; 293 | } 294 | f642Weight[key] *= cnt; 295 | } 296 | } 297 | 298 | 299 | std::vector joinIDs; 300 | 301 | void initWeight() { 302 | 303 | joinIDs.clear(); 304 | joinIDs.reserve(joinNum); 305 | dp = (dtype *) malloc(t11Num * sizeof(dtype)); 306 | memset(dp, 0, t11Num * sizeof(dtype)); 307 | for (int i = 0; i < joinNum; i++) 308 | joinIDs.emplace_back(i); 309 | } 310 | 311 | void sampleOneTaxi(idtype &ID5, 312 | idtype &ID11, 313 | idtype &ID16, 314 | idtype &ID20, 315 | idtype &f642, 316 | idtype &rowID, 317 | idtype &joinID) { 318 | 319 | int id = joinIDs[mt() % joinIDs.size()]; 320 | 321 | 322 | idtype idx_st = id * joinDim; 323 | 324 | f642 = join[idx_st]; 325 | ID5 = join[idx_st + 1]; 326 | ID11 = join[idx_st + 2]; 327 | ID16 
= join[idx_st + 3]; 328 | ID20 = join[idx_st + 4]; 329 | 330 | rowID = join[idx_st + joinDim - 1]; 331 | joinID = id; 332 | } 333 | 334 | void sampleBatchTaxi(int sampleSize, 335 | std::vector &ID5s, 336 | std::vector &ID11s, 337 | std::vector &ID16s, 338 | std::vector &ID20s, 339 | std::vector &f642s, 340 | std::vector &rowIDs, 341 | std::vector &joinIDs) { 342 | ID5s.resize(sampleSize); 343 | ID11s.resize(sampleSize); 344 | ID16s.resize(sampleSize); 345 | ID20s.resize(sampleSize); 346 | f642s.resize(sampleSize); 347 | rowIDs.resize(sampleSize); 348 | joinIDs.resize(sampleSize); 349 | 350 | for (int i = 0; i < sampleSize; i++) 351 | sampleOneTaxi(ID5s[i], 352 | ID11s[i], 353 | ID16s[i], 354 | ID20s[i], 355 | f642s[i], 356 | rowIDs[i], 357 | joinIDs[i]); 358 | } 359 | 360 | void realAddOne(idtype joinID) { 361 | 362 | for (int i = 0; i < joinIDs.size(); i++) { 363 | if (joinIDs[i] == joinID) { 364 | std::swap(joinIDs[joinIDs.size() - 1], joinIDs[i]); 365 | joinIDs.pop_back(); 366 | break; 367 | } 368 | } 369 | } 370 | 371 | 372 | void starDP(idtype ID5, 373 | idtype ID11, 374 | idtype ID16, 375 | idtype ID20, 376 | idtype f642) { 377 | 378 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 379 | idtype simloc = ID5 * t5Num; 380 | for (int i = 0; i < t5Num; i++) { 381 | idtype this_f642 = t5[i * t5Dim + 1]; 382 | dtype this_sim = t5Sim[simloc + i]; 383 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 384 | } 385 | for (int i = 0; i < taxiNum; i++) 386 | tp[i] += tp2[i] * t5W; 387 | 388 | 389 | 390 | simloc = ID16 * t16Num; 391 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 392 | for (int i = 0; i < t16Num; i++) { 393 | idtype this_f642 = t16[i * t16Dim + 1]; 394 | dtype this_sim = t16Sim[simloc + i]; 395 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 396 | } 397 | for (int i = 0; i < taxiNum; i++) 398 | tp[i] += tp2[i] * t16W; 399 | 400 | 401 | simloc = ID20 * t20Num; 402 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 403 | for (int i = 0; i < t20Num; i++) { 404 | idtype this_f642 = t20[i * t20Dim + 1]; 405 | dtype this_sim = t20Sim[simloc + i]; 406 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 407 | } 408 | for (int i = 0; i < taxiNum; i++) 409 | tp[i] += tp2[i] * t20W; 410 | 411 | 412 | 413 | simloc = f642 * taxiNum; 414 | memset(tp2, 0x3f, sizeof(tp2) * taxiNum); 415 | 416 | for (int i = 0; i < taxiNum; i++) { 417 | idtype this_f642 = taxi[i * taxiDim]; 418 | dtype this_sim = taxiSim[simloc + i]; 419 | tp2[this_f642] = std::min(tp2[this_f642], this_sim); 420 | } 421 | for (int i = 0; i < taxiNum; i++) 422 | tp[i] += tp2[i] * taxiW; 423 | 424 | } 425 | 426 | dtype getBenefitTaxi(idtype ID5, 427 | idtype ID11, 428 | idtype ID16, 429 | idtype ID20, 430 | idtype f642, 431 | idtype rowID, 432 | idtype joinID, 433 | bool change = false, 434 | int verbose = 1) { 435 | memset(tp, 0, sizeof(dtype) * taxiNum); 436 | starDP(ID5, ID11, ID16, ID20, f642); 437 | 438 | dtype simSum = 0; 439 | dtype thisWeight = 0.; 440 | 441 | 442 | idtype sim_loc = ID11 * t11Num; 443 | 444 | for (int i = 0; i < t11Num; i++) { 445 | idtype tmp_f642 = t11[i * t11Dim + 1]; 446 | dtype tmp_dp = tp[tmp_f642]; 447 | dtype t11_sim = t11Sim[sim_loc + i]; 448 | tmp_dp += t11_sim * t11W; 449 | if (tmp_dp > dp[i] && change) { 450 | dp[i] = tmp_dp; 451 | if (cs.nn[i] != -1) { 452 | cs.weight[cs.nn[i]] -= f642Weight[tmp_f642]; 453 | } 454 | cs.nn[i] = cs.weight.size(); 455 | thisWeight += f642Weight[tmp_f642]; 456 | } 457 | simSum += std::max(tmp_dp, dp[i]) * f642Weight[tmp_f642]; 458 | } 459 | 460 | if (change) { 461 | 
cs.curSum = simSum; 462 | cs.curSum = cs.norm * std::log(1. + cs.f_norm * cs.curSum); 463 | 464 | cs.add(rowID); 465 | cs.weight.emplace_back(thisWeight); 466 | if (verbose) 467 | printf(" add this weight is %.2f Current progress 【%.2f %%】\n", thisWeight, 468 | 100. * cs.weight.size() / cs.siz); 469 | realAddOne(joinID); 470 | } 471 | 472 | return cs.norm * std::log(1. + cs.f_norm * simSum) - cs.curSum; 473 | } 474 | 475 | 476 | std::chrono::duration> testTaxi(dtype PROP, 477 | dtype epsilon = 0.01, 478 | int saveWhere = 0, 479 | int verbose = 1 480 | ) { 481 | fullCS.clear(); 482 | fullCSWeight.clear(); 483 | 484 | std::chrono::duration> sim_time(0); 485 | 486 | 487 | loadToArrGlobal(); 488 | mallocTaxiSimGlobal(); 489 | calTaxiSimGlobal(); 490 | initWeightGlobal(); 491 | 492 | std::vector ID5s; 493 | std::vector ID11s; 494 | std::vector ID16s; 495 | std::vector ID20s; 496 | std::vector f642s; 497 | std::vector rowIDs; 498 | std::vector samplejoinIDs; 499 | 500 | 501 | for (int cate = 0; cate <= 93; cate++) { 502 | auto st = system_clock::now(); 503 | if (verbose)std::cout << "############# Current category is " << cate << " ##########\n"; 504 | 505 | 506 | loadToArr(cate); 507 | initWeight(); 508 | 509 | mallocTaxiSim(); 510 | calTaxiSim(); 511 | 512 | if (verbose)std::cout << "join N is " << joinNum << "\n"; 513 | if (verbose)std::cout << "PROP is " << PROP << "\n"; 514 | 515 | idtype csSize = (idtype) (PROP * joinNum); 516 | if (verbose)std::cout << "This cate should have [" << csSize << "]\n"; 517 | 518 | 519 | idtype sampleEachStep = 1. / PROP * std::log(1. / epsilon) + 0.5; 520 | 521 | 522 | idtype ano = 1. / PROP * std::log(1. / epsilon) + 0.5; 523 | if (ano < sampleEachStep) 524 | sampleEachStep = ano; 525 | 526 | cs.init(t11Num, csSize); 527 | cs.f_norm = 1. 
/ joinNum; 528 | 529 | auto en = system_clock::now(); 530 | auto duration = duration_cast(en - st); 531 | sim_time += duration; 532 | 533 | while (csSize--) { 534 | dtype curMaxBenefit = -1; 535 | idtype curMaxBenefitID = 0; 536 | 537 | sampleBatchTaxi(sampleEachStep, ID5s, ID11s, ID16s, ID20s, f642s, rowIDs, samplejoinIDs); 538 | std::vector benefit_vec(sampleEachStep); 539 | 540 | #pragma omp parallel for schedule(static) 541 | for (int i = 0; i < sampleEachStep; i++) 542 | benefit_vec[i] = getBenefitTaxi(ID5s[i], ID11s[i], ID16s[i], ID20s[i], f642s[i], rowIDs[i], 543 | samplejoinIDs[i], 0, 0); 544 | idtype i = 0; 545 | for (auto val : benefit_vec) { 546 | if (val > curMaxBenefit) { 547 | curMaxBenefit = val; 548 | curMaxBenefitID = i; 549 | } 550 | ++i; 551 | } 552 | i = curMaxBenefitID; 553 | 554 | if (verbose)std::cout << "Benefit is " << curMaxBenefit<<"\n"; 555 | getBenefitTaxi(ID5s[i], ID11s[i], ID16s[i], ID20s[i], f642s[i], rowIDs[i], samplejoinIDs[i], true, 0); 556 | } 557 | 558 | fullCS.insert(fullCS.end(), cs.coresetAll.begin(), cs.coresetAll.end()); 559 | fullCSWeight.insert(fullCSWeight.end(), cs.weight.begin(), cs.weight.end()); 560 | 561 | 562 | if (verbose)std::cout << "Finished!\n"; 563 | } 564 | 565 | if (verbose)printf("Total coreset size 【%d】\n", fullCS.size()); 566 | 567 | if (verbose) 568 | std::cout << "@### 【Similarity】 Spent " 569 | << double(sim_time.count()) * microseconds::period::num / microseconds::period::den 570 | << " seconds.\n"; 571 | 572 | assert(!saveWhere); 573 | if (!saveWhere) { 574 | std::stringstream dir; 575 | dir.str(""); 576 | dir< 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | char * cur_time(){ 14 | time_t now = time(0); 15 | char* dt = ctime(&now); 16 | dt[strlen(dt) -1 ]='\0'; 17 | return dt; 18 | } 19 | 20 | void test(dtype *z){ 21 | for(idtype i = 0; i < std::min((idtype)20, n * d); i++) 22 | std::cout << z[i] <<" "; 23 | puts(""); 24 | return; 25 | } 26 | 27 | inline dtype distance(idtype idx, idtype idy){ 28 | 29 | dtype ret = 0; 30 | idx = Map[idx]; 31 | idy = Map[idy]; 32 | 33 | for(idtype i = 0; i < d; i++) 34 | ret += (X[idx * d + i] - X[idy * d + i]) * (X[idx * d + i] - X[idy * d + i]); 35 | return ret; 36 | } 37 | 38 | inline dtype tryAdd(dtype* cur, idtype element){ 39 | 40 | dtype sim_sum = 0; 41 | #pragma omp parallel for schedule (static) reduction(+:sim_sum) 42 | for(idtype i = 0; i < n; i++) 43 | sim_sum += std::max(cur[i], similarity[element * n + i]); 44 | 45 | return norm * std::log(1. + f_norm * sim_sum) - curSum; 46 | } 47 | inline void realAdd(dtype* cur, idtype element){ 48 | 49 | curSum = 0; 50 | for(idtype i = 0; i < n; i++){ 51 | if(similarity[element * n + i] > cur[i]) { 52 | cur[i] = similarity[element * n + i]; 53 | if(nn[i]!=-1) { 54 | 55 | --weight[nn[i]]; 56 | } 57 | nn[i] = element; 58 | ++weight[element]; 59 | } 60 | curSum += cur[i]; 61 | } 62 | curSum = norm * std::log(1. 
+ f_norm * curSum); 63 | coreset.emplace_back(element); 64 | ++cSize; 65 | return; 66 | } 67 | void initSimilarity(int verbose=1){ 68 | 69 | 70 | dtype max_similarity = 0; 71 | 72 | if(verbose)printf("Start to cal similarity %s\n",cur_time()); 73 | for(idtype i = 0; i < n; i++) 74 | #pragma omp parallel for schedule (static) 75 | for(idtype j = i + 1; j < n; j++){ 76 | similarity[i * n + j] = -distance(i, j); 77 | 78 | } 79 | if(verbose)printf("Finish to cal similarity %s\n",cur_time()); 80 | for(idtype i = 0; i < n; i++) 81 | for(idtype j = i + 1; j < n; j++){ 82 | 83 | max_similarity = std::max(max_similarity, -similarity[i * n + j]); 84 | } 85 | if(verbose)std::cout << "max similarity is " << max_similarity << "\n"; 86 | #pragma omp parallel for schedule (guided) 87 | for(idtype i = 0; i < n; i++) 88 | for(idtype j = i + 1; j < n; j++) { 89 | similarity[i * n + j] = (max_similarity + similarity[i * n + j]) / max_similarity; 90 | } 91 | 92 | for(idtype i = 0; i < n; i++) 93 | #pragma omp parallel for schedule (static) 94 | for(idtype j = 0; j < i; j++) 95 | similarity[i * n + j] = similarity[j * n + i]; 96 | for(idtype i = 0; i < n; i++) 97 | similarity[i * n + i] = 1; 98 | 99 | return; 100 | } 101 | void initPQ(){ 102 | 103 | while(!pq.empty())pq.pop(); 104 | for(idtype i = 1; i < n; i++) 105 | pq.push(std::make_pair(tryAdd(maxSim, i),i)); 106 | return; 107 | } 108 | 109 | 110 | void initCategories(int verbose=1){ 111 | 112 | N = n; 113 | cateNum.clear(); 114 | 115 | 116 | 117 | for(idtype i = 0; i < N; i++) { 118 | 119 | if(Y[i]==-1)Y[i]=0; 120 | 121 | ++cateNum[Y[i]]; 122 | } 123 | 124 | cateCnt = cateNum.size(); 125 | std::cout<<"cateCnt is "< lazyVec; 149 | bool cmpLazyIMDB(int i, int j){ 150 | return lazyVec[i] > lazyVec[j]; 151 | } 152 | 153 | bool cmpLazy(int i, int j){ 154 | return lazy[i] > lazy[j]; 155 | } 156 | 157 | 158 | inline dtype Dist(idtype u1, idtype u2, dtype * data, idtype dim, idtype st_id = 1, idtype end_id=-1){ 159 | 160 | dtype ret = 0.; 161 | if(end_id == -1) 162 | end_id = dim; 163 | idtype u1Loc = u1 * dim; 164 | idtype u2Loc = u2 * dim; 165 | for(idtype i = st_id; i < end_id; i++) 166 | ret += (data[u1Loc + i] - data[u2Loc + i]) * (data[u1Loc + i] - data[u2Loc + i]); 167 | return ret; 168 | } 169 | 170 | void initSim(dtype * sim, dtype * data, idtype num, idtype dim, idtype st_id=1,idtype end_id=-1){ 171 | 172 | if(end_id==-1) 173 | end_id = dim; 174 | dtype maxDis = 0.; 175 | #pragma omp parallel for schedule(guided) 176 | for(idtype i = 0; i < num; i++) { 177 | idtype now = i * num; 178 | for (idtype j = i + 1; j < num; j++) { 179 | sim[now + j] = -Dist(i, j, data, dim, st_id, end_id); 180 | } 181 | } 182 | idtype now = 0; 183 | for(idtype i = 0;i < num;i++) { 184 | for (idtype j = i + 1; j < num; j++) 185 | maxDis = std::max(maxDis, -sim[now+ j]); 186 | now += num; 187 | } 188 | 189 | 190 | #pragma omp parallel for schedule(guided) 191 | for(idtype i = 0; i < num; i++) { 192 | idtype now = i * num; 193 | for (idtype j = i + 1; j < num; j++) 194 | sim[now + j] = (maxDis + sim[now + j]) / maxDis; 195 | } 196 | 197 | #pragma omp parallel for schedule(guided) 198 | for(idtype i = 0; i < n; i++) 199 | 200 | for(idtype j = 0; j < i; j++) 201 | sim[i * num + j] = sim[j * num + i]; 202 | 203 | for(idtype i = 0; i < n; i++) 204 | sim[i * n + i] = 1; 205 | return; 206 | 207 | 208 | 209 | } 210 | 211 | 212 | struct CS{ 213 | int n; 214 | int siz; 215 | dtype curSum = 0.; 216 | 217 | dtype norm = 1./std::log(2.); 218 | dtype f_norm; 219 | 220 | idtype* nn; 221 | 
std::vector coresetAll; 222 | std::vector weight; 223 | 224 | void init(int n_, int size_){ 225 | n = n_; 226 | curSum = 0; 227 | siz = size_; 228 | coresetAll.clear(); 229 | weight.clear(); 230 | 231 | coresetAll.reserve(size_); 232 | weight.reserve(size_); 233 | 234 | nn = (idtype *)malloc(n * sizeof(idtype)); 235 | memset(nn, -1, n * sizeof(idtype)); 236 | f_norm = 1./(2. * n); 237 | } 238 | void add(idtype id_){ 239 | coresetAll.emplace_back(id_); 240 | } 241 | }cs; 242 | 243 | 244 | #endif 245 | 246 | -------------------------------------------------------------------------------- /linear-universal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from warnings import simplefilter 4 | simplefilter(action='ignore', category=FutureWarning) 5 | np.seterr(all='ignore') 6 | from MLModel.optimizer import * 7 | from MLModel.LoadData import * 8 | from MLModel.MLmodel.linearRegression import * 9 | from MLModel.paramRange import * 10 | from MLModel.LoadCoreset import * 11 | from MLModel.hidden import * 12 | 13 | def test(method='sgd', data='movieLen1M', exp_decay=1, subset_size=1., greedy=1, shuffle=0, g_cnt=-1., 14 | b_cnt=-1., num_runs=10, metric='', reg=1e-5, rand='', ne=-1, from_all=0,coreset_from='scratch', batch=1, sampleSize=0): 15 | train_data, train_target, val_data, val_target, test_data, test_target = load_dataset(data, regression=True) 16 | print("Dataset Loaded") 17 | 18 | g_range, b_range = get_param_range(subset_size, exp_decay, method, data) 19 | best_f_list = [] 20 | best_MAE_list = [] 21 | best_MSE_list = [] 22 | best_MSLE_list = [] 23 | 24 | train_time_list = [] 25 | 26 | for itr in range(num_runs): 27 | f_best, acc_best, b_f, g_f, b_a, g_a = 1e10, 0, 0, 0, 0, 0 28 | 29 | print("Cur itr is ", itr) 30 | if ne == -1: 31 | ne = 20 + int(np.ceil((1. 
/ subset_size) * 5)) + 5 if subset_size < 1 else 20 32 | else: 33 | rand += f'_e{ne}' 34 | if ne > 100: 35 | ne = 100 36 | # assert greedy == 1 37 | if greedy == 1: 38 | order, weights, total_ordering_time = LoadCoreset(coreset_from, data, subset_size, batch=batch,sampleSize=sampleSize) 39 | else: 40 | print('Selecting a random subset') 41 | order = np.arange(0, len(train_data)) 42 | random.shuffle(order) 43 | order = order[:int(subset_size * len(train_data))] 44 | print(' 【Random subset size】 is ', int(subset_size * len(train_data))) 45 | weights = np.ones(int(subset_size * len(train_data)), dtype=np.float) 46 | print(f'--------------- run number: {itr}, rand: {rand}, ' 47 | f'subset: {subset_size}, subset size: {len(order)}') 48 | 49 | best_test_f = 0 50 | best_test_MAE = 0 51 | best_test_MSE = 0 52 | best_test_MSLE = 0 53 | 54 | print("g_range is ", g_range) 55 | print("b_range is ", b_range) 56 | for gamma in g_range: 57 | for b in b_range: 58 | dim = len(train_data[0]) 59 | 60 | model = LinearRegression(dim) 61 | lr = gamma * np.power(b, np.arange(ne)) if exp_decay else gamma / (1 + b * np.arange(ne)) 62 | 63 | st_time = time.time() 64 | x_s, t_s = Optimizer().optimize( 65 | method, model, train_data[order, :], train_target[order], weights, ne, shuffle, lr, reg) 66 | en_time = time.time() 67 | print("Train time is ", en_time - st_time) 68 | train_time_list.append(en_time - st_time) 69 | 70 | f_s = model.loss(val_data, val_target, l2_reg=reg) 71 | 72 | print(f'data: {data}, method: {method}, run: {itr}, exp_decay: {exp_decay}, size: {subset_size} {rand} ' 73 | f'--> f: {f_s}, b: {b}, g: {gamma}') 74 | 75 | if f_s < f_best: 76 | x_a, g_a, b_a, t_a = x_s, gamma, b, t_s 77 | 78 | f_best = f_s 79 | 80 | best_test_f = model.loss(test_data, test_target) 81 | best_test_MAE, best_test_MSE, best_test_MSLE = model.MASLE(test_data, test_target) 82 | print("Current best f is ", f_best) 83 | print("Current best MAE is ", best_test_MAE) 84 | print("Current best MSE is ", best_test_MSE) 85 | print("Current best MSLE is ", best_test_MSLE) 86 | 87 | 88 | print(f'Best solution is => f: {f_best}, a: {acc_best}, b_f: {b_f}, g_f: {g_f}, b_a: {b_a}, g_a: {g_a}') 89 | 90 | 91 | best_f_list.append(f_best) 92 | best_MAE_list.append(best_test_MAE) 93 | best_MSE_list.append(best_test_MSE) 94 | best_MSLE_list.append(best_test_MSLE) 95 | 96 | print(" Current best f_list") 97 | print(best_f_list) 98 | print("Mean ", np.mean(best_f_list), "Max ", np.max(best_f_list), "Min ", np.min(best_f_list), 99 | "Median ", np.median(best_f_list)) 100 | 101 | print(" Current best MAE_list") 102 | print(best_MAE_list) 103 | print("Mean ", np.mean(best_MAE_list), "Max ", np.max(best_MAE_list), "Min ", np.min(best_MAE_list), 104 | "Median ", np.median(best_MAE_list)) 105 | 106 | 107 | print(" Current best MSE_list") 108 | print(best_MSE_list) 109 | print("Mean ", np.mean(best_MSE_list), "Max ", np.max(best_MSE_list), "Min ", np.min(best_MSE_list), 110 | "Median ", np.median(best_MSE_list)) 111 | 112 | print(" Current best MSLE_list") 113 | print(best_MSLE_list) 114 | print("Mean ", np.mean(best_MSLE_list), "Max ", np.max(best_MSLE_list), "Min ", np.min(best_MSLE_list), 115 | "Median ", np.median(best_MSLE_list)) 116 | 117 | 118 | print("Train time list(one hyper-param)") 119 | print(train_time_list) 120 | print("Mean ", np.mean(train_time_list), "Max ", np.max(train_time_list), "Min ", np.min(train_time_list), "Median ", np.median(train_time_list)) 121 | print('Finish') 122 | return best_MSE_list, train_time_list, best_f_list 123 | 
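# --- Illustrative sketch (editor's addition, not part of the original script) ---
# test() above grid-searches two learning-rate schedules: with exp_decay=1 the rate at
# epoch t is gamma * b**t, otherwise gamma / (1 + b * t), matching the `lr = ...` line
# inside the loop over (gamma, b). A minimal standalone helper, assuming only numpy:
import numpy as np

def make_lr_schedule(gamma, b, num_epochs, exp_decay=True):
    """Per-epoch learning rates for the exponential or inverse-decay schedule."""
    t = np.arange(num_epochs)
    return gamma * np.power(b, t) if exp_decay else gamma / (1 + b * t)

# e.g. make_lr_schedule(0.1, 0.95, 3) is roughly [0.1, 0.095, 0.09025]; the inverse
# schedule with the same parameters gives 0.1, 0.1/(1 + 0.95), 0.1/(1 + 1.9), ...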
124 | 125 | if __name__ == '__main__': 126 | p = argparse.ArgumentParser(description='Faster Training.') 127 | p.add_argument('--data', type=str, required=False, default='IMDB', 128 | choices=['IMDBCLinear','IMDBLargeCLinear','stackLinear', 'taxi', 'stackn'], help='name of dataset') 129 | p.add_argument('--greedy', type=int, required=False, default=1, 130 | help='greedy ordering') 131 | p.add_argument('--reg', type=float, required=False, default=1e-5, 132 | help='L2 regularization constant') 133 | p.add_argument('--method', type=str, required=False, default='sgd', 134 | choices=['sgd', 'svrg', 'saga', 'BGD'], help='sgd, svrg, saga, BGD') 135 | p.add_argument('--subset_size', '-s', type=float, required=False, 136 | help='size of the subset') 137 | p.add_argument('--shuffle', type=int, default=2, 138 | choices=[0, 1, 2, 3], 139 | help='0: not shuffling, 1: random permutation, 2: with replacement, 3: fixed permutation') 140 | p.add_argument('--exp_decay', type=int, required=False, default=1, 141 | choices=[0, 1], help='exponentially decaying learning rate') 142 | p.add_argument('--num_runs', type=int, required=False, default=10, 143 | help='number of runs') 144 | p.add_argument('--metric', type=str, required=False, default='l2', 145 | help='distance metric') 146 | p.add_argument('--b', type=float, required=False, default=-1, 147 | help='learning rate parameter b') 148 | p.add_argument('--g', type=float, required=False, default=-1, 149 | help='learning rate parameter g') 150 | p.add_argument('--ne', type=int, required=False, default=-1, 151 | help='number of epochs') 152 | p.add_argument('--grad_diff', type=int, required=False, default=0, 153 | help='number of epochs') 154 | p.add_argument('--from_all', type=int, required=False, default=0) 155 | p.add_argument('--coreset_from', type=str, required=False, default='diskOurs', 156 | choices=['diskOurs'], help='Where to load coreset') 157 | args = p.parse_args() 158 | 159 | if args.greedy == 0: 160 | rand = 'rand_nw' 161 | elif args.greedy == 1 and args.shuffle == 1: 162 | rand = 'grd_shuff' 163 | elif args.greedy == 1 and args.shuffle == 2: 164 | rand = 'grd_rand' 165 | elif args.greedy == 1 and args.shuffle == 0: 166 | rand = 'grd_ord' 167 | elif args.greedy == 1 and args.shuffle > 2: 168 | rand = 'grd_fix_perm' 169 | else: 170 | rand = '' 171 | 172 | print("Start test time", time.asctime( time.localtime(time.time()) )) 173 | test(method=args.method, data=args.data, exp_decay=args.exp_decay, subset_size=args.subset_size, 174 | greedy=args.greedy, shuffle=args.shuffle, b_cnt=args.b, g_cnt=args.g, num_runs=args.num_runs, 175 | metric=args.metric, rand=rand, ne=-1, from_all=args.from_all, 176 | coreset_from=args.coreset_from, reg=args.reg, batch=0) 177 | print("Finished test time", time.asctime(time.localtime(time.time()) )) 178 | 179 | -------------------------------------------------------------------------------- /logistic-universal.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import time 4 | from os import path 5 | from warnings import simplefilter 6 | import warnings 7 | simplefilter(action='ignore', category=FutureWarning) 8 | warnings.filterwarnings('ignore') 9 | np.seterr(all='ignore') 10 | import random 11 | from MLModel.optimizer import * 12 | from MLModel.LoadData import * 13 | from MLModel.MLmodel.logisticRegression import * 14 | from MLModel.paramRange import * 15 | from MLModel.LoadCoreset import * 16 | from MLModel.hidden import * 17 | 18 | 19 | def 
test(method='sgd', data='movieLen1M', exp_decay=1, subset_size=1., greedy=1, shuffle=0, g_cnt=-1., 20 | b_cnt=-1., num_runs=10, metric='', reg=1e-5, rand='', ne=-1, from_all=0, 21 | coreset_from='scratch', 22 | batch=0, sampleSize=0): 23 | train_data, train_target, val_data, val_target, test_data, test_target = load_dataset(data) 24 | print("Dataset Loaded") 25 | print(np.unique(train_target)) 26 | if data in ['IMDBC5', 'IMDBLargeC5', 'Brazilnew']: 27 | num_class = 5 28 | print("num class is ", num_class) 29 | print(np.unique(test_target)) 30 | print("Class number is [{}]".format(num_class)) 31 | g_range, b_range = get_param_range(subset_size, exp_decay, method, data) 32 | 33 | 34 | x_runs_a = [[]] * num_runs 35 | if ne == -1: 36 | ne = 20 + int(np.ceil((1. / subset_size) * 5)) + 5 if subset_size < 1 else 20 37 | else: 38 | rand += f'_e{ne}' 39 | if ne > 100: 40 | ne = 100 41 | f_runs_a = np.zeros((num_runs, ne)) 42 | ft_runs_a = np.zeros((num_runs, ne)) 43 | acc_runs_a = np.zeros((num_runs, ne)) 44 | t_runs_a = np.zeros((num_runs, ne)) 45 | 46 | 47 | precision_runs_a = np.zeros((num_runs, ne)) 48 | recall_runs_a = np.zeros((num_runs, ne)) 49 | 50 | best_f1_list = [] 51 | best_f_list = [] 52 | best_acc_list = [] 53 | best_precision_list = [] 54 | best_recall_list = [] 55 | train_time_list = [] 56 | best_MAE_list = [] 57 | best_MSE_list = [] 58 | best_MSLE_list = [] 59 | 60 | for itr in range(num_runs): 61 | f_best, acc_best, b_f, g_f, b_a, g_a = 1e10, 0, 0, 0, 0, 0 62 | 63 | 64 | print("Cur itr is ", itr) 65 | # assert greedy == 1 66 | if greedy == 1: 67 | order, weights, total_ordering_time = LoadCoreset(coreset_from, data, subset_size, batch=batch, sampleSize=sampleSize) 68 | else: 69 | print('Selecting a random subset') 70 | 71 | order = np.arange(0, len(train_data)) 72 | random.shuffle(order) 73 | order = order[:int(subset_size * len(train_data))] 74 | print(' 【Random subset size】 is ', int(subset_size * len(train_data))) 75 | weights = np.ones(int(subset_size * len(train_data)), dtype=np.float) 76 | 77 | print(f'--------------- run number: {itr}, rand: {rand}, ' 78 | f'subset: {subset_size}, subset size: {len(order)}') 79 | 80 | best_test_f1 = 0 81 | best_test_acc = 0 82 | best_test_recall = 0 83 | best_test_precision = 0 84 | 85 | best_test_MAE = 0 86 | best_test_MSE = 0 87 | best_test_MSLE = 0 88 | 89 | print("g_range is ", g_range) 90 | print("b_range is ", b_range) 91 | for gamma in g_range: 92 | for b in b_range: 93 | 94 | 95 | dim = len(train_data[0]) 96 | 97 | model = LogisticRegression(dim, num_class) 98 | lr = gamma * np.power(b, np.arange(ne)) if exp_decay else gamma / (1 + b * np.arange(ne)) 99 | 100 | st_time = time.time() 101 | x_s, t_s = Optimizer().optimize( 102 | method, model, train_data[order, :], train_target[order], weights, ne, shuffle, lr, reg) 103 | en_time = time.time() 104 | print("Train time is ", en_time - st_time) 105 | train_time_list.append(en_time - st_time) 106 | 107 | f_s = model.loss(train_data, train_target, l2_reg=reg) 108 | acc_s = model.accuracy(val_data, val_target) 109 | print('acc_s is ',acc_s) 110 | f1_s = model.f1(val_data, val_target) 111 | 112 | print(f'data: {data}, method: {method}, run: {itr}, exp_decay: {exp_decay}, size: {subset_size} {rand} ' 113 | f'--> f: {f_s}, acc: {acc_s}, f1: {f1_s} b: {b}, g: {gamma}') 114 | # if f1_s > f1_best: 115 | if acc_s > acc_best: 116 | 117 | acc_best, x_a, g_a, b_a, t_a = acc_s, x_s, gamma, b, t_s 118 | 119 | f1_best = f1_s 120 | f_best = f_s 121 | 122 | x_runs_a[itr] = x_a 123 | t_runs_a[itr, :] = t_a 
124 | best_test_f1 = model.f1(test_data, test_target) 125 | best_test_precision = model.precision(test_data, test_target) 126 | best_test_recall = model.recall(test_data, test_target) 127 | best_test_acc = model.acc(test_data, test_target) 128 | 129 | best_test_MAE, best_test_MSE, best_test_MSLE = model.MASLE(test_data, test_target) 130 | print("### New best MAE is ", best_test_MAE) 131 | print("### New best MSE is ", best_test_MSE) 132 | print("### New best MSLE is ", best_test_MSLE) 133 | 134 | print("### New best f1 is [{}]".format(best_test_f1)) 135 | print("### New best precision is [{}]".format(best_test_precision)) 136 | print("### New best recall is [{}]".format(best_test_recall)) 137 | print("### New best acc is [{}]".format(best_test_acc)) 138 | 139 | 140 | print(f'Best solution is => f: {f_best}, a: {acc_best}, b_f: {b_f}, g_f: {g_f}, b_a: {b_a}, g_a: {g_a}') 141 | 142 | 143 | best_f1_list.append(best_test_f1) 144 | best_f_list.append(f_best) 145 | best_acc_list.append(best_test_acc) 146 | best_precision_list.append(best_test_precision) 147 | best_recall_list.append(best_test_recall) 148 | 149 | best_MAE_list.append(best_test_MAE) 150 | best_MSE_list.append(best_test_MSE) 151 | best_MSLE_list.append(best_test_MSLE) 152 | 153 | print(" Current best_f1_list") 154 | print(best_f1_list) 155 | print("Mean ", np.mean(best_f1_list), "Max ", np.max(best_f1_list), "Min ", np.min(best_f1_list), 156 | "Median ", np.median(best_f1_list)) 157 | 158 | print(" Current best f_list") 159 | print(best_f_list) 160 | print("Mean ", np.mean(best_f_list), "Max ", np.max(best_f_list), "Min ", np.min(best_f_list), 161 | "Median ", np.median(best_f_list)) 162 | print(" Current best acc_list") 163 | print(best_acc_list) 164 | print(" Current best recall_list") 165 | print(best_recall_list) 166 | print(" Current best precision_list") 167 | print(best_precision_list) 168 | 169 | print(" Current best MAE_list") 170 | print(best_MAE_list) 171 | print("Mean ", np.mean(best_MAE_list), "Max ", np.max(best_MAE_list), "Min ", np.min(best_MAE_list), 172 | "Median ", np.median(best_MAE_list)) 173 | 174 | print(" Current best MSE_list") 175 | print(best_MSE_list) 176 | print("Mean ", np.mean(best_MSE_list), "Max ", np.max(best_MSE_list), "Min ", np.min(best_MSE_list), 177 | "Median ", np.median(best_MSE_list)) 178 | 179 | print(" Current best MSLE_list") 180 | print(best_MSLE_list) 181 | print("Mean ", np.mean(best_MSLE_list), "Max ", np.max(best_MSLE_list), "Min ", np.min(best_MSLE_list), 182 | "Median ", np.median(best_MSLE_list)) 183 | 184 | print("Train time list(one hyper-param)") 185 | print(train_time_list) 186 | print("Mean ", np.mean(train_time_list), "Max ", np.max(train_time_list), "Min ", np.min(train_time_list), "Median ", np.median(train_time_list)) 187 | print('Finish') 188 | 189 | return best_acc_list,best_MSE_list,train_time_list,best_f_list 190 | 191 | 192 | if __name__ == '__main__': 193 | 194 | p = argparse.ArgumentParser(description='Faster Training.') 195 | p.add_argument('--exp_decay', type=int, required=False, default=1, 196 | choices=[0, 1], help='exponentially decaying learning rate') 197 | p.add_argument('--greedy', type=int, required=False, default=1, 198 | help='greedy ordering') 199 | p.add_argument('--reg', type=float, required=False, default=1e-5, 200 | help='L2 regularization constant') 201 | p.add_argument('--method', type=str, required=False, default='sgd', 202 | choices=['sgd', 'svrg', 'saga', 'BGD'], help='sgd, svrg, saga, BGD') 203 | p.add_argument('--subset_size', '-s', 
type=float, required=False, 204 | help='size of the subset') 205 | p.add_argument('--shuffle', type=int, default=2, 206 | choices=[0, 1, 2, 3], 207 | help='0: not shuffling, 1: random permutation, 2: with replacement, 3: fixed permutation') 208 | p.add_argument('--num_runs', type=int, required=False, default=10, 209 | help='number of runs') 210 | p.add_argument('--data', type=str, required=False, default='IMDB', 211 | choices=['IMDBC5','IMDBLargeC5', 'Brazilnew'], help='name of dataset') 212 | p.add_argument('--metric', type=str, required=False, default='l2', 213 | help='distance metric') 214 | p.add_argument('--b', type=float, required=False, default=-1, 215 | help='learning rate parameter b') 216 | p.add_argument('--g', type=float, required=False, default=-1, 217 | help='learning rate parameter g') 218 | p.add_argument('--grad_diff', type=int, required=False, default=0, 219 | help='number of epochs') 220 | p.add_argument('--from_all', type=int, required=False, default=0) 221 | p.add_argument('--coreset_from', type=str, required=False, default='diskOurs', 222 | choices=['diskOurs'], help='Where to load coreset') 223 | p.add_argument('--batch', type=int, required=False, default=0) 224 | 225 | args = p.parse_args() 226 | 227 | if args.greedy == 0: 228 | rand = 'rand_nw' 229 | elif args.greedy == 1 and args.shuffle == 1: 230 | rand = 'grd_shuff' 231 | elif args.greedy == 1 and args.shuffle == 2: 232 | rand = 'grd_rand' 233 | elif args.greedy == 1 and args.shuffle == 0: 234 | rand = 'grd_ord' 235 | elif args.greedy == 1 and args.shuffle > 2: 236 | rand = 'grd_fix_perm' 237 | else: 238 | rand = '' 239 | 240 | print("Start test time", time.asctime( time.localtime(time.time()) )) 241 | test(method=args.method, data=args.data, exp_decay=args.exp_decay, subset_size=args.subset_size, 242 | greedy=args.greedy, shuffle=args.shuffle, b_cnt=args.b, g_cnt=args.g, num_runs=args.num_runs, 243 | metric=args.metric, rand=rand, ne=-1, from_all=args.from_all, 244 | coreset_from=args.coreset_from, reg=args.reg,batch=0) 245 | print("Finished test time", time.asctime(time.localtime(time.time()) )) 246 | 247 | 248 | 249 | -------------------------------------------------------------------------------- /preprocess/Brazil.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import metrics 5 | import os 6 | 7 | X_train = [] 8 | X_test = [] 9 | y_train = [] 10 | y_test = [] 11 | 12 | import datetime 13 | def parseDatetime(s): 14 | pre, suf = s.split(' ') 15 | 16 | year_s, mon_s, day_s = pre.split('-') 17 | hour_s, minute_s, second_s = suf.split(':') 18 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 19 | 20 | def timeDelta(arrLike, col1, col2): 21 | purchase = parseDatetime(arrLike[col1]) 22 | approve = parseDatetime(arrLike[col2]) 23 | delta = approve - purchase 24 | return delta.total_seconds() 25 | 26 | DIR = '/home/jiayi/disk/C-craig/dataset/Brazil/' 27 | 28 | file = 'olist_order_reviews_dataset' 29 | review = pd.read_csv(DIR + file + '.csv') 30 | 31 | file = 'olist_orders_dataset.csv' 32 | order = pd.read_csv(DIR + file) 33 | 34 | file = 'olist_order_items_dataset.csv' 35 | orderItem = pd.read_csv(DIR + file) 36 | 37 | file = 'olist_products_dataset.csv' 38 | product = pd.read_csv(DIR + file) 39 | 40 | review = review[['review_id', 'order_id', 'review_score','review_creation_date', 
'review_answer_timestamp']].copy() 41 | order = order[['order_id', 'order_status', 'order_purchase_timestamp', 42 | 'order_approved_at', 'order_delivered_carrier_date', 43 | 'order_delivered_customer_date', 'order_estimated_delivery_date']].copy() 44 | orderItem = orderItem[['order_id', 'product_id', 45 | 'price', 'freight_value']].copy() 46 | product = product[['product_id', 'product_photos_qty']].copy() 47 | 48 | review.review_score = review.review_score - 1 49 | 50 | tmp = pd.merge(review, order) 51 | tmp = pd.merge(tmp, orderItem) 52 | tmp = pd.merge(tmp, product) 53 | 54 | print(tmp.shape) 55 | tmp.dropna(inplace=True) 56 | print(tmp.shape) 57 | 58 | tmp['approve'] = tmp.apply(timeDelta, axis=1, args=('order_purchase_timestamp', 59 | 'order_approved_at')) 60 | tmp['approve'] /= tmp['approve'].max() 61 | tmp['deliver'] = tmp.apply(timeDelta, axis=1, args=('order_approved_at', 62 | 'order_delivered_carrier_date')) 63 | tmp['deliver'] /= tmp['deliver'].max() 64 | tmp['arrive'] = tmp.apply(timeDelta, axis=1, args=('order_delivered_carrier_date', 65 | 'order_delivered_customer_date')) 66 | tmp['arrive'] /= tmp['arrive'].max() 67 | tmp['review'] = tmp.apply(timeDelta, axis=1, args=('review_creation_date', 68 | 'review_answer_timestamp')) 69 | tmp['review'] /= tmp['review'].max() 70 | 71 | tmp['faster'] = tmp.apply(timeDelta, axis=1, args=('order_delivered_customer_date', 72 | 'order_estimated_delivery_date')) 73 | tmp['faster'] /= tmp['faster'].max() 74 | isDelivered_idx = tmp[tmp['order_status'] == 'delivered'].index 75 | isCanceled_idx = tmp[tmp['order_status'] == 'canceled'].index 76 | tmp.loc[isDelivered_idx, 'order_status'] = 0 77 | tmp.loc[isCanceled_idx, 'order_status'] = 1 78 | col_list = [ 79 | 'review_score', 80 | 'order_status', 81 | 'approve', 82 | 'deliver', 83 | 'arrive', 84 | 'faster', 85 | 'review' 86 | ] 87 | 88 | tmp.drop([ 89 | 'review_creation_date','review_answer_timestamp', 90 | 'order_purchase_timestamp', 'order_approved_at', 91 | 'order_delivered_carrier_date', 'order_delivered_customer_date', 92 | 'order_estimated_delivery_date', 93 | ], axis=1, inplace=True) 94 | 95 | print(tmp.columns) 96 | print(tmp.shape) 97 | 98 | for col in tmp.columns: 99 | if col not in [ 100 | 'review_score', 101 | 'order_status', 102 | 'approve', 103 | 'deliver', 104 | 'arrive', 105 | 'faster', 106 | 'review', 107 | 'product_id','review_id', 'order_id', 108 | ]: 109 | print(col) 110 | tmp[col] = (tmp[col] - tmp[col].min()) / (tmp[col].max() - tmp[col].min()) 111 | 112 | tmp.drop_duplicates(keep='first',inplace=True) 113 | print(tmp.shape) 114 | 115 | print(tmp.columns) 116 | 117 | review = tmp[['review_id', 'order_id', 'review_score','review']].copy() 118 | 119 | order = tmp[['order_id', 'order_status', 'approve', 'deliver','arrive', 'faster']].copy() 120 | 121 | orderItem = tmp[['order_id', 'product_id', 122 | 'price', 'freight_value']].copy() 123 | 124 | product = tmp[['product_id', 'product_photos_qty']].copy() 125 | 126 | review.drop_duplicates(['order_id'],keep='first', inplace=True) 127 | review.drop_duplicates(['review_id'],keep='first', inplace=True) 128 | 129 | order.drop_duplicates(keep='first', inplace=True) 130 | orderItem.drop_duplicates(keep='first', inplace=True) 131 | product.drop_duplicates(keep='first', inplace=True) 132 | 133 | from sklearn.utils import shuffle 134 | rng=np.random.RandomState(123) 135 | review = shuffle(review, random_state=rng) 136 | 137 | print("All base data shape is ") 138 | print(review.shape) 139 | 140 | TrainProp = 0.5 141 | ValProp = 0.25 142 | 
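# --- Editor's note (illustrative, not part of the original script) ---
# The 50% / 25% / 25% split below is taken over the deduplicated, shuffled `review`
# table and only afterwards re-joined with order, orderItem and product, so each
# review_id can land in exactly one of train/val/test. A hypothetical disjointness
# check that could be run after the merges further down:
#
#   assert set(trainSet.review_id).isdisjoint(valSet.review_id)
#   assert set(trainSet.review_id).isdisjoint(testSet.review_id)
#   assert set(valSet.review_id).isdisjoint(testSet.review_id)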
TrainEnd = int(TrainProp * review.shape[0]) 143 | ValEnd = TrainEnd + int(ValProp * review.shape[0]) 144 | 145 | print(TrainEnd) 146 | print(ValEnd) 147 | 148 | trainReview = review[:TrainEnd].copy() 149 | valReview = review[TrainEnd:ValEnd].copy() 150 | testReview = review[ValEnd:].copy() 151 | 152 | trainSet = pd.merge(trainReview, order) 153 | trainSet = pd.merge(trainSet, orderItem) 154 | trainSet = pd.merge(trainSet, product) 155 | 156 | valSet = pd.merge(valReview, order) 157 | valSet = pd.merge(valSet, orderItem) 158 | valSet = pd.merge(valSet, product) 159 | 160 | testSet = pd.merge(testReview, order) 161 | testSet = pd.merge(testSet, orderItem) 162 | testSet = pd.merge(testSet, product) 163 | 164 | DIR = "/home/jiayi/disk/C-craig/dataset/" 165 | dataName = "Brazilnew" 166 | 167 | trainSet.to_csv(DIR + "{}-train.csv".format(dataName), index=False) 168 | valSet.to_csv(DIR + "{}-val.csv".format(dataName), index=False) 169 | testSet.to_csv(DIR + "{}-test.csv".format(dataName), index=False) 170 | 171 | y_train = trainSet.review_score.values 172 | y_val = valSet.review_score.values 173 | y_test = testSet.review_score.values 174 | 175 | trainSet.drop(['review_id', 'order_id','review_score','product_id' ], axis=1, inplace=True) 176 | valSet.drop(['review_id', 'order_id', 'review_score', 'product_id' ], axis=1, inplace=True) 177 | testSet.drop(['review_id', 'order_id','review_score', 'product_id' ], axis=1, inplace=True) 178 | 179 | X_train = np.ascontiguousarray(trainSet.astype(np.float64)) 180 | X_val = np.ascontiguousarray(valSet.astype(np.float64)) 181 | X_test = np.ascontiguousarray(testSet.astype(np.float64)) 182 | 183 | print(trainSet.shape) 184 | print(trainSet.columns) 185 | 186 | DIR = "/home/jiayi/disk/C-craig/dataset/" 187 | dataName = "Brazilnew" 188 | np.save(DIR + "{}-train-X.npy".format(dataName), X_train) 189 | np.save(DIR + "{}-test-X.npy".format(dataName), X_test) 190 | np.save(DIR + "{}-val-X.npy".format(dataName), X_val) 191 | 192 | np.save(DIR + "{}-train-y.npy".format(dataName), y_train) 193 | np.save(DIR + "{}-test-y.npy".format(dataName), y_test) 194 | np.save(DIR + "{}-val-y.npy".format(dataName), y_val) 195 | 196 | DIR = "/home/jiayi/disk/C-craig/dataset/" 197 | dataName = "Brazilnew" 198 | 199 | tmp = pd.read_csv(DIR + "{}-train.csv".format(dataName)) 200 | tmp['rowID'] = np.arange(tmp.shape[0]) 201 | 202 | dataName = "Brazilnew" 203 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 204 | 205 | tmp = tmp[['review_id', 'order_id', 'product_id', 'rowID', 206 | 'review_score', 'review', 'order_status', 207 | 'approve', 'deliver', 'arrive', 'faster', 'price', 208 | 'freight_value', 'product_photos_qty']].copy() 209 | 210 | for cate in range(5): 211 | train = tmp[tmp.review_score==cate].copy() 212 | 213 | le = preprocessing.LabelEncoder() 214 | le.fit(train.review_id) 215 | train.review_id = le.transform(train.review_id) 216 | 217 | le = preprocessing.LabelEncoder() 218 | le.fit(train.order_id) 219 | train.order_id = le.transform(train.order_id) 220 | 221 | le = preprocessing.LabelEncoder() 222 | le.fit(train.product_id) 223 | train.product_id = le.transform(train.product_id) 224 | 225 | 226 | train.to_csv(mycsDIR + "train-cate-{}-joined.csv".format(cate), index=False) 227 | tmp_ = np.ascontiguousarray(train.values.astype(np.float64)) 228 | np.save(mycsDIR + "train-cate-{}-joined.npy".format(cate), tmp_) 229 | 230 | 231 | review = train[['review_id', 'order_id', 'review_score','review']].copy() 232 | review.sort_values(by='review_id') 233 | 
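# Editor's note: sort_values() here and on the order/orderItem copies below returns a
# new DataFrame; without reassignment (or inplace=True) the frames written out keep
# their original row order. If sorted output is intended, e.g.:
#   review = review.sort_values(by='review_id')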
234 | order = train[['order_id', 'order_status', 'approve', 'deliver','arrive', 'faster']].copy() 235 | order.sort_values(by='order_id') 236 | 237 | orderItem = train[['order_id', 'rowID','product_id', 238 | 'price', 'freight_value']].copy() 239 | orderItem.sort_values(by='order_id') 240 | 241 | product = train[['product_id', 'product_photos_qty']].copy() 242 | 243 | 244 | review.drop_duplicates(keep='first', inplace=True) 245 | order.drop_duplicates(keep='first', inplace=True) 246 | orderItem.drop_duplicates(keep='first', inplace=True) 247 | product.drop_duplicates(keep='first', inplace=True) 248 | 249 | 250 | 251 | np.save(mycsDIR + 'train-cate-{}-review.npy'.format(cate), np.ascontiguousarray(review.values.astype(np.float64))) 252 | np.save(mycsDIR + 'train-cate-{}-order.npy'.format(cate), np.ascontiguousarray(order.values.astype(np.float64))) 253 | np.save(mycsDIR + 'train-cate-{}-orderItem.npy'.format(cate), np.ascontiguousarray(orderItem.values.astype(np.float64))) 254 | np.save(mycsDIR + 'train-cate-{}-product.npy'.format(cate), np.ascontiguousarray(product.values.astype(np.float64))) 255 | 256 | 257 | review.to_csv(mycsDIR + 'train-cate-{}-review.csv'.format(cate),index=False) 258 | order.to_csv(mycsDIR + 'train-cate-{}-order.csv'.format(cate), index=False) 259 | orderItem.to_csv(mycsDIR + 'train-cate-{}-orderItem.csv'.format(cate), index=False) 260 | product.to_csv(mycsDIR + 'train-cate-{}-product.csv'.format(cate), index=False) 261 | 262 | -------------------------------------------------------------------------------- /preprocess/IMDBC-5.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn import metrics 6 | import os 7 | from sklearn import preprocessing 8 | 9 | X_train = [] 10 | X_test = [] 11 | y_train = [] 12 | y_test = [] 13 | 14 | import datetime 15 | def parseDatetime(s): 16 | # print('s is ',s) 17 | pre, suf = s.split(' ') 18 | 19 | year_s, mon_s, day_s = pre.split('-') 20 | hour_s, minute_s, second_s = suf.split(':') 21 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 22 | 23 | def timeDelta(arrLike, col1, col2): 24 | purchase = parseDatetime(arrLike[col1]) 25 | approve = parseDatetime(arrLike[col2]) 26 | delta = approve - purchase 27 | return delta.total_seconds() 28 | 29 | from scipy import sparse 30 | def transMultihot(df, rowName, colName, IDName, onehotName='s'): 31 | 32 | tmp = df[colName].factorize() 33 | df.drop(colName,axis=1, inplace=True) 34 | df.insert(df.shape[1],colName,tmp[0]) 35 | 36 | values = np.ones(df.shape[0]) 37 | rows = df[rowName].values 38 | cols = df[colName].values 39 | 40 | sparse_matrix = sparse.coo_matrix((values, (rows,cols))) 41 | ar = sparse_matrix.toarray() 42 | sm = ar.sum(axis=1) 43 | 44 | idxs = sm>0 45 | IDs = np.arange(ar.shape[0]) 46 | 47 | IDs = IDs[idxs] 48 | ARs = ar[idxs] 49 | 50 | col_name_list = ['{}{}'.format(onehotName, i) for i in range(ARs.shape[1])] 51 | col_name_list = [IDName] + col_name_list 52 | 53 | assert IDs.shape[0] == ARs.shape[0] 54 | IDs = IDs.reshape(-1,1) 55 | 56 | z = np.concatenate((IDs,ARs),axis=1) 57 | 58 | 59 | ret = pd.DataFrame(z, columns=col_name_list) 60 | ret[IDName] = ret[IDName].astype(np.int64) 61 | return ret 62 | 63 | DIR = '/home/jiayi/disk/neurocard/datasets/job/' 64 | 65 | file = 'title.csv' 66 | title = pd.read_csv(DIR+file) 67 | 68 | file = 'info_type.csv' 69 | it = 
pd.read_csv(DIR+file) 70 | 71 | file = 'movie_info.csv' 72 | mi = pd.read_csv(DIR+file) 73 | 74 | file = 'movie_info_idx.csv' 75 | mix = pd.read_csv(DIR+file) 76 | 77 | file = 'name.csv' 78 | name = pd.read_csv(DIR+file) 79 | 80 | file = 'cast_info.csv' 81 | ci = pd.read_csv(DIR+file) 82 | 83 | file = 'movie_companies.csv' 84 | mc = pd.read_csv(DIR+file) 85 | 86 | file = 'company_name.csv' 87 | cn = pd.read_csv(DIR+file) 88 | 89 | def changeToFloor(arrLike, col): 90 | 91 | # def timeDelta(arrLike, col1, col2): 92 | colValue = arrLike[col] 93 | # colValue = np.floor(colValue) 94 | colValue = np.around(arrLike[col],0) 95 | 96 | colValue = np.floor(colValue/2) 97 | return colValue 98 | # purchase = parseDatetime(arrLike[col1]) 99 | # approve = parseDatetime(arrLike[col2]) 100 | # delta = approve - purchase 101 | # return delta.total_seconds() 102 | 103 | def LoadIMDBC(Large=0,dataName="", saveCSV=False): 104 | global X_train, X_test, X_val, y_val, y_train, y_test 105 | 106 | z = mix.copy() 107 | # print(z.groupby('id')) 108 | votes = z[z['info_type_id']==100].copy() 109 | rating = z[z['info_type_id']==101].copy() 110 | 111 | votes['info'] = votes['info'].astype(int) 112 | useVotes = votes[votes['info']>100].copy() 113 | 114 | useVotes.rename(columns={'info':'votes'},inplace=True) 115 | useVotes = useVotes[['movie_id', 'votes']] 116 | 117 | MAX = useVotes.votes.max() 118 | MIN = useVotes.votes.min() 119 | useVotes.votes = (useVotes.votes - MIN)/(MAX - MIN) 120 | 121 | rating['info'] = rating['info'].astype(np.double) 122 | useRating = rating.copy() 123 | 124 | useRating.rename(columns={'info':'rating'},inplace=True) 125 | 126 | useRating = useRating[['movie_id', 'rating']] 127 | useRating['rating'] = useRating['rating'].astype(np.double) 128 | 129 | 130 | 131 | 132 | 133 | useRating['rating'] = useRating.apply(changeToFloor, axis=1, args=['rating']) 134 | useRating.rating-=1 135 | 136 | # midLE = preprocessing.LabelEncoder() 137 | # midLE.fit(useRating.rating) 138 | # useRating['rating'] = midLE.transform(useRating.rating) 139 | 140 | # useRating['rating']=useRating['rating'].astype(int) 141 | 142 | useMIX = pd.merge(useVotes, useRating) 143 | print(useMIX.shape) 144 | print(useMIX.columns) 145 | 146 | useMI = mi.copy() 147 | color = useMI[useMI['info_type_id']==2].copy() 148 | genres = useMI[useMI['info_type_id']==3].copy() 149 | 150 | color.rename(columns={'info':'color'},inplace=True) 151 | 152 | color = color[['movie_id', 'color']] 153 | 154 | BWIndex = color[color['color']=='Black and White'].index 155 | ColorIndex = color[color['color']=='Color'].index 156 | color.loc[BWIndex,'color'] = 0 157 | color.loc[ColorIndex,'color'] = 1 158 | 159 | genres.rename(columns={'info':'genres'},inplace=True) 160 | genres = genres[['movie_id', 'genres']] 161 | genres.drop_duplicates(inplace=True) 162 | 163 | genres = transMultihot(genres, 'movie_id', 'genres', IDName='movie_id', onehotName='s') 164 | 165 | useMI = pd.merge(color, genres) 166 | 167 | print(useMI.shape) 168 | print(useMI.columns) 169 | 170 | if Large==0: 171 | useCI = ci[ci['role_id']==4].copy() 172 | else: 173 | useCI= ci.copy() 174 | # useCI = ci[ci['role_id']<=4].copy() 175 | 176 | useCI = useCI[['person_id', 'movie_id']] 177 | print(useCI.shape) 178 | print(useCI.columns) 179 | 180 | 181 | useNAME = name.copy() 182 | 183 | mIndex = useNAME[useNAME['gender']=='m'].index 184 | fIndex = useNAME[useNAME['gender']=='f'].index 185 | 186 | useNAME.loc[mIndex,'gender'] = 1 187 | useNAME.loc[fIndex,'gender'] = 0 188 | 189 | genderNA = 
~useNAME['gender'].isna() 190 | # purchaseNA = ~tmp['order_purchase_timestamp'].isna() 191 | 192 | useNAME = useNAME[genderNA] 193 | 194 | useNAME.rename(columns={'id':'person_id'},inplace=True) 195 | useNAME = useNAME[['person_id', 'gender']] 196 | print(useNAME.shape) 197 | print(useNAME.columns) 198 | 199 | 200 | useTITLE = title.copy() 201 | useTITLE.rename(columns={'id':'movie_id'},inplace=True) 202 | yearNA = ~useTITLE.production_year.isna() 203 | kindNA = ~useTITLE.kind_id.isna() 204 | yearNA = yearNA & kindNA 205 | useTITLE = useTITLE[yearNA] 206 | 207 | useTITLE = useTITLE[['movie_id', 'production_year','kind_id']].copy() 208 | MIN = useTITLE.production_year.min() 209 | MAX = useTITLE.production_year.max() 210 | 211 | useTITLE['production_year'] = (useTITLE['production_year'] - MIN)/(MAX - MIN) 212 | useTITLE = useTITLE.join(pd.get_dummies(useTITLE.kind_id)) 213 | useTITLE.rename(columns={1:'k1',2:'k2',3:'k3',4:'k4',6:'k6',7:'k7'},inplace=True) 214 | 215 | useTITLE.drop(['kind_id'],axis=1, inplace=True) 216 | print(useTITLE.shape) 217 | print(useTITLE.columns) 218 | 219 | 220 | 221 | 222 | useMC = mc.copy() 223 | useCN = cn.copy() 224 | useCN.rename(columns={'id':'company_id'},inplace=True) 225 | le = preprocessing.LabelEncoder() 226 | le.fit(useCN.country_code) 227 | useCN['country_code'] = le.transform(useCN.country_code) 228 | 229 | joinedMC = pd.merge(useMC, useCN) 230 | 231 | tMC = joinedMC[['company_id', 'country_code','movie_id']].copy() 232 | 233 | 234 | MAX = tMC.country_code.max() 235 | tMC['country_code'] = (tMC['country_code']/MAX) 236 | 237 | 238 | 239 | useTITLE.drop_duplicates(useTITLE.columns,inplace=True) 240 | useMIX.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 241 | useMIX.drop_duplicates(useMIX.columns,inplace=True) 242 | useCI.drop_duplicates(useCI.columns,inplace=True) 243 | useNAME.drop_duplicates(useNAME.columns,inplace=True) 244 | useMI.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 245 | useMI.drop_duplicates(useMI.columns,inplace=True) 246 | 247 | 248 | 249 | useTITLE.drop_duplicates(inplace=True) 250 | useCI.drop_duplicates(inplace=True) 251 | useNAME.drop_duplicates(inplace=True) 252 | useMI.drop_duplicates(inplace=True) 253 | useMIX.drop_duplicates(inplace=True) 254 | tMC.drop_duplicates(inplace=True) 255 | 256 | 257 | 258 | 259 | 260 | z = pd.merge(useTITLE, useMIX) 261 | print(z.shape) 262 | z = pd.merge(z, useCI) 263 | print(z.shape) 264 | z = pd.merge(z, useNAME) 265 | print(z.shape) 266 | z = pd.merge(z, useMI) 267 | print(z.shape) 268 | 269 | 270 | 271 | print(z.columns) 272 | print(z.shape) 273 | 274 | print("##### Join company ") 275 | z = pd.merge(z, tMC) 276 | print(z.columns) 277 | print(z.shape) 278 | 279 | 280 | from sklearn.utils import shuffle 281 | z = shuffle(z, random_state=123) 282 | 283 | # trainSize = int(0.5 * z.shape[0]) 284 | # valSize = trainSize + int(0.25 * z.shape[0]) 285 | 286 | 287 | 288 | 289 | movieIDs= title.id.unique() 290 | movieIDs = shuffle(movieIDs, random_state=123) 291 | 292 | 293 | trainSize = int(0.5 * movieIDs.shape[0]) 294 | valSize = trainSize + int(0.25 * movieIDs.shape[0]) 295 | 296 | 297 | trainMovies = movieIDs[:trainSize] 298 | trainTMP = pd.DataFrame(trainMovies.reshape(-1,1), columns=["movie_id"]) 299 | trainData = pd.merge(z, trainTMP) 300 | 301 | valMovies = movieIDs[trainSize:valSize] 302 | valTMP = pd.DataFrame(valMovies.reshape(-1,1), columns=["movie_id"]) 303 | valData = pd.merge(z, valTMP) 304 | 305 | testMovies = movieIDs[valSize:] 306 | testTMP = 
pd.DataFrame(testMovies.reshape(-1,1), columns=["movie_id"]) 307 | testData = pd.merge(z, testTMP) 308 | 309 | 310 | 311 | 312 | 313 | y_train = trainData.rating.values 314 | y_val = valData.rating.values 315 | y_test = testData.rating.values 316 | 317 | 318 | if saveCSV: 319 | trainData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName), index=False) 320 | valData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-val.csv'.format(dataName), index=False) 321 | testData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-test.csv'.format(dataName), index=False) 322 | 323 | 324 | trainData.drop(['rating'], axis=1, inplace=True) 325 | valData.drop(['rating'], axis=1, inplace=True) 326 | testData.drop(['rating'], axis=1, inplace=True) 327 | 328 | trainData.drop(['person_id','movie_id', 'company_id'], axis=1, inplace=True) 329 | valData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 330 | testData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 331 | 332 | 333 | print("Train Data shape ", trainData.shape) 334 | print("Test Data shape ", testData.shape) 335 | print("Val Data shape ", valData.shape) 336 | 337 | print(trainData.columns) 338 | X_train = np.ascontiguousarray(trainData.values.astype(np.float64)) 339 | X_val = np.ascontiguousarray(valData.values.astype(np.float64)) 340 | X_test = np.ascontiguousarray(testData.values.astype(np.float64)) 341 | 342 | print(X_train.shape) 343 | print(y_train.shape) 344 | 345 | return z 346 | 347 | dataNameList = ["IMDBC5", "IMDBLargeC5"] 348 | parameterList = [0, 1] 349 | 350 | for dataName, param in zip(dataNameList, parameterList): 351 | LoadIMDBC(param, dataName, saveCSV=True) 352 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-X.npy".format(dataName),X_train) 353 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-y.npy".format(dataName),y_train) 354 | 355 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-X.npy".format(dataName),X_val) 356 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-y.npy".format(dataName),y_val) 357 | 358 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-X.npy".format(dataName),X_test) 359 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-y.npy".format(dataName),y_test) 360 | 361 | # dataName = "IMDBC" 362 | dataName = "IMDBLargeC5" 363 | # dataName = "IMDBC5" 364 | 365 | df = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName)) 366 | 367 | print(df.columns) 368 | print(df.shape) 369 | 370 | print(np.unique(df.rating)) 371 | 372 | midLE = preprocessing.LabelEncoder() 373 | midLE.fit(df.movie_id) 374 | df['movie_id'] = midLE.transform(df.movie_id) 375 | 376 | pidLE = preprocessing.LabelEncoder() 377 | pidLE.fit(df.person_id) 378 | df['person_id'] = pidLE.transform(df.person_id) 379 | 380 | cidLE = preprocessing.LabelEncoder() 381 | cidLE.fit(df.company_id) 382 | df['company_id'] = cidLE.transform(df.company_id) 383 | 384 | PROP = 1 385 | trainData = df.values 386 | print(trainData.shape) 387 | print(trainData[:5,:]) 388 | np.save('/home/jiayi/disk/C-craig/dataset/{}-joined-prop-{}.npy'.format(dataName, PROP), np.ascontiguousarray(trainData.astype(np.float64))) 389 | 390 | print(midLE.classes_.shape) 391 | print(pidLE.classes_.shape) 392 | print(cidLE.classes_.shape) 393 | num = midLE.classes_.shape[0] 394 | num = num * pidLE.classes_.shape[0] 395 | num = num * cidLE.classes_.shape[0] 396 | print(num) 397 | assert num< 4* (10**18) 398 | print("\n【 Passed 】") 399 | 400 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 401 | print(uni.shape) 
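# Note: the "assert num < 4 * (10**18)" check above appears to be a sanity guard that a
# single signed 64-bit key has enough range to give every
# (movie_id, person_id, company_id) combination a distinct id
# (int64 tops out near 9.22 * 10**18). The loop further below packs the three
# label-encoded ids into such a key as (m+1) + (p+1)*10**5 + (c+1)*10**11.
# A hypothetical decoder for that layout (illustrative only, never called in
# this script), assuming m+1 < 10**5 and p+1 < 10**6 so the decimal fields
# cannot collide -- bounds that are not verified here:
def decode_row_key(key):
    c = key // 10**11 - 1
    p = (key % 10**11) // 10**5 - 1
    m = key % 10**5 - 1
    return m, p, c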
402 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 403 | print(uni.shape) 404 | 405 | print(df.shape) 406 | 407 | uni = df[['movie_id', 'person_id', 'company_id']].copy() 408 | # uni = np.unique(df[['movie_id', 'person_id', 'company_id']], axis=0) 409 | print(uni.shape) 410 | print(uni) 411 | rowNumMap = np.zeros((uni.shape[0],2), np.int64) 412 | i = 0 413 | # for row in uni.values: 414 | # print(row) 415 | for row in uni.values: 416 | # if row.sha 417 | # print(row) 418 | # print(row.shape) 419 | x,y,z = row 420 | x = np.int64(x) 421 | y = np.int64(y) 422 | z = np.int64(z) 423 | 424 | rowNumMap[i,0] = (x+1) + (y+1)*(10**5) + (z+1) * (10**11) 425 | assert 0 <= rowNumMap[i,0] and rowNumMap[i,0] < (4*(10**18)) 426 | # print(rowNumMap[i,0]) 427 | rowNumMap[i,1] = i 428 | i = i + 1 429 | 430 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 431 | print(mycsDIR) 432 | np.save(mycsDIR + 'idMap.npy', np.ascontiguousarray(rowNumMap)) 433 | 434 | print(rowNumMap) 435 | 436 | CATE = 10 437 | 438 | Databackup = df.copy() 439 | 440 | for cate in range(CATE): 441 | print("#"*10 ,' '*5, '【cate】 ', cate, ' '*10, '#'*10) 442 | trainData = Databackup[Databackup['rating'] == cate] 443 | 444 | 445 | mixColumns = ['movie_id', 'votes', 'rating'] 446 | mixNotUniqued = trainData[mixColumns].copy() 447 | mixUniqued = mixNotUniqued.drop_duplicates(mixNotUniqued.columns).copy() 448 | mixUniqued.sort_values(['movie_id'], inplace=True) 449 | print('【Movie_info_idx】') 450 | print(mixUniqued.shape) 451 | print(len(np.unique(mixUniqued.movie_id))) 452 | 453 | 454 | miColumns = ['movie_id', 'color', 's0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 455 | 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 456 | 's18', 's19', 's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27', 457 | 's28', 's29'] 458 | miNotUniqued = trainData[miColumns].copy() 459 | miUniqued = miNotUniqued.drop_duplicates(miNotUniqued.columns).copy() 460 | miUniqued.sort_values(['movie_id'], inplace=True) 461 | print('【Movie_info】') 462 | print(miUniqued.shape) 463 | print(len(np.unique(miUniqued.movie_id))) 464 | 465 | 466 | 467 | ciColumns = ['person_id', 'movie_id'] 468 | ciNotUniqued = trainData[ciColumns].copy() 469 | ciUniqued = ciNotUniqued.drop_duplicates(ciNotUniqued.columns).copy() 470 | print('【Cast_info】') 471 | print(ciUniqued.shape) 472 | print('in cast_info movie_id unique ', len(np.unique(ciUniqued.movie_id))) 473 | print('in cast_info person_id unique ', len(np.unique(ciUniqued.person_id))) 474 | 475 | 476 | nameColumns = ['person_id', 'gender'] 477 | nameNotUniqued = trainData[nameColumns].copy() 478 | nameUniqued = nameNotUniqued.drop_duplicates(nameNotUniqued.columns).copy() 479 | nameUniqued.sort_values(['person_id'], inplace=True) 480 | print('【Name】') 481 | print(nameUniqued.shape) 482 | print(len(np.unique(nameUniqued.person_id))) 483 | 484 | 485 | titleColumns = ['movie_id', 'production_year', 'k1', 'k2', 'k3', 'k4', 'k6', 'k7'] 486 | titleNotUniqued = trainData[titleColumns].copy() 487 | titleUniqued = titleNotUniqued.drop_duplicates(titleNotUniqued.columns).copy() 488 | titleUniqued.sort_values(['movie_id'], inplace=True) 489 | print('【Title】') 490 | print(titleUniqued.shape) 491 | print(len(np.unique(titleUniqued.movie_id))) 492 | 493 | 494 | mcColumns = ['movie_id', 'company_id', 'country_code'] 495 | mcNotUniqued = trainData[mcColumns].copy() 496 | mcUniqued = mcNotUniqued.drop_duplicates(mcNotUniqued.columns).copy() 497 | mcUniqued.sort_values(['movie_id'], 
inplace=True) 498 | print('【Movie Company】') 499 | print(mcUniqued.shape) 500 | print(len(np.unique(mcUniqued.movie_id))) 501 | 502 | 503 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 504 | np.save(mycsDIR + 'train-cate-{}-mix.npy'.format(cate), np.ascontiguousarray(mixUniqued.values.astype(np.float64))) 505 | np.save(mycsDIR + 'train-cate-{}-mi.npy'.format(cate), np.ascontiguousarray(miUniqued.values.astype(np.float64))) 506 | np.save(mycsDIR + 'train-cate-{}-ci.npy'.format(cate), np.ascontiguousarray(ciUniqued.values.astype(np.float64))) 507 | np.save(mycsDIR + 'train-cate-{}-name.npy'.format(cate), np.ascontiguousarray(nameUniqued.values.astype(np.float64))) 508 | np.save(mycsDIR + 'train-cate-{}-title.npy'.format(cate), np.ascontiguousarray(titleUniqued.values.astype(np.float64))) 509 | np.save(mycsDIR + 'train-cate-{}-mc.npy'.format(cate), np.ascontiguousarray(mcUniqued.values.astype(np.float64))) 510 | 511 | 512 | mixUniqued.to_csv(mycsDIR + 'train-cate-{}-mix.csv'.format(cate),index=False) 513 | miUniqued.to_csv(mycsDIR + 'train-cate-{}-mi.csv'.format(cate), index=False) 514 | ciUniqued.to_csv(mycsDIR + 'train-cate-{}-ci.csv'.format(cate), index=False) 515 | nameUniqued.to_csv(mycsDIR + 'train-cate-{}-name.csv'.format(cate), index=False) 516 | titleUniqued.to_csv(mycsDIR + 'train-cate-{}-title.csv'.format(cate), index=False) 517 | mcUniqued.to_csv(mycsDIR + 'train-cate-{}-mc.csv'.format(cate), index=False) 518 | 519 | 520 | 521 | 522 | -------------------------------------------------------------------------------- /preprocess/IMDBC-Linear.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn import metrics 8 | import os 9 | from sklearn import preprocessing 10 | 11 | X_train = [] 12 | X_test = [] 13 | y_train = [] 14 | y_test = [] 15 | 16 | from scipy import sparse 17 | def transMultihot(df, rowName, colName, IDName, onehotName='s'): 18 | tmp = df[colName].factorize() 19 | df.drop(colName,axis=1, inplace=True) 20 | df.insert(df.shape[1],colName,tmp[0]) 21 | 22 | values = np.ones(df.shape[0]) 23 | rows = df[rowName].values 24 | cols = df[colName].values 25 | 26 | sparse_matrix = sparse.coo_matrix((values, (rows,cols))) 27 | ar = sparse_matrix.toarray() 28 | sm = ar.sum(axis=1) 29 | 30 | idxs = sm>0 31 | IDs = np.arange(ar.shape[0]) 32 | 33 | IDs = IDs[idxs] 34 | ARs = ar[idxs] 35 | 36 | col_name_list = ['{}{}'.format(onehotName, i) for i in range(ARs.shape[1])] 37 | col_name_list = [IDName] + col_name_list 38 | 39 | assert IDs.shape[0] == ARs.shape[0] 40 | IDs = IDs.reshape(-1,1) 41 | 42 | z = np.concatenate((IDs,ARs),axis=1) 43 | 44 | 45 | ret = pd.DataFrame(z, columns=col_name_list) 46 | ret[IDName] = ret[IDName].astype(np.int64) 47 | return ret 48 | 49 | get_ipython().run_cell_magic('time', '', "\nDIR = '/home/jiayi/disk/neurocard/datasets/job/'\n\nfile = 'title.csv'\ntitle = pd.read_csv(DIR+file)\n\nfile = 'info_type.csv'\nit = pd.read_csv(DIR+file)\n\nfile = 'movie_info.csv'\nmi = pd.read_csv(DIR+file)\n\nfile = 'movie_info_idx.csv'\nmix = pd.read_csv(DIR+file)\n\nfile = 'name.csv'\nname = pd.read_csv(DIR+file)\n\nfile = 'cast_info.csv'\nci = pd.read_csv(DIR+file)\n\nfile = 'movie_companies.csv'\nmc = pd.read_csv(DIR+file)\n\nfile = 'company_name.csv'\ncn = pd.read_csv(DIR+file)") 50 | 51 | def changeToFloor(arrLike, col): 52 | 53 | colValue = 
arrLike[col] 54 | colValue = np.around(arrLike[col],1) 55 | 56 | return colValue 57 | 58 | def LoadIMDBC(Large=0,dataName="", saveCSV=False, useFor='test'): 59 | global X_train, X_test, X_val, y_val, y_train, y_test 60 | 61 | z = mix.copy() 62 | 63 | votes = z[z['info_type_id']==100].copy() 64 | rating = z[z['info_type_id']==101].copy() 65 | 66 | votes['info'] = votes['info'].astype(int) 67 | useVotes = votes[votes['info']>100].copy() 68 | 69 | useVotes.rename(columns={'info':'votes'},inplace=True) 70 | useVotes = useVotes[['movie_id', 'votes']] 71 | 72 | MAX = useVotes.votes.max() 73 | MIN = useVotes.votes.min() 74 | useVotes.votes = (useVotes.votes - MIN)/(MAX - MIN) 75 | 76 | rating['info'] = rating['info'].astype(np.double) 77 | useRating = rating.copy() 78 | 79 | useRating.rename(columns={'info':'rating'},inplace=True) 80 | 81 | useRating = useRating[['movie_id', 'rating']] 82 | useRating['rating'] = useRating['rating'].astype(np.double) 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | useRating['rating'] = useRating.apply(changeToFloor, axis=1, args=['rating']) 91 | 92 | if useFor == 'train': 93 | midLE = preprocessing.LabelEncoder() 94 | midLE.fit(useRating.rating) 95 | useRating['rating'] = midLE.transform(useRating.rating) 96 | 97 | useRating['rating']=useRating['rating'].astype(int) 98 | useRating.rating -=1 99 | 100 | useMIX = pd.merge(useVotes, useRating) 101 | print(useMIX.shape) 102 | print(useMIX.columns) 103 | 104 | useMI = mi.copy() 105 | color = useMI[useMI['info_type_id']==2].copy() 106 | genres = useMI[useMI['info_type_id']==3].copy() 107 | 108 | color.rename(columns={'info':'color'},inplace=True) 109 | 110 | color = color[['movie_id', 'color']] 111 | 112 | BWIndex = color[color['color']=='Black and White'].index 113 | ColorIndex = color[color['color']=='Color'].index 114 | color.loc[BWIndex,'color'] = 0 115 | color.loc[ColorIndex,'color'] = 1 116 | 117 | genres.rename(columns={'info':'genres'},inplace=True) 118 | genres = genres[['movie_id', 'genres']] 119 | genres.drop_duplicates(inplace=True) 120 | 121 | genres = transMultihot(genres, 'movie_id', 'genres', IDName='movie_id', onehotName='s') 122 | 123 | useMI = pd.merge(color, genres) 124 | 125 | print(useMI.shape) 126 | print(useMI.columns) 127 | 128 | if Large==0: 129 | useCI = ci[ci['role_id']==4].copy() 130 | else: 131 | useCI= ci.copy() 132 | 133 | useCI = useCI[['person_id', 'movie_id']] 134 | print(useCI.shape) 135 | print(useCI.columns) 136 | 137 | 138 | useNAME = name.copy() 139 | 140 | mIndex = useNAME[useNAME['gender']=='m'].index 141 | fIndex = useNAME[useNAME['gender']=='f'].index 142 | 143 | useNAME.loc[mIndex,'gender'] = 1 144 | useNAME.loc[fIndex,'gender'] = 0 145 | 146 | genderNA = ~useNAME['gender'].isna() 147 | # purchaseNA = ~tmp['order_purchase_timestamp'].isna() 148 | 149 | useNAME = useNAME[genderNA] 150 | 151 | useNAME.rename(columns={'id':'person_id'},inplace=True) 152 | useNAME = useNAME[['person_id', 'gender']] 153 | print(useNAME.shape) 154 | print(useNAME.columns) 155 | 156 | useTITLE = title.copy() 157 | useTITLE.rename(columns={'id':'movie_id'},inplace=True) 158 | yearNA = ~useTITLE.production_year.isna() 159 | kindNA = ~useTITLE.kind_id.isna() 160 | yearNA = yearNA & kindNA 161 | useTITLE = useTITLE[yearNA] 162 | 163 | useTITLE = useTITLE[['movie_id', 'production_year','kind_id']].copy() 164 | MIN = useTITLE.production_year.min() 165 | MAX = useTITLE.production_year.max() 166 | 167 | useTITLE['production_year'] = (useTITLE['production_year'] - MIN)/(MAX - MIN) 168 | useTITLE = 
useTITLE.join(pd.get_dummies(useTITLE.kind_id)) 169 | useTITLE.rename(columns={1:'k1',2:'k2',3:'k3',4:'k4',6:'k6',7:'k7'},inplace=True) 170 | 171 | useTITLE.drop(['kind_id'],axis=1, inplace=True) 172 | print(useTITLE.shape) 173 | print(useTITLE.columns) 174 | 175 | 176 | 177 | useMC = mc.copy() 178 | useCN = cn.copy() 179 | useCN.rename(columns={'id':'company_id'},inplace=True) 180 | le = preprocessing.LabelEncoder() 181 | le.fit(useCN.country_code) 182 | useCN['country_code'] = le.transform(useCN.country_code) 183 | 184 | joinedMC = pd.merge(useMC, useCN) 185 | 186 | tMC = joinedMC[['company_id', 'country_code','movie_id']].copy() 187 | 188 | 189 | MAX = tMC.country_code.max() 190 | tMC['country_code'] = (tMC['country_code']/MAX) 191 | 192 | 193 | useTITLE.drop_duplicates(useTITLE.columns,inplace=True) 194 | useMIX.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 195 | useMIX.drop_duplicates(useMIX.columns,inplace=True) 196 | useCI.drop_duplicates(useCI.columns,inplace=True) 197 | useNAME.drop_duplicates(useNAME.columns,inplace=True) 198 | useMI.drop_duplicates(subset=['movie_id'], keep='first', inplace=True) 199 | useMI.drop_duplicates(useMI.columns,inplace=True) 200 | 201 | 202 | 203 | 204 | useTITLE.drop_duplicates(inplace=True) 205 | useCI.drop_duplicates(inplace=True) 206 | useNAME.drop_duplicates(inplace=True) 207 | useMI.drop_duplicates(inplace=True) 208 | useMIX.drop_duplicates(inplace=True) 209 | tMC.drop_duplicates(inplace=True) 210 | 211 | 212 | 213 | 214 | 215 | z = pd.merge(useTITLE, useMIX) 216 | print(z.shape) 217 | z = pd.merge(z, useCI) 218 | print(z.shape) 219 | z = pd.merge(z, useNAME) 220 | print(z.shape) 221 | z = pd.merge(z, useMI) 222 | print(z.shape) 223 | 224 | 225 | print(z.columns) 226 | print(z.shape) 227 | 228 | z = pd.merge(z, tMC) 229 | print(z.columns) 230 | print(z.shape) 231 | 232 | 233 | from sklearn.utils import shuffle 234 | z = shuffle(z, random_state=123) 235 | 236 | 237 | movieIDs= title.id.unique() 238 | movieIDs = shuffle(movieIDs, random_state=123) 239 | 240 | 241 | trainSize = int(0.5 * movieIDs.shape[0]) 242 | valSize = trainSize + int(0.25 * movieIDs.shape[0]) 243 | 244 | 245 | trainMovies = movieIDs[:trainSize] 246 | trainTMP = pd.DataFrame(trainMovies.reshape(-1,1), columns=["movie_id"]) 247 | trainData = pd.merge(z, trainTMP) 248 | 249 | valMovies = movieIDs[trainSize:valSize] 250 | valTMP = pd.DataFrame(valMovies.reshape(-1,1), columns=["movie_id"]) 251 | valData = pd.merge(z, valTMP) 252 | 253 | testMovies = movieIDs[valSize:] 254 | testTMP = pd.DataFrame(testMovies.reshape(-1,1), columns=["movie_id"]) 255 | testData = pd.merge(z, testTMP) 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | y_train = trainData.rating.values 264 | y_val = valData.rating.values 265 | y_test = testData.rating.values 266 | 267 | 268 | 269 | if saveCSV: 270 | trainData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName), index=False) 271 | valData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-val.csv'.format(dataName), index=False) 272 | testData.to_csv('/home/jiayi/disk/C-craig/dataset/{}-test.csv'.format(dataName), index=False) 273 | 274 | 275 | trainData.drop(['rating'], axis=1, inplace=True) 276 | valData.drop(['rating'], axis=1, inplace=True) 277 | testData.drop(['rating'], axis=1, inplace=True) 278 | 279 | trainData.drop(['person_id','movie_id', 'company_id'], axis=1, inplace=True) 280 | valData.drop(['person_id', 'movie_id', 'company_id'], axis=1, inplace=True) 281 | testData.drop(['person_id', 'movie_id', 'company_id'], 
axis=1, inplace=True) 282 | 283 | 284 | print("Train Data shape ", trainData.shape) 285 | print("Test Data shape ", testData.shape) 286 | print("Val Data shape ", valData.shape) 287 | 288 | print(trainData.columns) 289 | X_train = np.ascontiguousarray(trainData.values.astype(np.float64)) 290 | X_val = np.ascontiguousarray(valData.values.astype(np.float64)) 291 | X_test = np.ascontiguousarray(testData.values.astype(np.float64)) 292 | 293 | print(X_train.shape) 294 | print(y_train.shape) 295 | 296 | return z 297 | 298 | dataNameList = ["IMDBCLinear","IMDBCLinearC++" ] 299 | parameterList = [0,0] 300 | 301 | # dataNameList = ["IMDBLargeCLinear","IMDBLargeCLinearC++" ] 302 | # parameterList = [1,1] 303 | 304 | useForList = ["test", "train"] 305 | 306 | for dataName, param,useFor in zip(dataNameList, parameterList, useForList): 307 | LoadIMDBC(param, dataName, saveCSV=True,useFor=useFor) 308 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-X.npy".format(dataName),X_train) 309 | np.save("/home/jiayi/disk/C-craig/dataset/{}-train-y.npy".format(dataName),y_train) 310 | 311 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-X.npy".format(dataName),X_val) 312 | np.save("/home/jiayi/disk/C-craig/dataset/{}-val-y.npy".format(dataName),y_val) 313 | 314 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-X.npy".format(dataName),X_test) 315 | np.save("/home/jiayi/disk/C-craig/dataset/{}-test-y.npy".format(dataName),y_test) 316 | 317 | # dataName = "IMDBCLinearC++" 318 | # dataName = "IMDBCLinear" 319 | dataName = "IMDBLargeCLinearC++" 320 | 321 | df = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName)) 322 | 323 | # dataName = "IMDBCLinearC++" 324 | dataName2 = "IMDBLargeCLinear" 325 | 326 | df2 = pd.read_csv('/home/jiayi/disk/C-craig/dataset/{}-train.csv'.format(dataName2)) 327 | 328 | print(df.iloc[:3]) 329 | print(df2.iloc[:3]) 330 | 331 | print(df.columns) 332 | print(df.shape) 333 | 334 | print(np.unique(df.rating)) 335 | 336 | midLE = preprocessing.LabelEncoder() 337 | midLE.fit(df.movie_id) 338 | df['movie_id'] = midLE.transform(df.movie_id) 339 | 340 | pidLE = preprocessing.LabelEncoder() 341 | pidLE.fit(df.person_id) 342 | df['person_id'] = pidLE.transform(df.person_id) 343 | 344 | cidLE = preprocessing.LabelEncoder() 345 | cidLE.fit(df.company_id) 346 | df['company_id'] = cidLE.transform(df.company_id) 347 | 348 | PROP = 1 349 | trainData = df.values 350 | print(trainData.shape) 351 | print(trainData[:5,:]) 352 | np.save('/home/jiayi/disk/C-craig/dataset/{}-joined-prop-{}.npy'.format(dataName, PROP), np.ascontiguousarray(trainData.astype(np.float64))) 353 | 354 | print(midLE.classes_.shape) 355 | print(pidLE.classes_.shape) 356 | print(cidLE.classes_.shape) 357 | num = midLE.classes_.shape[0] 358 | num = num * pidLE.classes_.shape[0] 359 | num = num * cidLE.classes_.shape[0] 360 | print(num) 361 | assert num< 4* (10**18) 362 | print("\n【 Passed 】") 363 | 364 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 365 | print(uni.shape) 366 | uni = np.unique(df[['movie_id', 'person_id']], axis=0) 367 | print(uni.shape) 368 | 369 | print(df.shape) 370 | 371 | uni = df[['movie_id', 'person_id', 'company_id']].copy() 372 | # uni = np.unique(df[['movie_id', 'person_id', 'company_id']], axis=0) 373 | print(uni.shape) 374 | print(uni) 375 | rowNumMap = np.zeros((uni.shape[0],2), np.int64) 376 | i = 0 377 | # for row in uni.values: 378 | # print(row) 379 | for row in uni.values: 380 | # if row.sha 381 | # print(row) 382 | # print(row.shape) 383 | x,y,z = row 384 | x = 
np.int64(x) 385 | y = np.int64(y) 386 | z = np.int64(z) 387 | 388 | rowNumMap[i,0] = (x+1) + (y+1)*(10**5) + (z+1) * (10**11) 389 | assert 0 <= rowNumMap[i,0] and rowNumMap[i,0] < (4*(10**18)) 390 | # print(rowNumMap[i,0]) 391 | rowNumMap[i,1] = i 392 | i = i + 1 393 | 394 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 395 | np.save(mycsDIR + 'idMap.npy', np.ascontiguousarray(rowNumMap)) 396 | 397 | print(rowNumMap) 398 | 399 | CATE = len(np.unique(df.rating)) 400 | print("Cate num is ",CATE) 401 | 402 | Databackup = df.copy() 403 | 404 | for cate in range(CATE + 1): 405 | print("#"*10 ,' '*5, '【cate】 ', cate, ' '*10, '#'*10) 406 | trainData = Databackup[Databackup['rating'] == cate] 407 | 408 | 409 | mixColumns = ['movie_id', 'votes', 'rating'] 410 | mixNotUniqued = trainData[mixColumns].copy() 411 | mixUniqued = mixNotUniqued.drop_duplicates(mixNotUniqued.columns).copy() 412 | mixUniqued.sort_values(['movie_id'], inplace=True) 413 | print('【Movie_info_idx】') 414 | print(mixUniqued.shape) 415 | print(len(np.unique(mixUniqued.movie_id))) 416 | 417 | 418 | 419 | miColumns = ['movie_id', 'color', 's0', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 420 | 's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 421 | 's18', 's19', 's20', 's21', 's22', 's23', 's24', 's25', 's26', 's27', 422 | 's28', 's29'] 423 | miNotUniqued = trainData[miColumns].copy() 424 | miUniqued = miNotUniqued.drop_duplicates(miNotUniqued.columns).copy() 425 | miUniqued.sort_values(['movie_id'], inplace=True) 426 | print('【Movie_info】') 427 | print(miUniqued.shape) 428 | print(len(np.unique(miUniqued.movie_id))) 429 | 430 | 431 | 432 | ciColumns = ['person_id', 'movie_id'] 433 | ciNotUniqued = trainData[ciColumns].copy() 434 | ciUniqued = ciNotUniqued.drop_duplicates(ciNotUniqued.columns).copy() 435 | print('【Cast_info】') 436 | print(ciUniqued.shape) 437 | print('in cast_info movie_id unique ', len(np.unique(ciUniqued.movie_id))) 438 | print('in cast_info person_id unique ', len(np.unique(ciUniqued.person_id))) 439 | 440 | 441 | 442 | nameColumns = ['person_id', 'gender'] 443 | nameNotUniqued = trainData[nameColumns].copy() 444 | nameUniqued = nameNotUniqued.drop_duplicates(nameNotUniqued.columns).copy() 445 | nameUniqued.sort_values(['person_id'], inplace=True) 446 | print('【Name】') 447 | print(nameUniqued.shape) 448 | print(len(np.unique(nameUniqued.person_id))) 449 | 450 | titleColumns = ['movie_id', 'production_year', 'k1', 'k2', 'k3', 'k4', 'k6', 'k7'] 451 | titleNotUniqued = trainData[titleColumns].copy() 452 | titleUniqued = titleNotUniqued.drop_duplicates(titleNotUniqued.columns).copy() 453 | titleUniqued.sort_values(['movie_id'], inplace=True) 454 | print('【Title】') 455 | print(titleUniqued.shape) 456 | print(len(np.unique(titleUniqued.movie_id))) 457 | 458 | 459 | 460 | mcColumns = ['movie_id', 'company_id', 'country_code'] 461 | mcNotUniqued = trainData[mcColumns].copy() 462 | mcUniqued = mcNotUniqued.drop_duplicates(mcNotUniqued.columns).copy() 463 | mcUniqued.sort_values(['movie_id'], inplace=True) 464 | print('【Movie Company】') 465 | print(mcUniqued.shape) 466 | print(len(np.unique(mcUniqued.movie_id))) 467 | 468 | 469 | mycsDIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataName) 470 | np.save(mycsDIR + 'train-cate-{}-mix.npy'.format(cate), np.ascontiguousarray(mixUniqued.values.astype(np.float64))) 471 | np.save(mycsDIR + 'train-cate-{}-mi.npy'.format(cate), np.ascontiguousarray(miUniqued.values.astype(np.float64))) 472 | np.save(mycsDIR + 
'train-cate-{}-ci.npy'.format(cate), np.ascontiguousarray(ciUniqued.values.astype(np.float64))) 473 | np.save(mycsDIR + 'train-cate-{}-name.npy'.format(cate), np.ascontiguousarray(nameUniqued.values.astype(np.float64))) 474 | np.save(mycsDIR + 'train-cate-{}-title.npy'.format(cate), np.ascontiguousarray(titleUniqued.values.astype(np.float64))) 475 | np.save(mycsDIR + 'train-cate-{}-mc.npy'.format(cate), np.ascontiguousarray(mcUniqued.values.astype(np.float64))) 476 | 477 | 478 | 479 | mixUniqued.to_csv(mycsDIR + 'train-cate-{}-mix.csv'.format(cate),index=False) 480 | miUniqued.to_csv(mycsDIR + 'train-cate-{}-mi.csv'.format(cate), index=False) 481 | ciUniqued.to_csv(mycsDIR + 'train-cate-{}-ci.csv'.format(cate), index=False) 482 | nameUniqued.to_csv(mycsDIR + 'train-cate-{}-name.csv'.format(cate), index=False) 483 | titleUniqued.to_csv(mycsDIR + 'train-cate-{}-title.csv'.format(cate), index=False) 484 | mcUniqued.to_csv(mycsDIR + 'train-cate-{}-mc.csv'.format(cate), index=False) 485 | 486 | 487 | 488 | 489 | print(np.unique(df.rating)) 490 | 491 | z = np.unique(df.rating) 492 | print(z[85]) 493 | # print(len(np.unique())) 494 | 495 | -------------------------------------------------------------------------------- /preprocess/stack.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import metrics 5 | import os 6 | from sklearn import preprocessing 7 | 8 | DIR = '/home/jiayi/disk/stackData/' 9 | 10 | user = pd.read_csv(DIR + 'user' + '.csv',usecols=['id','site_id', 'reputation', 'upvotes', 'downvotes'] ) 11 | question = pd.read_csv(DIR + 'question' + '.csv', usecols=['id', 'site_id', 'score','view_count', 'favorite_count']) 12 | answer = pd.read_csv(DIR + 'answer' + '.csv', usecols=['id', 'site_id', 'question_id','owner_user_id','score']) 13 | 14 | useUser= user.copy() 15 | useAnswer = answer.copy() 16 | useQuestion = question.copy() 17 | 18 | useUser.rename(columns={'id':'user_id'},inplace=True) 19 | useAnswer.rename(columns={'owner_user_id':'user_id', 'score':'Y'},inplace=True) 20 | useQuestion.rename(columns={'id':'question_id'},inplace=True) 21 | 22 | z = pd.merge(useAnswer, useUser) 23 | z = pd.merge(z, useQuestion) 24 | inUser = z[['user_id', 'site_id', 'reputation', 'upvotes', 'downvotes']].copy() 25 | inAnswer = z[['id', 'site_id', 'question_id', 'Y', 'user_id']].copy() 26 | inQuestion = z[['question_id', 'site_id', 'score', 'view_count', 'favorite_count']].copy() 27 | inUser.drop_duplicates(inplace=True) 28 | inAnswer.drop_duplicates(inplace=True) 29 | inQuestion.drop_duplicates(inplace=True) 30 | 31 | print(inUser.shape) 32 | print(inAnswer.shape) 33 | print(inQuestion.shape) 34 | 35 | print(inUser.site_id.min(), inUser.site_id.max()) 36 | print(inUser.user_id.min(), inUser.user_id.max()) 37 | inUser['Uid'] = 1000 * inUser.user_id + inUser.site_id 38 | inAnswer['Uid'] = 1000 * inAnswer.user_id + inAnswer.site_id 39 | 40 | inQuestion['Qid'] = 1000 * inQuestion.question_id + inQuestion.site_id 41 | inAnswer['Qid'] = 1000 * inAnswer.question_id + inAnswer.site_id 42 | 43 | le = preprocessing.LabelEncoder() 44 | le.fit(inAnswer.Uid) 45 | inAnswer.Uid = le.transform(inAnswer.Uid) 46 | inUser.Uid = le.transform(inUser.Uid) 47 | print(inAnswer.Uid.min(), inAnswer.Uid.max()) 48 | print(inUser.Uid.min(), inUser.Uid.max()) 49 | 50 | le = preprocessing.LabelEncoder() 51 | le.fit(inAnswer.Qid) 52 | inAnswer.Qid = le.transform(inAnswer.Qid) 53 
| inQuestion.Qid = le.transform(inQuestion.Qid) 54 | print(inAnswer.Qid.min(), inAnswer.Qid.max()) 55 | print(inQuestion.Qid.min(), inQuestion.Qid.max()) 56 | 57 | print(inUser.iloc[:3,:]) 58 | tu = inUser[['reputation', 'upvotes' ,'downvotes']].copy() 59 | tu.drop_duplicates(inplace=True) 60 | print(tu.shape) 61 | 62 | tu = inUser[['reputation', 'upvotes' ,'downvotes','site_id']].copy() 63 | tu.drop_duplicates(inplace=True) 64 | print(tu.shape) 65 | 66 | tu['newUid'] = np.arange(tu.shape[0]) 67 | newU = pd.merge(inUser, tu) 68 | print(newU.shape) 69 | print(newU.columns) 70 | print(newU.iloc[:3,:]) 71 | 72 | print(inQuestion.iloc[:3,:]) 73 | tq = inQuestion[['score', 'view_count']].copy() 74 | tq.drop_duplicates(inplace=True) 75 | print(tq.shape) 76 | 77 | tq['newQid'] = np.arange(tq.shape[0]) 78 | newQ = pd.merge(inQuestion, tq) 79 | print(newQ.shape) 80 | print(newQ.columns) 81 | print(newQ.iloc[:3,:]) 82 | 83 | z = pd.merge(inAnswer, newQ) 84 | z = pd.merge(z, newU) 85 | print(z.columns) 86 | print(z.shape) 87 | 88 | doAnswer = z[['id','newUid','newQid','Y']].copy().drop_duplicates() 89 | doQuestion = z[['newQid','score', 'view_count']].copy().drop_duplicates() 90 | doUser = z[['newUid','site_id', 'reputation','upvotes','downvotes']].copy().drop_duplicates() 91 | 92 | print(doAnswer.shape) 93 | print(doQuestion.shape) 94 | print(doUser.shape) 95 | 96 | doJoin = pd.merge(doAnswer, doQuestion) 97 | doJoin = pd.merge(doJoin, doUser) 98 | 99 | STD = doJoin.reputation.std() 100 | 101 | doUserBackup = doUser.copy() 102 | 103 | print(doUser.iloc[:3,:]) 104 | doUser = doUser.join(pd.get_dummies(doUser.site_id, prefix='st')) 105 | doUser.upvotes = (doUser.upvotes - doUser.upvotes.min()) / (doUser.upvotes.max() - doUser.upvotes.min()) 106 | doUser.downvotes = (doUser.downvotes - doUser.downvotes.min()) / (doUser.downvotes.max() - doUser.downvotes.min()) 107 | 108 | # doUser.reputation /= doUser.reputation.std() 109 | # doUser.reputation /= STD 110 | 111 | print(doUser.iloc[:3,:]) 112 | 113 | doAnswerBackup = doAnswer.copy() 114 | doAnswer.Y = (doAnswer.Y - doAnswer.Y.min()) / (doAnswer.Y.max() - doAnswer.Y.min()) 115 | 116 | print(doAnswer.columns) 117 | print(doAnswer.iloc[:3,:]) 118 | 119 | doQuestionBackup = doQuestion.copy() 120 | 121 | print(doQuestion.columns) 122 | print(doQuestion.iloc[:3,:]) 123 | 124 | doQuestion.score = (doQuestion.score - doQuestion.score.min()) / (doQuestion.score.max() - doQuestion.score.min()) 125 | doQuestion.view_count = (doQuestion.view_count - doQuestion.view_count.min()) / (doQuestion.view_count.max() - doQuestion.view_count.min()) 126 | 127 | print(doQuestion.iloc[:3,:]) 128 | 129 | doingUser = doUser.copy() 130 | 131 | rng = np.random.RandomState(123) 132 | from sklearn.utils import shuffle 133 | doingUser = shuffle(doingUser, random_state=rng) 134 | 135 | TrainProp = 0.5 136 | ValProp = 0.25 137 | TrainEnd = int(TrainProp * doingUser.shape[0]) 138 | ValEnd = TrainEnd + int(ValProp * doingUser.shape[0]) 139 | 140 | print(doingUser.columns) 141 | 142 | doingUser.reputation /= doingUser.reputation.std() 143 | 144 | trainUser = doingUser[:TrainEnd].copy() 145 | valUser = doingUser[TrainEnd:ValEnd].copy() 146 | testUser = doingUser[ValEnd:].copy() 147 | 148 | print(doingUser.columns) 149 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 150 | dataset = 'stackn-single' 151 | y_train = trainUser.reputation 152 | y_val = valUser.reputation 153 | y_test = testUser.reputation 154 | 155 | trainUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 156 
| valUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 157 | testUser.drop(['newUid', 'site_id', 'reputation'], axis=1, inplace=True) 158 | 159 | X_train = np.ascontiguousarray(trainUser.values.astype(np.float64)) 160 | X_val = np.ascontiguousarray(valUser.values.astype(np.float64)) 161 | X_test = np.ascontiguousarray(testUser.values.astype(np.float64)) 162 | 163 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 164 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 165 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 166 | 167 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 168 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 169 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 170 | 171 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 172 | dataset = 'stackn' 173 | 174 | doingUser = doUser.copy() 175 | doingUser.reputation/=STD 176 | 177 | rng = np.random.RandomState(123) 178 | from sklearn.utils import shuffle 179 | doingUser = shuffle(doingUser, random_state=rng) 180 | 181 | TrainProp = 0.5 182 | ValProp = 0.25 183 | TrainEnd = int(TrainProp * doingUser.shape[0]) 184 | ValEnd = TrainEnd + int(ValProp * doingUser.shape[0]) 185 | 186 | print(doingUser.columns) 187 | 188 | # User 189 | trainUser = doingUser[:TrainEnd].copy() 190 | valUser = doingUser[TrainEnd:ValEnd].copy() 191 | testUser = doingUser[ValEnd:].copy() 192 | 193 | # join 194 | trainSet = pd.merge(trainUser, doAnswer) 195 | trainSet = pd.merge(trainSet, doQuestion) 196 | 197 | valSet = pd.merge(valUser, doAnswer) 198 | valSet = pd.merge(valSet, doQuestion) 199 | 200 | testSet = pd.merge(testUser, doAnswer) 201 | testSet = pd.merge(testSet, doQuestion) 202 | 203 | y_train = trainSet.reputation 204 | y_val = valSet.reputation 205 | y_test = testSet.reputation 206 | 207 | trainSet.to_csv(DATASET_DIR + "{}-train-X.csv".format(dataset), index=False) 208 | valSet.to_csv(DATASET_DIR + "{}-val-X.csv".format(dataset), index=False) 209 | testSet.to_csv(DATASET_DIR + "{}-test-X.csv".format(dataset), index=False) 210 | 211 | trainSet.drop(['newUid', 'newQid', 'id', 'site_id', 'reputation'], axis=1, inplace=True) 212 | valSet.drop(['newUid', 'newQid', 'id', 'newUid', 'site_id', 'reputation'], axis=1, inplace=True) 213 | testSet.drop(['newUid', 'newQid', 'id', 'newUid', 'site_id', 'reputation'], axis=1, inplace=True) 214 | 215 | print(trainSet.shape) 216 | print(trainSet.columns) 217 | X_train = np.ascontiguousarray(trainSet.values.astype(np.float64)) 218 | X_val = np.ascontiguousarray(valSet.values.astype(np.float64)) 219 | X_test = np.ascontiguousarray(testSet.values.astype(np.float64)) 220 | 221 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 222 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 223 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 224 | 225 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 226 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 227 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 228 | 229 | # doAnswer.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doAnswer.csv', index=False) 230 | # doUser.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doUser.csv', index=False) 231 | # doQuestion.to_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doQuestion.csv', index=False) 232 | 233 | doAnswer = pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doAnswer.csv') 234 | doUser = 
pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doUser.csv') 235 | doQuestion = pd.read_csv('/home/jiayi/disk/C-craig/dataset/stackn-formycs/doQuestion.csv') 236 | 237 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 238 | dataset = 'stackn' 239 | 240 | dfBackup = pd.read_csv(DATASET_DIR + "{}-train-X.csv".format(dataset)) 241 | print(dfBackup.shape) 242 | 243 | df = dfBackup.copy() 244 | print(df.columns) 245 | 246 | df.rename(columns={'reputation':'target'},inplace=True) 247 | doUser.rename(columns={'reputation':'target'},inplace=True) 248 | doUser.drop(['site_id','target'],axis=1,inplace=True) 249 | 250 | df['rowID'] = np.arange(df.shape[0]) 251 | 252 | le = preprocessing.LabelEncoder() 253 | le.fit(df.target) 254 | df.target = le.transform(df.target) 255 | 256 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 257 | dataset = 'stacknC++' 258 | 259 | y_train = df.target 260 | y_train = np.ascontiguousarray(y_train.values.astype(np.int64)) 261 | dfC = df.drop(['rowID','newUid', 'newQid', 'id', 'site_id', 'target'], axis=1) 262 | X_train = np.ascontiguousarray(dfC.values.astype(np.float64)) 263 | 264 | testy = np.load(DATASET_DIR + "{}-train-y.npy".format(dataset)) 265 | print(np.unique(testy)) 266 | print(len(np.unique(testy))) 267 | print(np.min(testy), np.max(testy)) 268 | 269 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 270 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 271 | 272 | print(dataset) 273 | 274 | print(dfC.columns) 275 | 276 | print(np.sort(df.target.unique())) 277 | print(len(df.target.unique())) 278 | print(df.shape) 279 | 280 | print(np.unique(y_train)) 281 | print(len(np.unique(y_train))) 282 | print(np.min(y_train), np.max(y_train)) 283 | 284 | print(df.target.value_counts()) 285 | 286 | uni = np.sort(df.target.unique()) 287 | print(uni.shape) 288 | print(uni) 289 | 290 | df = df [['newUid', 'newQid', 'id', 'rowID','site_id', 'target', 'upvotes', 'downvotes', 'st_0', 'st_1', 'st_2', 'st_3', 'st_4', 'st_5', 'st_6', 'st_7', 'st_8', 'st_9', 'st_10', 'st_11', 'st_12', 'st_13', 'st_14', 'st_15', 'st_16', 'st_17', 'st_18', 'st_19', 'st_20', 'st_21', 'st_22', 'st_23', 'st_24', 'st_25', 'st_26', 'st_27', 'st_28', 'st_29', 'st_30', 'st_31', 'st_32', 'st_33', 'st_34', 'st_35', 'st_36', 'st_37', 'st_38', 'st_39', 'st_40', 'st_41', 'st_42', 'st_43', 'st_44', 'st_45', 'st_46', 'st_47', 'st_48', 'st_49', 'st_50', 'st_51', 'st_52', 'st_53', 'st_54', 'st_55', 'st_56', 'st_57', 'st_58', 'st_59', 'st_60', 'st_61', 'st_62', 'st_63', 'st_64', 'st_65', 'st_66', 'st_67', 'st_68', 'st_69', 'st_70', 'st_71', 'st_72', 'st_73', 'st_74', 'st_75', 'st_76', 'st_77', 'st_78', 'st_79', 'st_80', 'st_81', 'st_82', 'st_83', 'st_84', 'st_85', 'st_86', 'st_87', 'st_88', 'st_89', 'st_90', 'st_91', 'st_92', 'st_93', 'st_94', 'st_95', 'st_96', 'st_97', 'st_98', 'st_99', 'st_100', 'st_101', 'st_102', 'st_103', 'st_104', 'st_105', 'st_106', 'st_107', 'st_108', 'st_109', 'st_110', 'st_111', 'st_112', 'st_113', 'st_114', 'st_115', 'st_116', 'st_117', 'st_118', 'st_119', 'st_120', 'st_121', 'st_122', 'st_123', 'st_124', 'st_125', 'st_126', 'st_127', 'st_128', 'st_129', 'st_130', 'st_131', 'st_132', 'st_133', 'st_134', 'st_135', 'st_136', 'st_137', 'st_138', 'st_139', 'st_140', 'st_141', 'st_142', 'st_143', 'st_144', 'st_145', 'st_146', 'st_147', 'st_148', 'st_149', 'st_150', 'st_151', 'st_152', 'st_153', 'st_154', 'st_155', 'st_156', 'st_157', 'st_158', 'st_159', 'st_160', 'st_161', 'st_162', 'st_163', 'st_164', 'st_165', 'st_166', 'st_167', 'st_168', 
'st_169', 'st_170', 'st_171', 'st_172', 'Y', 'score', 'view_count']].copy() 291 | 292 | dataset = 'stackn' 293 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataset) 294 | 295 | for cate in uni: 296 | print("#"*20, " "*10, cate, " "*10, "#"*20) 297 | tmpDF = df[df['target'] == cate].copy() 298 | 299 | 300 | le = preprocessing.LabelEncoder() 301 | le.fit(tmpDF.newUid) 302 | tmpDF.newUid = le.transform(tmpDF.newUid) 303 | 304 | 305 | le = preprocessing.LabelEncoder() 306 | le.fit(tmpDF.newQid) 307 | tmpDF.newQid = le.transform(tmpDF.newQid) 308 | 309 | le = preprocessing.LabelEncoder() 310 | le.fit(tmpDF.id) 311 | tmpDF.id = le.transform(tmpDF.id) 312 | 313 | 314 | 315 | tmpDF.sort_values("id",inplace=True) 316 | tmpDF.to_csv(DATASET_DIR + "train-{}-joined.csv".format(cate), index=False) 317 | tmp_ = np.ascontiguousarray(tmpDF.values.astype(np.float64)) 318 | np.save(DATASET_DIR + "train-{}-joined.npy".format(cate), tmp_) 319 | 320 | 321 | tmpUser = tmpDF[doUser.columns].copy() 322 | tmpUser.drop_duplicates(inplace=True) 323 | tmpUser.sort_values("newUid",inplace=True) 324 | tmpUser.to_csv(DATASET_DIR + "train-{}-user.csv".format(cate), index=False) 325 | tmpUser = np.ascontiguousarray(tmpUser.values.astype(np.float64)) 326 | np.save(DATASET_DIR + "train-{}-user.npy".format(cate), tmpUser) 327 | 328 | 329 | tmpQuestion = tmpDF[doQuestion.columns].copy() 330 | tmpQuestion.drop_duplicates(inplace=True) 331 | tmpQuestion.sort_values("newQid",inplace=True) 332 | tmpQuestion.to_csv(DATASET_DIR + "train-{}-question.csv".format(cate), index=False) 333 | tmpQuestion = np.ascontiguousarray(tmpQuestion.values.astype(np.float64)) 334 | np.save(DATASET_DIR + "train-{}-question.npy".format(cate), tmpQuestion) 335 | print(tmpQuestion.shape) 336 | 337 | 338 | tmpAnswer = tmpDF[doAnswer.columns].copy() 339 | tmpAnswer.drop_duplicates(inplace=True) 340 | tmpAnswer.sort_values("id",inplace=True) 341 | tmpAnswer.to_csv(DATASET_DIR + "train-{}-answer.csv".format(cate), index=False) 342 | tmpAnswer = np.ascontiguousarray(tmpAnswer.values.astype(np.float64)) 343 | np.save(DATASET_DIR + "train-{}-answer.npy".format(cate), tmpAnswer) 344 | 345 | 346 | -------------------------------------------------------------------------------- /preprocess/taxi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn import metrics 8 | import os 9 | from sklearn import preprocessing 10 | 11 | import datetime 12 | def parseDatetime(s): 13 | # print('s is ',s) 14 | pre, suf = s.split(' ') 15 | 16 | year_s, mon_s, day_s = pre.split('-') 17 | hour_s, minute_s, second_s = suf.split(':') 18 | # retuabsrn datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)) 19 | return datetime.datetime(int(year_s), int(mon_s), int(day_s), int(hour_s), int(minute_s), int(second_s)).date() 20 | 21 | def parseYMD(arrLike, col1): 22 | YMD = parseDatetime(arrLike[col1]) 23 | return str(YMD) 24 | 25 | def timeDelta(arrLike, col1, col2): 26 | purchase = parseDatetime(arrLike[col1]) 27 | approve = parseDatetime(arrLike[col2]) 28 | delta = approve - purchase 29 | return delta.total_seconds() 30 | 31 | X_train = [] 32 | X_test = [] 33 | y_train = [] 34 | y_test = [] 35 | 36 | DIR= "/home/jiayi/disk/gits/craig/datasets/taxi/data/" 37 | 38 | taxi = pd.read_csv(DIR+"taxi.csv") 39 | 40 | def readDF(ID): 
41 | df = pd.read_csv(DIR+"tbl_{}.csv".format(ID)) 42 | return df 43 | 44 | t16 = readDF(16) 45 | t16['f642'] = t16.apply(parseYMD, axis=1, args=['f405']) 46 | 47 | t5 = readDF(5) 48 | t20 = readDF(20) 49 | #t23 = readDF(23) 50 | 51 | #t24 = readDF(24) 52 | t14 = readDF(14) 53 | t11 = readDF(11) 54 | #t6 = readDF(6) 55 | 56 | t5.rename({'f188':'f642'},axis=1,inplace=True) 57 | t20.rename({'f520':'f642'},axis=1,inplace=True) 58 | # t23.rename({'f607':'f642'},axis=1,inplace=True) 59 | 60 | # t24.rename({'f634':'f642'},axis=1,inplace=True) 61 | t14.rename({'f373':'f642'},axis=1,inplace=True) 62 | t11.rename({'f299':'f642'},axis=1,inplace=True) 63 | # t6.rename({'f195':'f642'},axis=1,inplace=True) 64 | 65 | t11 = t11[['f642', 'f294','f298', 'f300','f302', 'f306']].copy() 66 | print(t11.shape) 67 | print(t11.columns) 68 | 69 | """ t5""" 70 | # print(t5.shape) 71 | # print(t5.columns) 72 | # print(t5.f189.min(), t5.f189.max()) 73 | 74 | """ t20""" 75 | # print(t20.shape) 76 | # print(t20.columns) 77 | # print(t20) 78 | # print(t20.f189.min(), t5.f189.max()) 79 | 80 | """ t16""" 81 | t16.drop(['f405'],axis=1,inplace=True) 82 | t16 = t16[['f642','f406','f407','f408']].copy() 83 | # print(t16.shape) 84 | # print(t16.columns) 85 | # print(t16) 86 | # print(t20.f189.min(), t5.f189.max()) 87 | 88 | z = pd.merge(taxi, t11,left_on='f642', right_on='f642') 89 | print(z.shape) 90 | 91 | z = pd.merge(z, t5) 92 | print(z.shape) 93 | 94 | z = pd.merge(z, t20) 95 | print(z.shape) 96 | 97 | z = pd.merge(z, t16) 98 | print(z.shape) 99 | 100 | le = preprocessing.LabelEncoder() 101 | le.fit(z.f642) 102 | z.f642 = le.transform(z.f642) 103 | # aUser.user_id = le.transform(aUser.user_id) 104 | 105 | taxi = z[taxi.columns].copy().drop_duplicates() 106 | print(taxi.shape) 107 | 108 | t11 = z[t11.columns].copy().drop_duplicates() 109 | print(t11.shape) 110 | 111 | t5 = z[t5.columns].copy().drop_duplicates() 112 | print(t5.shape) 113 | 114 | t20 = z[t20.columns].copy().drop_duplicates() 115 | print(t20.shape) 116 | 117 | t16 = z[t16.columns].copy().drop_duplicates() 118 | print(t16.shape) 119 | 120 | taxi.dropna(inplace=True) 121 | taxi = taxi[['f642','f643','target']].copy() 122 | print(taxi.f643.min(), taxi.f643.max()) 123 | print(taxi.target.min(), taxi.target.max()) 124 | 125 | taxi.f643 = (taxi.f643 - (taxi.f643.min())) / (taxi.f643.max()-taxi.f643.min()) 126 | std = taxi.target.std() 127 | print(std) 128 | taxi.target/=std 129 | print(taxi) 130 | 131 | t11.dropna(inplace=True) 132 | print(t11) 133 | 134 | cols = ['f300','f302','f306'] 135 | for col in cols: 136 | True_idx = t11[col].map(lambda x: x==True) 137 | False_idx = t11[col].map(lambda x: x==False) 138 | t11.loc[True_idx, col] = 1 139 | t11.loc[False_idx, col] = 0 140 | 141 | print(t11) 142 | t11 = t11.join(pd.get_dummies(t11.f294)) 143 | t11.drop(['f294'], axis=1,inplace=True) 144 | t11.f298 = (t11.f298 - t11.f298.min())/(t11.f298.max()-t11.f298.min()) 145 | t11["ID11"] = np.arange(t11.shape[0]) 146 | print(t11.shape) 147 | 148 | t5.dropna(inplace=True) 149 | print(t5) 150 | print(t5.f189.min(), t5.f189.max()) 151 | t5.f189 = (t5.f189 - t5.f189.min())/(t5.f189.max()-t5.f189.min()) 152 | 153 | t5["ID5"] = np.arange(t5.shape[0]) 154 | 155 | t20.dropna(inplace=True) 156 | print(t20) 157 | print(t20.f521.min(), t20.f521.max()) 158 | print(t20.f522.min(), t20.f522.max()) 159 | print(t20.f523.min(), t20.f523.max()) 160 | t20.f521 = (t20.f521 - t20.f521.min())/(t20.f521.max()-t20.f521.min()) 161 | t20.f522 = (t20.f522 - 
t20.f522.min())/(t20.f522.max()-t20.f522.min()) 162 | t20.f523 = (t20.f523 - t20.f523.min())/(t20.f523.max()-t20.f523.min()) 163 | 164 | t20["ID20"] = np.arange(t20.shape[0]) 165 | 166 | t16.dropna(inplace=True) 167 | t16.f406 = (t16.f406 - t16.f406.min())/(t16.f406.max()-t16.f406.min()) 168 | t16.f407 = (t16.f407 - t16.f407.min())/(t16.f407.max()-t16.f407.min()) 169 | t16.f408 = (t16.f408 - t16.f408.min())/(t16.f408.max()-t16.f408.min()) 170 | t16["ID16"] = np.arange(t16.shape[0]) 171 | 172 | print(taxi.shape) 173 | 174 | rng = np.random.RandomState(123) 175 | from sklearn.utils import shuffle 176 | taxi = shuffle(taxi, random_state=rng) 177 | 178 | TrainProp = 0.5 179 | ValProp = 0.25 180 | TrainEnd = int(TrainProp * taxi.shape[0]) 181 | ValEnd = TrainEnd + int(ValProp * taxi.shape[0]) 182 | 183 | trainTaxi = taxi[:TrainEnd] 184 | valTaxi = taxi[TrainEnd:ValEnd] 185 | testTaxi = taxi[ValEnd:] 186 | 187 | print(trainTaxi.shape) 188 | print(valTaxi.shape) 189 | print(testTaxi.shape) 190 | 191 | print(trainTaxi.columns) 192 | 193 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 194 | dataset = 'taxi-single' 195 | y_train = trainTaxi.target 196 | y_val = valTaxi.target 197 | y_test = testTaxi.target 198 | 199 | X_train = np.ascontiguousarray(trainTaxi[['f643']].copy().values.astype(np.float64)) 200 | X_val = np.ascontiguousarray(valTaxi[['f643']].copy().values.astype(np.float64)) 201 | X_test = np.ascontiguousarray(testTaxi[['f643']].copy().values.astype(np.float64)) 202 | 203 | np.save( DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 204 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 205 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 206 | 207 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 208 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 209 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 210 | 211 | trainSet = pd.merge(trainTaxi, t11) 212 | trainSet = pd.merge(trainSet, t5) 213 | trainSet = pd.merge(trainSet, t20) 214 | trainSet = pd.merge(trainSet, t16) 215 | print(trainSet.shape) 216 | 217 | valSet = pd.merge(valTaxi, t11) 218 | valSet = pd.merge(valSet, t5) 219 | valSet = pd.merge(valSet, t20) 220 | valSet = pd.merge(valSet, t16) 221 | print(valSet.shape) 222 | 223 | testSet = pd.merge(testTaxi, t11) 224 | testSet = pd.merge(testSet, t5) 225 | testSet = pd.merge(testSet, t20) 226 | testSet = pd.merge(testSet, t16) 227 | print(testSet.shape) 228 | 229 | z = 0.9 230 | print(z **20) 231 | 232 | z = 0.8 233 | print(z ** 20) 234 | 235 | z = 0.7 236 | print(z ** 20) 237 | 238 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 239 | dataset = 'taxi' 240 | 241 | y_train = trainSet.target.copy() 242 | y_val = valSet.target.copy() 243 | y_test = testSet.target.copy() 244 | 245 | trainSet.to_csv(DATASET_DIR + "{}-train.csv".format(dataset), index=False) 246 | valSet.to_csv(DATASET_DIR + "{}-val.csv".format(dataset), index=False) 247 | testSet.to_csv(DATASET_DIR + "{}-test.csv".format(dataset), index=False) 248 | 249 | trainSet.drop(['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 250 | valSet.drop( ['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 251 | testSet.drop(['target','f642',"ID11","ID5","ID20","ID16"],axis=1,inplace=True) 252 | 253 | X_train = np.ascontiguousarray(trainSet.values.astype(np.float64)) 254 | X_val = np.ascontiguousarray(valSet.values.astype(np.float64)) 255 | X_test = np.ascontiguousarray(testSet.values.astype(np.float64)) 256 | 257 | np.save( 
DATASET_DIR + "{}-train-X.npy".format(dataset),X_train) 258 | np.save(DATASET_DIR + "{}-val-X.npy".format(dataset),X_val) 259 | np.save( DATASET_DIR + "{}-test-X.npy".format(dataset),X_test) 260 | 261 | np.save(DATASET_DIR + "{}-train-y.npy".format(dataset),y_train) 262 | np.save(DATASET_DIR + "{}-val-y.npy".format(dataset),y_val) 263 | np.save(DATASET_DIR + "{}-test-y.npy".format(dataset),y_test) 264 | 265 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/' 266 | dataset = 'taxi' 267 | 268 | df = pd.read_csv(DATASET_DIR + "{}-train-X.csv".format(dataset)) 269 | print(df.shape) 270 | 271 | print(df.columns) 272 | 273 | print(df.target.value_counts()) 274 | 275 | df['rowID'] = np.arange(df.shape[0]) 276 | 277 | le = preprocessing.LabelEncoder() 278 | le.fit(df.target) 279 | df.target = le.transform(df.target) 280 | 281 | print(len(df.target.unique())) 282 | cate_list = df.target.unique() 283 | 284 | le = preprocessing.LabelEncoder() 285 | le.fit(df.ID5) 286 | df.ID5 = le.transform(df.ID5) 287 | 288 | le = preprocessing.LabelEncoder() 289 | le.fit(df.ID11) 290 | df.ID11 = le.transform(df.ID11) 291 | 292 | le = preprocessing.LabelEncoder() 293 | le.fit(df.ID16) 294 | df.ID16 = le.transform(df.ID16) 295 | 296 | le = preprocessing.LabelEncoder() 297 | le.fit(df.ID20) 298 | df.ID20 = le.transform(df.ID20) 299 | 300 | le = preprocessing.LabelEncoder() 301 | le.fit(df.f642) 302 | df.f642 = le.transform(df.f642) 303 | 304 | taxi = df[taxi.columns].copy().drop_duplicates() 305 | print(taxi.shape) 306 | 307 | t11 = df[t11.columns].copy().drop_duplicates() 308 | print(t11.shape) 309 | 310 | t5 = df[t5.columns].copy().drop_duplicates() 311 | print(t5.shape) 312 | 313 | t20 = df[t20.columns].copy().drop_duplicates() 314 | print(t20.shape) 315 | 316 | t16 = df[t16.columns].copy().drop_duplicates() 317 | print(t16.shape) 318 | 319 | taxi.sort_values("f642",inplace=True) 320 | t5.sort_values("ID5",inplace=True) 321 | t20.sort_values("ID20",inplace=True) 322 | t16.sort_values("ID16",inplace=True) 323 | t11.sort_values("ID11",inplace=True) 324 | 325 | print(taxi.columns) 326 | t5 = t5[['ID5', 'f642', 'f189']].copy() 327 | t20 = t20[['ID20', 'f642', 'f521', 'f522', 'f523']].copy() 328 | t16 = t16[['ID16', 'f642', 'f406', 'f407', 'f408']].copy() 329 | t11 = t11[['ID11', 'f642', 'f298', 'f300', 'f302', 'f306', 'Booted in Error', 330 | 'Duplicate Case', 'Executed', 'NJS Released', 'Other', 331 | 'Paid in the Field', 'Redeemed', 'Reduced', 'Salvage History', 332 | 'Salvage and Total Loss', 'Salvage/Total Loss/Export', 'Sold', 333 | 'Sold Abandoned', 'Stolen Vehicle', 'Total Loss', 'Towed in Error', 334 | 'Vehicle Not Towed', 'Zero Released']].copy() 335 | print(t5.columns) 336 | print(t20.columns) 337 | print(t16.columns) 338 | print(t11.columns) 339 | 340 | dataset = 'taxi' 341 | DATASET_DIR = '/home/jiayi/disk/C-craig/dataset/{}-formycs/'.format(dataset) 342 | 343 | taxi.to_csv(DATASET_DIR + "train-taxi.csv", index=False) 344 | t5.to_csv(DATASET_DIR + "train-t5.csv", index=False) 345 | t20.to_csv(DATASET_DIR + "train-t20.csv", index=False) 346 | t16.to_csv(DATASET_DIR + "train-t16.csv", index=False) 347 | 348 | taxi_ = np.ascontiguousarray(taxi.values.astype(np.float64)) 349 | t5_ = np.ascontiguousarray(t5.values.astype(np.float64)) 350 | t20_ = np.ascontiguousarray(t20.values.astype(np.float64)) 351 | t16_ = np.ascontiguousarray(t16.values.astype(np.float64)) 352 | 353 | np.save(DATASET_DIR + "train-taxi.npy", taxi_) 354 | np.save(DATASET_DIR + "train-t5.npy", t5_) 355 | np.save(DATASET_DIR + "train-t20.npy", 
t20_) 356 | np.save(DATASET_DIR + "train-t16.npy", t16_) 357 | 358 | uni = np.sort(df.target.unique()) 359 | print(uni) 360 | # print(df.target.unique()) 361 | 362 | print(df.columns) 363 | 364 | df = df[['f642', 'ID5', 'ID11', 'ID16', 'ID20', 'f643', 'target', 'f298', 'f300', 'f302', 'f306', 365 | 'Booted in Error', 'Duplicate Case', 'Executed', 'NJS Released', 366 | 'Other', 'Paid in the Field', 'Redeemed', 'Reduced', 'Salvage History', 367 | 'Salvage and Total Loss', 'Salvage/Total Loss/Export', 'Sold', 368 | 'Sold Abandoned', 'Stolen Vehicle', 'Total Loss', 'Towed in Error', 369 | 'Vehicle Not Towed', 'Zero Released', 'f189', 'ID5', 'f521', 370 | 'f522', 'f523', 'f406', 'f407', 'f408', 'rowID']].copy() 371 | 372 | for cate in uni: 373 | tmpDF = df[df['target'] == cate].copy() 374 | 375 | le = preprocessing.LabelEncoder() 376 | le.fit(tmpDF.ID11) 377 | tmpDF.ID11 = le.transform(tmpDF.ID11) 378 | 379 | # print(tmpDF.shape) 380 | tmpDF.to_csv(DATASET_DIR + "train-{}-joined.csv".format(cate), index=False) 381 | tmp_ = np.ascontiguousarray(tmpDF.values.astype(np.float64)) 382 | np.save(DATASET_DIR + "train-{}-joined.npy".format(cate), tmp_) 383 | 384 | tmpt11 = tmpDF[t11.columns].copy() 385 | tmpt11.drop_duplicates(inplace=True) 386 | tmpt11.to_csv(DATASET_DIR + "train-{}-t11.csv".format(cate), index=False) 387 | tmpt11_ = np.ascontiguousarray(tmpt11.values.astype(np.float64)) 388 | np.save(DATASET_DIR + "train-{}-t11.npy".format(cate), tmpt11_) 389 | print(tmpt11_.shape) 390 | 391 | print(len(t11.f642.unique())) 392 | print(len(t5.f642.unique())) 393 | print(len(t20.f642.unique())) 394 | print(len(t16.f642.unique())) 395 | # print(len(t11.f642.unique())) 396 | 397 | --------------------------------------------------------------------------------
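The preprocessing scripts above share a common final step: slice the joined training table to one label value, project and de-duplicate the columns contributed by each source table, then write matching .csv files and contiguous float64 .npy arrays into a per-dataset "-formycs" directory (presumably read by the C++ code under RECON). The sketch below is a generalized version of that step; the helper name export_per_category and its arguments are illustrative and not part of the repository, and the per-table sort_values calls used by the individual scripts are omitted for brevity.

import os
import numpy as np

def export_per_category(joined, tables, label_col, out_dir):
    # Hypothetical helper (not repository code) capturing the shared pattern:
    # for each label value, slice the joined training frame, de-duplicate the
    # columns each source table contributes, and save .csv plus contiguous
    # float64 .npy copies, mirroring the save calls in the scripts above.
    # tables maps a table name to the list of columns belonging to it.
    for cate in np.sort(joined[label_col].unique()):
        part = joined[joined[label_col] == cate]
        for name, cols in tables.items():
            tbl = part[cols].drop_duplicates().copy()
            tbl.to_csv(os.path.join(out_dir, 'train-cate-{}-{}.csv'.format(cate, name)),
                       index=False)
            np.save(os.path.join(out_dir, 'train-cate-{}-{}.npy'.format(cate, name)),
                    np.ascontiguousarray(tbl.values.astype(np.float64)))

# Example call shaped after the IMDB scripts (frames and paths as defined there):
# export_per_category(Databackup,
#                     {'mix': mixColumns, 'mi': miColumns, 'ci': ciColumns,
#                      'name': nameColumns, 'title': titleColumns, 'mc': mcColumns},
#                     'rating', mycsDIR)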