├── .gitignore ├── cuda_acceleration_demo ├── conv.cpp ├── matmul_tile.cu ├── matmul_tile_full.cu └── sparse_mv.cu ├── ddp_example ├── README.md ├── data.py ├── data │ └── wikitext-2 │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt ├── main.py ├── model.py ├── requirements.txt └── synthetic.py ├── decoding └── decoding.ipynb ├── deepspeed_example ├── DeepSpeed-Example.ipynb ├── README.md ├── deepspeed_chat.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── dschat │ ├── rlhf │ │ ├── ppo_trainer.py │ │ └── rlhf_engine.py │ └── utils │ │ ├── data │ │ ├── data_utils.py │ │ └── raw_datasets.py │ │ ├── ds_utils.py │ │ ├── model │ │ ├── model_utils.py │ │ └── reward_model.py │ │ ├── module │ │ └── lora.py │ │ ├── perf.py │ │ └── utils.py ├── main.py ├── requirements.txt ├── run_llama2_7b.sh └── setup.py ├── imgs ├── broadcast_matrix_matrix.png ├── broadcast_matrix_vector.png ├── broadcast_mv_scalar.png ├── broadcast_rule.png ├── data_storage_operators.png ├── high_level_abstraction.png ├── reduce.jpg └── strides.png ├── mini_tensorflow ├── mini_tensorflow.ipynb └── mini_tensorflow_full.ipynb ├── minitorch_notebook └── minitorch_architecture.ipynb ├── simple_cuda_demo ├── CUDA_Code_Examples.ipynb ├── example_matadd.cu ├── example_matmul.cu ├── example_matmul2.cu ├── example_vector_add.cu ├── example_window_sum.cu ├── test_matmul.py ├── test_vector_add.py └── test_window_sum.py ├── tensor_demo ├── indexing_broadcasting.ipynb └── miniTorch │ ├── .vscode │ └── settings.json │ ├── LICENSE │ ├── minitorch.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ └── top_level.txt │ ├── minitorch │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── autodiff.cpython-38.pyc │ │ ├── autodiff.cpython-39.pyc │ │ ├── cuda_kernel_ops.cpython-39.pyc │ │ ├── cuda_ops.cpython-39.pyc │ │ ├── datasets.cpython-38.pyc │ │ ├── datasets.cpython-39.pyc │ │ ├── fast_conv.cpython-39.pyc │ │ ├── fast_ops.cpython-39.pyc │ │ ├── module.cpython-39.pyc │ │ ├── nn.cpython-39.pyc │ │ ├── operators.cpython-38.pyc │ │ ├── operators.cpython-39.pyc │ │ ├── optim.cpython-39.pyc │ │ ├── scalar.cpython-39.pyc │ │ ├── scalar_functions.cpython-39.pyc │ │ ├── tensor.cpython-38.pyc │ │ ├── tensor.cpython-39.pyc │ │ ├── tensor_data.cpython-38.pyc │ │ ├── tensor_data.cpython-39.pyc │ │ ├── tensor_functions.cpython-38.pyc │ │ ├── tensor_functions.cpython-39.pyc │ │ ├── tensor_ops.cpython-38.pyc │ │ ├── tensor_ops.cpython-39.pyc │ │ └── testing.cpython-39.pyc │ ├── autodiff.py │ ├── datasets.py │ ├── module.py │ ├── operators.py │ ├── optim.py │ ├── tensor.py │ ├── tensor_data.py │ ├── tensor_functions.py │ └── tensor_ops.py │ ├── requirements.extra.txt │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ └── style.sh └── tokenization └── tokenization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /cuda_acceleration_demo/conv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace std; 4 | #define TILE_WIDTH 16 5 | 6 | 7 | void naive_conv(int N, int H, int W, int K, int C_IN, int C_OUT, float *input, float *output, float *kernel) { 8 | int h_out = H - K + 1; 9 | int w_out = W - K + 1; 10 | // kernel: C_OUT * C_IN * K * K 11 | // input: N * C_IN * H * W 12 | // output: N * C_OUT * h_out * w_out 
13 |     for(int n = 0; n < N; n++) { // for each image in the mini-batch
14 |         for(int c_in = 0; c_in < C_IN; c_in++) { // for each input feature map (channel)
15 |             for(int c_out = 0; c_out < C_OUT; c_out++) { // for each output feature map
16 |                 for(int h = 0; h < h_out; h++) {
17 |                     for(int w = 0; w < w_out; w++) {
18 |                         for(int i = 0; i < K; i++) {
19 |                             for(int j = 0; j < K; j++) {
20 |                                 output[((n * C_OUT + c_out) * h_out + h) * w_out + w] += input[((n * C_IN + c_in) * H + h + i) * W + w + j] * kernel[((c_out * C_IN + c_in) * K + i) * K + j];
21 |                             }
22 |                         }
23 |                     }
24 |                 }
25 |             }
26 |         }
27 |     }
28 | }
29 | 
30 | void unroll_conv(int N, int H, int W, int K, int C_IN, int C_OUT, float *input, float *output, float *kernel) {
31 |     int h_out = H - K + 1;
32 |     int w_out = W - K + 1;
33 |     int W_unroll = C_IN * K * K;
34 |     int H_unroll = h_out * w_out;
35 |     float* input_unroll = new float[W_unroll * H_unroll];
36 |     for(int i = 0; i < N; i++) {
37 |         unroll(input + i * C_IN * H * W, input_unroll, C_IN, H, W, K, h_out, w_out);
38 |         gemm(input_unroll, kernel, output + i * C_OUT * h_out * w_out, W_unroll, C_OUT, H_unroll);
39 |     }
40 | }
41 | 
42 | void im2col(float* input, int C_IN, int H, int W, int K, float* output) {
43 |     int h_out = H - K + 1;
44 |     int w_out = W - K + 1;
45 |     int h_unroll = C_IN * K * K;
46 |     int w_unroll = h_out * w_out;
47 | 
48 |     for (int c = 0; c < C_IN; ++c) {
49 |         for(int h = 0; h < h_out; h++) {
50 |             for(int w = 0; w < w_out; w++) {
51 |                 for(int i = 0; i < K; i++) {
52 |                     for(int j = 0; j < K; j++) {
53 |                         // row (c * K * K + i * K + j), column (h * w_out + w) of the unrolled matrix
54 |                         output[(c * K * K + i * K + j) * w_unroll + h * w_out + w] = input[c * H * W + (h + i) * W + w + j];
55 |                     }
56 |                 }
57 |             }
58 |         }
59 |     }
60 | }
61 | 
62 | void unroll(float *input, float *input_unroll, int C_IN, int H, int W, int K, int h_out, int w_out) {
63 |     for(int c_in = 0; c_in < C_IN; c_in++) {
64 |         int w_base = c_in * K * K;
65 |         for(int h = 0; h < h_out; h++) {
66 |             for(int w = 0; w < w_out; w++) {
67 |                 for(int i = 0; i < K; i++) {
68 |                     for(int j = 0; j < K; j++) {
69 |                         input_unroll[(w_base + i * K + j) * h_out * w_out + h * w_out + w] = input[c_in * H * W + (h + i) * W + w + j];
70 |                     }
71 |                 }
72 |             }
73 |         }
74 |     }
75 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/matmul_tile.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void matrix_multiply(float **a, float **b, float **c, int N) {
8 |     for (int i = 0; i < N; i++) {
9 |         for (int j = 0; j < N; j++) {
10 |             c[i][j] = 0;
11 |             for (int k = 0; k < N; k++) {
12 |                 c[i][j] += a[i][k] * b[k][j];
13 |             }
14 |         }
15 |     }
16 | }
17 | 
18 | /**
19 |  * @brief compute C=A*B using tile size TILE_WIDTH
20 |  *
21 |  * @param d_A matrix A
22 |  * @param d_B matrix B
23 |  * @param d_C result matrix C
24 |  * @param N size of matrix (number of rows and columns)
25 |  *
26 |  * hint: define two matrices of size TILE_WIDTH x TILE_WIDTH in shared memory;
27 |  * slide the tiles along the matrices and accumulate the partial sums of products.
28 |  */
29 | __global__ void MatMulTiledKernel(float* d_A, float* d_B, float* d_C, int N) {
30 |     // define two matrices in shared memory
31 | 
32 | 
33 |     // define the row and column in the result matrix of current thread
34 | 
35 | 
36 |     // iterate over tiles along row and column in d_A and d_B
37 | 
38 | 
39 | 
40 |     // store result
41 | 
42 | 
43 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/matmul_tile_full.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void matrix_multiply(float **a, float **b, float **c, int N) {
8 |     for (int i = 0; i < N; i++) {
9 |         for (int j = 0; j < N; j++) {
10 |             c[i][j] = 0;
11 |             for (int k = 0; k < N; k++) {
12 |                 c[i][j] += a[i][k] * b[k][j];
13 |             }
14 |         }
15 |     }
16 | }
17 | 
18 | /**
19 |  * @brief compute C=A*B using tile size TILE_WIDTH
20 |  *
21 |  * @param d_A matrix A
22 |  * @param d_B matrix B
23 |  * @param d_C result matrix C
24 |  * @param N size of matrix (number of rows and columns); assumed to be a multiple of TILE_WIDTH
25 |  */
26 | __global__ void MatMulTiledKernel(float* d_A, float* d_B, float* d_C, int N) {
27 |     __shared__ float As[TILE_WIDTH][TILE_WIDTH];
28 |     __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];
29 | 
30 |     // Determine the row and col of the C element to be calculated by this thread
31 |     int row = blockIdx.y * blockDim.y + threadIdx.y;
32 |     int col = blockIdx.x * blockDim.x + threadIdx.x;
33 |     float Cvalue = 0;
34 |     for(int ph = 0; ph < N/TILE_WIDTH; ++ph) {
35 |         As[threadIdx.y][threadIdx.x] = d_A[row * N + ph * TILE_WIDTH + threadIdx.x];
36 |         Bs[threadIdx.y][threadIdx.x] = d_B[(ph * TILE_WIDTH + threadIdx.y) * N + col];
37 |         __syncthreads();
38 |         for(int k = 0; k < TILE_WIDTH; ++k) {
39 |             Cvalue += As[threadIdx.y][k] * Bs[k][threadIdx.x];
40 |         }
41 |         __syncthreads();
42 |     }
43 |     d_C[row * N + col] = Cvalue;
44 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/sparse_mv.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void spmv_csr(float *data, int *col_index, int *row_ptr, float *x, float *y, int n) {
8 |     for(int row = 0; row < n; row++) {
9 |         float dot = 0;
10 |         int row_start = row_ptr[row];
11 |         int row_end = row_ptr[row + 1];
12 |         for(int elem = row_start; elem < row_end; elem++) {
13 |             dot += data[elem] * x[col_index[elem]];
14 |         }
15 |         y[row] += dot;
16 |     }
17 | }
18 | 
19 | __global__ void SpMVCSRKernel(float *data, int *col_index, int *row_ptr, float *x, float *y, int num_rows) {
20 |     int row = blockIdx.x * blockDim.x + threadIdx.x;
21 |     if(row < num_rows) {
22 |         float dot = 0;
23 |         int row_start = row_ptr[row];
24 |         int row_end = row_ptr[row + 1];
25 |         for(int elem = row_start; elem < row_end; elem++) {
26 |             dot += data[elem] * x[col_index[elem]];
27 |         }
28 |         y[row] += dot;
29 |     }
30 | }
--------------------------------------------------------------------------------
/ddp_example/README.md:
--------------------------------------------------------------------------------
1 | # Language Model Training Example
2 | 
3 | This example is adapted from [PyTorch Word Language Model](https://github.com/pytorch/examples/tree/main/word_language_model) and demonstrates how to train a language model using PyTorch, both on a single GPU and using Distributed Data Parallel (DDP) for multiple GPUs.
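For reference, the CSR layout consumed by `spmv_csr` and `SpMVCSRKernel` in `cuda_acceleration_demo/sparse_mv.cu` above can be sanity-checked against a dense matrix-vector product in plain NumPy. This is a minimal illustrative sketch with a made-up 4x4 matrix; it is not a file from the repository:

```python
import numpy as np

# Dense matrix encoded below in CSR form:
# [[3 0 1 0]
#  [0 0 0 0]
#  [0 2 4 1]
#  [1 0 0 1]]
data      = np.array([3., 1., 2., 4., 1., 1., 1.])  # non-zero values, stored row by row
col_index = np.array([0, 2, 1, 2, 3, 0, 3])         # column of each non-zero
row_ptr   = np.array([0, 2, 2, 5, 7])               # where each row starts in `data`

x = np.array([1., 2., 3., 4.])
y = np.zeros(4)
for row in range(4):
    # same loop body as the CUDA kernel, with one row handled per "thread"
    for elem in range(row_ptr[row], row_ptr[row + 1]):
        y[row] += data[elem] * x[col_index[elem]]

dense = np.array([[3., 0., 1., 0.],
                  [0., 0., 0., 0.],
                  [0., 2., 4., 1.],
                  [1., 0., 0., 1.]])
assert np.allclose(y, dense @ x)  # y == [6., 0., 20., 5.]
```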
4 | 
5 | ## Instructions
6 | 
7 | ### Running Training on a Single GPU
8 | To train the model on a single GPU, run the following command:
9 | ```
10 | torchrun --nproc_per_node=1 main.py
11 | ```
12 | 
13 | This will start training with the default settings. You should see output similar to:
14 | ```
15 | | Epoch 1 | Time: 13.50s | Train Loss 7.85 | Valid Loss 7.21 | Perplexity 1355.19
16 | | Epoch 2 | Time: 13.32s | Train Loss 7.30 | Valid Loss 6.93 | Perplexity 1026.65
17 | | Epoch 3 | Time: 13.36s | Train Loss 7.09 | Valid Loss 6.79 | Perplexity 885.36
18 | | Epoch 4 | Time: 13.48s | Train Loss 6.97 | Valid Loss 6.69 | Perplexity 805.83
19 | | Epoch 5 | Time: 13.52s | Train Loss 6.89 | Valid Loss 6.62 | Perplexity 746.42
20 | | End of Training | Test Loss 6.53 | Test Perplexity 687.53
21 | ```
22 | 
23 | ### Running Distributed Data Parallel (DDP) Training on Multiple GPUs
24 | Set `--nproc_per_node` to the number of GPUs you wish to use. For example, to use 2 GPUs:
25 | ```
26 | torchrun --nproc_per_node=2 main.py --ddp
27 | ```
28 | 
29 | This will start the training process with DDP enabled, and you should see output similar to:
30 | ```
31 | | Epoch 1 | Time: 7.48s | Train Loss 8.14 | Valid Loss 7.47 | Perplexity 1757.71
32 | | Epoch 2 | Time: 7.28s | Train Loss 7.56 | Valid Loss 7.21 | Perplexity 1353.03
33 | | Epoch 3 | Time: 7.29s | Train Loss 7.36 | Valid Loss 7.05 | Perplexity 1154.24
34 | | Epoch 4 | Time: 7.31s | Train Loss 7.23 | Valid Loss 6.94 | Perplexity 1027.78
35 | | Epoch 5 | Time: 7.32s | Train Loss 7.13 | Valid Loss 6.85 | Perplexity 945.97
36 | | End of Training | Test Loss 6.77 | Test Perplexity 873.09
37 | ```
38 | 
39 | Training and validation loss and perplexity plots are saved as `training_metrics*.png`.
40 | 
41 | ### Debugging DDP
42 | If DDP initialization hangs, your GPUs are likely not connected via NVLink; in that case, set `export NCCL_P2P_DISABLE=1`.
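### How `torchrun` and the script connect
`torchrun` launches one process per GPU and exports the `RANK`, `LOCAL_RANK`, and `WORLD_SIZE` environment variables; `torch.distributed.init_process_group` reads them during initialization, which is the handshake `main.py` relies on when `--ddp` is passed. The snippet below is a minimal illustrative sketch (not a file in this repo) of that setup with the `nccl` backend:
```
import os
import torch
import torch.distributed as dist

def setup():
    # torchrun sets LOCAL_RANK, RANK and WORLD_SIZE for every worker it spawns
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")  # rank/world size are read from the environment
    return dist.get_rank(), dist.get_world_size()

if __name__ == "__main__":
    rank, world_size = setup()
    print(f"rank {rank}/{world_size} using GPU {torch.cuda.current_device()}")
    dist.destroy_process_group()
```
Launched with `torchrun --nproc_per_node=2 sketch.py`, each of the two processes prints its own rank.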
43 | -------------------------------------------------------------------------------- /ddp_example/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import open 3 | import torch 4 | from torch.utils.data import Dataset, DataLoader 5 | 6 | class Dictionary(object): 7 | def __init__(self): 8 | self.word2idx = {} 9 | self.idx2word = [] 10 | 11 | def add_word(self, word): 12 | if word not in self.word2idx: 13 | self.idx2word.append(word) 14 | self.word2idx[word] = len(self.idx2word) - 1 15 | return self.word2idx[word] 16 | 17 | def __len__(self): 18 | return len(self.idx2word) 19 | 20 | 21 | class Corpus(object): 22 | def __init__(self, path): 23 | self.dictionary = Dictionary() 24 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 25 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 26 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 27 | 28 | def tokenize(self, path): 29 | """Tokenizes a text file.""" 30 | assert os.path.exists(path) 31 | # Add words to the dictionary 32 | with open(path, 'r', encoding="utf8") as f: 33 | for line in f: 34 | words = line.split() + [''] 35 | for word in words: 36 | self.dictionary.add_word(word) 37 | 38 | # Tokenize file content 39 | with open(path, 'r', encoding="utf8") as f: 40 | idss = [] 41 | for line in f: 42 | words = line.split() + [''] 43 | ids = [] 44 | for word in words: 45 | ids.append(self.dictionary.word2idx[word]) 46 | idss.append(torch.tensor(ids).type(torch.int64)) 47 | ids = torch.cat(idss) 48 | 49 | return ids 50 | 51 | class TextDataset(Dataset): 52 | """Formats corpus into sequences for language modeling""" 53 | def __init__(self, data, seq_len, device, batch_size = 128): 54 | self.device = device 55 | self.seq_len = seq_len 56 | data = data.narrow(0, 0, data.size(0) // batch_size * batch_size) 57 | self.data = data.view(batch_size, -1).t().contiguous().to(device) 58 | self.device = device 59 | 60 | self.indices = list(range(0, self.data.size(0)-self.seq_len, self.seq_len)) 61 | 62 | def __len__(self): 63 | return len(self.indices) 64 | 65 | def __getitem__(self, idx): 66 | start_idx = self.indices[idx] 67 | inputs = self.data[start_idx: start_idx + self.seq_len] 68 | targets = self.data[start_idx + 1: start_idx + self.seq_len + 1].view(-1) 69 | return inputs.to(self.device), targets.to(self.device) 70 | 71 | def get_dataloader(dataset, seq_len, batch_size=1, sampler=None): 72 | loader = DataLoader( 73 | dataset, 74 | batch_size=batch_size, 75 | sampler=sampler, 76 | shuffle=(sampler is None), 77 | ) 78 | return loader 79 | 80 | -------------------------------------------------------------------------------- /ddp_example/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import math 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.distributed as dist 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | from torch.utils.data import DistributedSampler 11 | import matplotlib.pyplot as plt 12 | 13 | import data 14 | import model 15 | from data import TextDataset, get_dataloader 16 | 17 | def setup_ddp(): 18 | dist.init_process_group("nccl") 19 | rank = dist.get_rank() 20 | torch.cuda.set_device(rank) 21 | return rank, dist.get_world_size() 22 | 23 | def cleanup_ddp(): 24 | dist.destroy_process_group() 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser(description='PyTorch 
Wikitext-2 Transformer Language Model') 28 | parser.add_argument('--ddp', action='store_true', help='run with ddp', default=False) 29 | parser.add_argument('--batch_size', type=int, default=128) 30 | return parser.parse_args() 31 | 32 | def plot_training_metrics(train_losses, val_losses, ppl_values, save_path='training_metrics.png', ddp=False): 33 | if ddp: 34 | save_path = save_path.replace('.png', '_ddp.png') 35 | 36 | epochs = range(1, len(train_losses) + 1) 37 | fig, axs = plt.subplots(3, 1, figsize=(8, 12)) 38 | axs[0].plot(epochs, train_losses, marker='o', linestyle='-', color='blue', label='Training Loss') 39 | axs[1].plot(epochs, val_losses, marker='o', linestyle='-', color='red', label='Validation Loss') 40 | axs[2].plot(epochs, ppl_values, marker='o', linestyle='-', color='green', label='Perplexity') 41 | for ax, title in zip(axs, ['Training Loss', 'Validation Loss', 'Perplexity']): 42 | ax.set_xlabel('Epochs') 43 | ax.set_ylabel('Value') 44 | ax.set_title(title) 45 | ax.grid(True) 46 | ax.legend() 47 | plt.tight_layout() 48 | plt.savefig(save_path) 49 | print(f"Plot saved as {save_path}") 50 | 51 | class Trainer: 52 | def __init__(self, args, model, train_loader, val_loader, test_loader, ntokens, device): 53 | self.args = args 54 | self.device = device 55 | self.ntokens = ntokens 56 | self.model = model.to(device) 57 | self.lr = 0.05 58 | if args.ddp: 59 | self.model = DDP(self.model, device_ids=[device.index]) 60 | self.lr *= dist.get_world_size() #scale learning rate to compensate gradient averaging 61 | 62 | self.train_loader = train_loader 63 | self.val_loader = val_loader 64 | self.test_loader = test_loader 65 | self.criterion = nn.NLLLoss() 66 | self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr) 67 | self.best_val_loss = None 68 | 69 | def train_one_epoch(self, epoch): 70 | if self.args.ddp: 71 | self.train_loader.sampler.set_epoch(epoch) 72 | 73 | self.model.train() 74 | total_loss = 0. 75 | start_time = time.time() 76 | for data, targets in self.train_loader: 77 | self.model.zero_grad() 78 | data = data.squeeze(0).clone() 79 | targets = targets.squeeze(0).clone() 80 | output = self.model(data) 81 | targets = targets.view(-1) 82 | output = output.view(-1, self.ntokens) 83 | loss = self.criterion(output, targets) 84 | loss.backward() 85 | self.optimizer.step() 86 | total_loss = total_loss + loss.item() 87 | 88 | if self.args.ddp: 89 | total_loss_tensor = torch.tensor(total_loss, device=self.device) 90 | dist.all_reduce(total_loss_tensor, op=dist.ReduceOp.SUM) 91 | 92 | return (total_loss_tensor / dist.get_world_size() / len(self.train_loader)).item() 93 | else: 94 | return total_loss / len(self.train_loader) 95 | 96 | def evaluate_model(self, loader): 97 | self.model.eval() 98 | total_loss = 0. 
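        # Evaluation mirrors the training loop but runs under torch.no_grad(); when
        # --ddp is enabled, each rank's summed loss is combined with all_reduce below
        # and divided by the world size to recover the mean loss across processes.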
99 | with torch.no_grad(): 100 | for data, targets in loader: 101 | data = data.squeeze(0) 102 | targets = targets.squeeze(0) 103 | output = self.model(data) 104 | output = output.view(-1, self.ntokens) 105 | total_loss = total_loss + self.criterion(output, targets).item() 106 | 107 | if self.args.ddp: 108 | total_loss_tensor = torch.tensor(total_loss, device=self.device) 109 | dist.all_reduce(total_loss_tensor, op=dist.ReduceOp.SUM) 110 | 111 | return (total_loss_tensor / dist.get_world_size() / len(loader)).item() 112 | else: 113 | return total_loss / len(loader) 114 | 115 | def train_model(self, num_epochs=5): 116 | train_losses, val_losses, ppl_values = [], [], [] 117 | try: 118 | for epoch in range(1, num_epochs + 1): 119 | epoch_start_time = time.time() 120 | train_loss = self.train_one_epoch(epoch) 121 | val_loss = self.evaluate_model(self.val_loader) 122 | ppl = math.exp(val_loss) 123 | 124 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 125 | train_losses.append(train_loss) 126 | val_losses.append(val_loss) 127 | ppl_values.append(ppl) 128 | print(f'| Epoch {epoch} | Time: {time.time() - epoch_start_time:.2f}s | Train Loss {train_loss:.2f} | Valid Loss {val_loss:.2f} | Perplexity {ppl:.2f}') 129 | 130 | if self.best_val_loss is None or val_loss < self.best_val_loss: 131 | torch.save(self.model, 'model.pt') 132 | self.best_val_loss = val_loss 133 | else: 134 | self.lr /= 4.0 135 | except KeyboardInterrupt: 136 | print('Exiting from training early') 137 | 138 | self.test_model() 139 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 140 | plot_training_metrics(train_losses, val_losses, ppl_values, ddp=self.args.ddp) 141 | 142 | def test_model(self): 143 | test_loss = self.evaluate_model(self.test_loader) 144 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 145 | print(f'| End of Training | Test Loss {test_loss:.2f} | Test Perplexity {math.exp(test_loss):.2f}') 146 | 147 | 148 | def main(): 149 | args = parse_args() 150 | 151 | if args.ddp: 152 | rank, world_size = setup_ddp() 153 | device = torch.device(f"cuda:{rank}") 154 | else: 155 | rank, world_size = 0, 1 156 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 157 | 158 | torch.manual_seed(42) 159 | corpus = data.Corpus('./data/wikitext-2') 160 | ntokens = len(corpus.dictionary) 161 | seq_len = 40 162 | 163 | corpus.train = TextDataset(corpus.train, seq_len, device, args.batch_size) 164 | corpus.valid = TextDataset(corpus.valid, seq_len, device, args.batch_size) 165 | corpus.test = TextDataset(corpus.test, seq_len, device, args.batch_size) 166 | 167 | sampler = DistributedSampler(corpus.train, world_size, rank, shuffle=False) if args.ddp else None 168 | train_loader = get_dataloader(corpus.train, seq_len=40, sampler=sampler) 169 | val_loader = get_dataloader(corpus.valid, seq_len=40) 170 | test_loader = get_dataloader(corpus.test, seq_len=40) 171 | 172 | model_inst = model.TransformerModel(ntokens, 256, 8, 256, 4, 0.2) 173 | trainer = Trainer(args, model_inst, train_loader, val_loader, test_loader, ntokens, device) 174 | trainer.train_model() 175 | 176 | if args.ddp: 177 | cleanup_ddp() 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /ddp_example/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class 
PositionalEncoding(nn.Module): 7 | 8 | def __init__(self, d_model, dropout=0.1, max_len=5000): 9 | super(PositionalEncoding, self).__init__() 10 | self.dropout = nn.Dropout(p=dropout) 11 | 12 | pe = torch.zeros(max_len, d_model) 13 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 14 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 15 | pe[:, 0::2] = torch.sin(position * div_term) 16 | pe[:, 1::2] = torch.cos(position * div_term) 17 | pe = pe.unsqueeze(0).transpose(0, 1) 18 | self.register_parameter('pe', nn.Parameter(pe, requires_grad=False)) 19 | 20 | def forward(self, x): 21 | x = x + self.pe[:x.size(0), :] 22 | return self.dropout(x) 23 | 24 | class TransformerModel(nn.Transformer): 25 | """Container module with an encoder, a recurrent or transformer module, and a decoder.""" 26 | 27 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 28 | super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_encoder_layers=nlayers) 29 | self.model_type = 'Transformer' 30 | self.pos_encoder = PositionalEncoding(ninp, dropout) 31 | 32 | self.input_emb = nn.Embedding(ntoken, ninp) 33 | self.ninp = ninp 34 | self.decoder = nn.Linear(ninp, ntoken) 35 | 36 | self.init_weights() 37 | 38 | def _generate_square_subsequent_mask(self, sz): 39 | return torch.log(torch.tril(torch.ones(sz,sz))) 40 | 41 | def init_weights(self): 42 | initrange = 0.1 43 | nn.init.uniform_(self.input_emb.weight, -initrange, initrange) 44 | nn.init.zeros_(self.decoder.bias) 45 | nn.init.uniform_(self.decoder.weight, -initrange, initrange) 46 | 47 | def forward(self, src, has_mask=True): 48 | mask = None 49 | if has_mask: 50 | mask = self._generate_square_subsequent_mask(len(src)) 51 | src = self.input_emb(src) * math.sqrt(self.ninp) 52 | src = self.pos_encoder(src) 53 | output = self.encoder(src, mask=mask) 54 | output = self.decoder(output) 55 | return F.log_softmax(output, dim=-1) 56 | -------------------------------------------------------------------------------- /ddp_example/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | torch==2.5.1 3 | torchaudio==2.5.1 4 | torcheval==0.0.7 5 | torchvision==0.20.1 6 | nvidia-cuda-cupti-cu11==11.8.87 7 | nvidia-cuda-cupti-cu12==12.4.127 8 | nvidia-cuda-nvrtc-cu11==11.7.99 9 | nvidia-cuda-nvrtc-cu12==12.4.127 10 | nvidia-cuda-runtime-cu11==11.7.99 11 | nvidia-cuda-runtime-cu12==12.4.127 12 | matplotlib==3.7.1 13 | matplotlib-inline==0.1.6 14 | nvidia-nccl-cu11==2.20.5 15 | nvidia-nccl-cu12==2.21.5 16 | -------------------------------------------------------------------------------- /ddp_example/synthetic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from torch.nn.parallel import DistributedDataParallel as DDP 9 | from torch.utils.data import Dataset, DataLoader, TensorDataset 10 | from torch.utils.data.distributed import DistributedSampler 11 | 12 | 13 | class ToyModel(nn.Module): 14 | def __init__(self): 15 | super(ToyModel, self).__init__() 16 | self.net1 = nn.Linear(10, 10) 17 | self.relu = nn.ReLU() 18 | self.net2 = nn.Linear(10, 5) 19 | 20 | def forward(self, x): 21 | return self.net2(self.relu(self.net1(x))) 22 | 23 | 24 | def create_dataset(num_samples): 25 | # Here we create a synthetic dataset 26 | inputs = 
torch.randn(num_samples, 10) 27 | labels = torch.randn(num_samples, 5) 28 | return TensorDataset(inputs, labels) 29 | 30 | 31 | def train_single_gpu(dataset, batch_size=1024, num_epochs=100): 32 | # The following code only train with one gpu 33 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 34 | model = ToyModel().to(device) 35 | loss_fn = nn.MSELoss() 36 | optimizer = optim.SGD(model.parameters(), lr=0.001) 37 | 38 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 39 | 40 | start_time = time.time() 41 | for epoch in range(num_epochs): 42 | for inputs, labels in dataloader: 43 | inputs, labels = inputs.to(device), labels.to(device) 44 | optimizer.zero_grad() 45 | outputs = model(inputs) 46 | loss = loss_fn(outputs, labels) 47 | loss.backward() 48 | optimizer.step() 49 | end_time = time.time() 50 | 51 | return end_time - start_time 52 | 53 | 54 | def train_ddp(dataset, batch_size=1024, num_epochs=100): 55 | # We initialize process group with nccl for communication 56 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 57 | dist.init_process_group("nccl") 58 | 59 | rank = dist.get_rank() 60 | world_size = dist.get_world_size() 61 | 62 | device_id = rank % torch.cuda.device_count() 63 | model = ToyModel().to(device_id) 64 | ddp_model = DDP(model, device_ids=[device_id]) 65 | 66 | loss_fn = nn.MSELoss() 67 | optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 68 | 69 | # We sample using DistributedSampler such that each GPU gets unique data 70 | sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True) 71 | dataloader = DataLoader(dataset, batch_size=batch_size // world_size, sampler=sampler) 72 | 73 | start_time = time.time() 74 | for epoch in range(num_epochs): 75 | sampler.set_epoch(epoch) # we will have different shuffling per epoch 76 | for inputs, labels in dataloader: 77 | inputs, labels = inputs.to(device_id), labels.to(device_id) 78 | optimizer.zero_grad() 79 | outputs = ddp_model(inputs) 80 | loss = loss_fn(outputs, labels) 81 | loss.backward() 82 | optimizer.step() 83 | end_time = time.time() 84 | 85 | dist.destroy_process_group() 86 | return end_time - start_time 87 | 88 | 89 | if __name__ == "__main__": 90 | import sys 91 | 92 | NUM_SAMPLES = 100000 93 | BATCH_SIZE = 1024 94 | NUM_EPOCHS = 100 95 | 96 | dataset = create_dataset(NUM_SAMPLES) 97 | 98 | if "--ddp" in sys.argv: 99 | # Run DDP training 100 | total_time = train_ddp(dataset, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS) 101 | print(f"DDP Training Time: {total_time:.4f} seconds") 102 | 103 | else: 104 | # Run Single-GPU training 105 | total_time = train_single_gpu(dataset, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS) 106 | print(f"Single-GPU Training Time: {total_time:.4f} seconds") 107 | 108 | -------------------------------------------------------------------------------- /decoding/decoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "[[4, 0, 4, 0, 4, 0, 4, 0, 4, 0], 6.931471805599453]\n", 13 | "[[4, 0, 4, 0, 4, 0, 4, 0, 4, 1], 7.154615356913663]\n", 14 | "[[4, 0, 4, 0, 4, 0, 4, 0, 3, 0], 7.154615356913663]\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from math import log\n", 20 | "from numpy import array\n", 21 | "from numpy import argmax\n", 22 | "\n", 23 | "# beam search\n", 24 | "def 
beam_search_decoder(next_token_probs, k):\n", 25 | "\t\"\"\"beam search decoding \n", 26 | " \n", 27 | " next_token_probs: next token probabilities \n", 28 | " k: beam size\n", 29 | "\t\"\"\"\n", 30 | "\tsequences = [[list(), 0.0]]\n", 31 | "\t# walk over each step in sequence\n", 32 | "\tfor token_probs in next_token_probs:\n", 33 | "\t\tall_candidates = list()\n", 34 | "\t\t# expand each current candidate\n", 35 | "\t\tfor current_seq in range(len(sequences)):\n", 36 | "\t\t\tseq, score = sequences[current_seq]\n", 37 | "\t\t\tfor tk_id in range(len(token_probs)):\n", 38 | "\t\t\t\tcandidate = [seq + [tk_id], score - log(token_probs[tk_id])]\n", 39 | "\t\t\t\tall_candidates.append(candidate)\n", 40 | "\t\t# order all candidates by score\n", 41 | "\t\tordered = sorted(all_candidates, key=lambda tup:tup[1])\n", 42 | "\t\t# select k best\n", 43 | "\t\tsequences = ordered[:k]\n", 44 | "\treturn sequences\n", 45 | "\n", 46 | "# define a sequence of 10 words over a vocab of 5 words\n", 47 | "data = [[0.1, 0.2, 0.3, 0.4, 0.5],\n", 48 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 49 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 50 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 51 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 52 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 53 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 54 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 55 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 56 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1]]\n", 57 | "data = array(data)\n", 58 | "# decode sequence\n", 59 | "result = beam_search_decoder(data, 3)\n", 60 | "# print result\n", 61 | "for seq in result:\n", 62 | "\tprint(seq)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "base", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.9.18" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /deepspeed_example/DeepSpeed-Example.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOm999wxkBzBdw34in6tbAt"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["This is adopted from https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md.\n","\n","We only demonstrate the SFT step here, but feel free to try out the reward modeling and RLHF training!"],"metadata":{"id":"edL-x2OUcFsT"}},{"cell_type":"markdown","source":["Env Setup: Make sure you are using GPU runtime.(e.g. 
T4)"],"metadata":{"id":"a8fmdQUScgoE"}},{"cell_type":"markdown","source":["Inside the training script `main.py`, we will need some modifications...\n","\n","(Note: libraries such huggingface accelerate integrates with Deepspeed, see https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed for more details!)"],"metadata":{"id":"ztHx5i5ee1I9"}},{"cell_type":"code","source":["!pip install deepspeed"],"metadata":{"id":"noq0zvYqn7kC"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import deepspeed\n","from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n","from deepspeed import get_accelerator\n","\n","import argparse\n","import torch\n","\n","#skip the details here, please check actual main.py\n","def get_train_ds_config(offload,\n"," dtype,\n"," stage=2,\n"," enable_hybrid_engine=False,\n"," inference_tp_size=1,\n"," release_inference_cache=False,\n"," pin_parameters=True,\n"," tp_gather_partition_size=8,\n"," max_out_tokens=512,\n"," enable_tensorboard=False,\n"," enable_mixed_precision_lora=False,\n"," tb_path=\"\",\n"," tb_name=\"\"):\n"," pass\n","def parse_args():\n"," pass\n","def to_device():\n"," pass\n","\n","def main():\n"," args = parse_args()\n","\n"," if args.local_rank == -1:\n"," device = torch.device(get_accelerator().device_name())\n"," else:\n"," get_accelerator().set_device(args.local_rank)\n"," device = torch.device(get_accelerator().device_name(), args.local_rank)\n"," # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n"," # torch.distributed.init_process_group(backend='nccl')\n"," deepspeed.init_distributed()\n"," ds_config = get_train_ds_config(offload=args.offload,\n"," dtype=args.dtype,\n"," stage=args.zero_stage,\n"," enable_tensorboard=args.enable_tensorboard,\n"," tb_path=args.tensorboard_path,\n"," tb_name=\"step1_model\")\n"," # Deepspeed provide custom cpu adam if we want to offload optimizer to cpu\n"," AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam\n"," # Initialize model with deepspeed\n"," # DeepSpeed model training is accomplished using the DeepSpeed engine.\n"," # The engine can wrap any arbitrary model of type torch.nn.module and\n"," # has a minimal set of APIs for training and checkpointing the model.\n"," model, optimizer, _, lr_scheduler = deepspeed.initialize(\n"," model=model,\n"," optimizer=optimizer,\n"," args=args,\n"," config=ds_config,\n"," lr_scheduler=lr_scheduler,\n"," dist_init_required=True)\n"," train_dataloader = None\n"," #DeepSpeed automatically performs the necessary operations required for distributed data parallel training,\n"," #in mixed precision, with a pre-defined learning rate scheduler. No code change needed.\n"," for epoch in range(args.num_train_epochs):\n"," model.train()\n"," import time\n"," for step, batch in enumerate(train_dataloader):\n"," start = time.time()\n"," batch = to_device(batch, device)\n"," outputs = model(**batch, use_cache=False)\n"," loss = outputs.loss\n"," if args.print_loss:\n"," print(\n"," f\"Epoch: {epoch}, Step: {step}, Rank: {torch.distributed.get_rank()}, loss = {loss}\"\n"," )\n"," model.backward(loss)\n"," model.step()\n"," end = time.time()"],"metadata":{"id":"FVdLubB9eq1G","executionInfo":{"status":"ok","timestamp":1743549424621,"user_tz":240,"elapsed":28,"user":{"displayName":"Kath Choi","userId":"14493180204401828909"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["Let's see what is needed to run the training script with deepspeed command line:\n","1. 
specify your training script `main.py`\n","2. specify args such as `model_name_or_path`, `zero_stage` etc.\n","\n","To see the full list of args supported, see https://www.deepspeed.ai/docs/config-json/"],"metadata":{"id":"-dlTlGeTeNxx"}},{"cell_type":"markdown","source":["Recall that:\n","1. Optimizer state partitioning (ZeRO stage 1)\n","2. Gradient partitioning (ZeRO stage 2)\n","3. Parameter partitioning (ZeRO stage 3)"],"metadata":{"id":"il2y5cowhv43"}},{"cell_type":"markdown","source":["The bash scipt looks something like this"],"metadata":{"id":"Swm3dKlOm4RJ"}},{"cell_type":"code","source":["\"\"\"\n","ZERO_STAGE=$1\n","OUTPUT=./output_llama2_7b\n","if [ \"$ZERO_STAGE\" == \"\" ]; then\n"," ZERO_STAGE=3\n","fi\n","mkdir -p $OUTPUT\n","\n","deepspeed main.py \\\n"," --data_split 2,4,4 \\\n"," --model_name_or_path meta-llama/Llama-2-7b-hf \\\n"," --per_device_train_batch_size 1 \\\n"," --per_device_eval_batch_size 4 \\\n"," --max_seq_len 512 \\\n"," --learning_rate 9.65e-6 \\\n"," --weight_decay 0. \\\n"," --num_train_epochs 3 \\\n"," --gradient_accumulation_steps 4 \\\n"," --lr_scheduler_type cosine \\\n"," --num_warmup_steps 0 \\\n"," --seed 1234 \\\n"," --gradient_checkpointing \\\n"," --dtype bf16 \\\n"," --zero_stage $ZERO_STAGE \\\n"," --deepspeed \\\n"," --output_dir $OUTPUT \\\n","\"\"\""],"metadata":{"id":"tlbF27LkeFBZ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Then we can run the command in a cluster and monitor the nvidia-smi to see the how much memory is saved!"],"metadata":{"id":"B8iioP1jSivH"}}]} -------------------------------------------------------------------------------- /deepspeed_example/README.md: -------------------------------------------------------------------------------- 1 | This example is adopted from https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat, with some modifications. 2 | 3 | ### Request GPUs 4 | ```bash 5 | srun --cpus-per-task=5 --gpus=4 --mem=256GB --partition= --time=10:34:56 --pty bash 6 | ``` 7 | 8 | ### Installation 9 | 10 | ```bash 11 | pip install -r requirements.txt 12 | pip install -e . 13 | ``` 14 | 15 | ### Finetuning LLaMA2-7b 16 | If stage is not provided, the script will use DeepSpeed ZeRO stage 3 by default. 17 | Using stage 0 would disable ZeRO. (You will probably see CUDA OOM errors!) 
18 | ```bash 19 | bash run_llama2_7b.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.2 2 | Name: deepspeed-chat 3 | Version: 0.1 4 | Home-page: https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat 5 | Requires-Dist: datasets>=2.8.0 6 | Requires-Dist: sentencepiece>=0.1.97 7 | Requires-Dist: protobuf==3.20.3 8 | Requires-Dist: accelerate>=0.15.0 9 | Requires-Dist: torch>=1.12.0 10 | Requires-Dist: deepspeed>=0.9.2 11 | Requires-Dist: transformers!=4.33.2,>=4.31.0 12 | Requires-Dist: tensorboard 13 | Provides-Extra: azureml 14 | Requires-Dist: azure-ml-component; extra == "azureml" 15 | Requires-Dist: azureml-core; extra == "azureml" 16 | Dynamic: home-page 17 | Dynamic: provides-extra 18 | Dynamic: requires-dist 19 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | deepspeed_chat.egg-info/PKG-INFO 3 | deepspeed_chat.egg-info/SOURCES.txt 4 | deepspeed_chat.egg-info/dependency_links.txt 5 | deepspeed_chat.egg-info/requires.txt 6 | deepspeed_chat.egg-info/top_level.txt -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | deepspeed>=0.9.2 7 | transformers!=4.33.2,>=4.31.0 8 | tensorboard 9 | 10 | [azureml] 11 | azure-ml-component 12 | azureml-core 13 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/rlhf/rlhf_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import time 6 | import torch 7 | import deepspeed 8 | from deepspeed.ops.adam import FusedAdam 9 | from deepspeed.ops.adam import DeepSpeedCPUAdam 10 | from transformers import AutoModelForCausalLM, get_scheduler 11 | 12 | from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config 13 | from dschat.utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible 14 | from dschat.utils.model.model_utils import create_hf_model, create_critic_model 15 | from dschat.utils.utils import get_optimizer_grouped_parameters 16 | """ 17 | TODOs: 18 | * support HF models for critic (for debugging), must be a previously saved ckpt from step-2 19 | * determine ds_config/zero_stage based on model size, gpu style, world size, etc 20 | - get model size by creating simple meta model 21 | - 1.3b: zero-2 for actor/ref models, zero-0 for others 22 | - 13b+: zero-3 for all models 23 | """ 24 | 25 | 26 | def log_init(model_name, stime=None): 27 | if torch.distributed.get_rank() == 0: 28 | tag = "start" if stime is None else "end" 29 | suffix = "ing" if stime is None else "ed" 30 | duration = "" 31 | if stime is not None: 32 | duration = "(duration: {:.2f}s)".format(time.time() - stime) 33 | msg = f"[{tag}] Initializ{suffix} {model_name} Model [{tag}] {duration}" 34 | stars = (90 - len(msg)) // 2 35 | extra_star = "*" if (90 - len(msg)) % 2 == 1 else "" 36 | print("*" * stars + msg + "*" * stars + extra_star) 37 | return time.time() 38 | 39 | 40 | class DeepSpeedRLHFEngine(): 41 | 42 | def __init__(self, actor_model_name_or_path, critic_model_name_or_path, 43 | tokenizer, args, num_total_iters): 44 | self.args = args 45 | self.num_total_iters = num_total_iters 46 | self.tokenizer = tokenizer 47 | 48 | self.actor = self._init_actor( 49 | actor_model_name_or_path=actor_model_name_or_path) 50 | self.ref = self._init_ref( 51 | actor_model_name_or_path=actor_model_name_or_path) 52 | self.actor_ema = None 53 | if self.args.enable_ema: 54 | self.actor_ema = self._init_ema( 55 | actor_model_name_or_path=actor_model_name_or_path) 56 | self.critic = self._init_critic( 57 | critic_model_name_or_path=critic_model_name_or_path) 58 | self.reward = self._init_reward( 59 | critic_model_name_or_path=critic_model_name_or_path) 60 | if self.args.critic_gradient_checkpointing: 61 | self.critic.gradient_checkpointing_enable() 62 | 63 | def _init_actor(self, actor_model_name_or_path): 64 | stime = log_init("Actor") 65 | 66 | # DS Config 67 | ds_config = get_train_ds_config( 68 | offload=self.args.offload, 69 | dtype=self.args.dtype, 70 | stage=self.args.actor_zero_stage, 71 | enable_hybrid_engine=self.args.enable_hybrid_engine, 72 | inference_tp_size=self.args.inference_tp_size, 73 | release_inference_cache=self.args.release_inference_cache, 74 | pin_parameters=(not self.args.unpin_actor_parameters), 75 | tp_gather_partition_size=self.args.tp_gather_partition_size, 76 | max_out_tokens=self.args.max_prompt_seq_len + 77 | self.args.max_answer_seq_len, 78 | enable_tensorboard=self.args.enable_tensorboard, 79 | enable_mixed_precision_lora=self.args.enable_mixed_precision_lora, 80 | tb_path=self.args.tensorboard_path, 81 | tb_name="step3_actor") 82 | ds_config[ 83 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 84 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 85 | ds_config[ 86 | 'train_batch_size'] = 
self.args.per_device_training_batch_size * torch.distributed.get_world_size( 87 | ) * self.args.gradient_accumulation_steps_actor 88 | 89 | # Model 90 | actor_model = create_hf_model( 91 | model_class=AutoModelForCausalLM, 92 | model_name_or_path=actor_model_name_or_path, 93 | tokenizer=self.tokenizer, 94 | ds_config=ds_config, 95 | dropout=self.args.actor_dropout) 96 | 97 | # LoRA 98 | if self.args.actor_lora_dim > 0: 99 | actor_model = convert_linear_layer_to_lora( 100 | actor_model, self.args.actor_lora_module_name, 101 | self.args.actor_lora_dim) 102 | if self.args.only_optimize_lora: 103 | actor_model = only_optimize_lora_parameters(actor_model) 104 | actor_model = make_model_gradient_checkpointing_compatible( 105 | actor_model) 106 | 107 | # Optimizer 108 | AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam 109 | optim_params = get_optimizer_grouped_parameters( 110 | actor_model, self.args.actor_weight_decay, 111 | self.args.actor_lora_learning_rate) 112 | optim = AdamOptimizer(optim_params, 113 | lr=self.args.actor_learning_rate, 114 | betas=(0.9, 0.95)) 115 | 116 | # LR Scheduler 117 | lr_scheduler = get_scheduler( 118 | name=self.args.lr_scheduler_type, 119 | optimizer=optim, 120 | num_warmup_steps=self.args.num_warmup_steps, 121 | num_training_steps=self.num_total_iters, 122 | ) 123 | 124 | # DeepSpeed Engine 125 | #TODO: move enable_hybrid_engine and pin_parameters to ds_config 126 | actor_engine, *_ = deepspeed.initialize(model=actor_model, 127 | optimizer=optim, 128 | lr_scheduler=lr_scheduler, 129 | config=ds_config) 130 | 131 | log_init("Actor", stime=stime) 132 | 133 | return actor_engine 134 | 135 | def _init_ref(self, actor_model_name_or_path): 136 | stime = log_init("Ref") 137 | # DS Config 138 | zero_stage = self.args.actor_zero_stage 139 | if zero_stage != 3: 140 | # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model 141 | zero_stage = 0 142 | ds_config = get_eval_ds_config(self.args.offload_reference_model, 143 | self.args.dtype, zero_stage) 144 | ds_config[ 145 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 146 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 147 | ds_config[ 148 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 149 | ) * self.args.gradient_accumulation_steps_actor 150 | 151 | ref_model = create_hf_model(AutoModelForCausalLM, 152 | actor_model_name_or_path, self.tokenizer, 153 | ds_config) 154 | 155 | ref_engine, *_ = deepspeed.initialize(model=ref_model, 156 | config=ds_config) 157 | 158 | log_init("Ref", stime=stime) 159 | return ref_engine 160 | 161 | def _init_ema(self, actor_model_name_or_path): 162 | stime = log_init("EMA") 163 | # DS Config 164 | zero_stage = self.args.actor_zero_stage 165 | if zero_stage != 3: 166 | # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory 167 | zero_stage = 0 168 | ds_config = get_eval_ds_config(self.args.offload_reference_model, 169 | self.args.dtype, zero_stage) 170 | ds_config[ 171 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 172 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 173 | ds_config[ 174 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 175 | ) * self.args.gradient_accumulation_steps_actor 176 | 177 | actor_model_ema = create_hf_model(AutoModelForCausalLM, 
178 | actor_model_name_or_path, 179 | self.tokenizer, ds_config) 180 | if self.args.actor_lora_dim > 0: 181 | actor_model_ema = convert_linear_layer_to_lora( 182 | actor_model_ema, self.args.actor_lora_module_name, 183 | self.args.actor_lora_dim) 184 | 185 | ema_engine, *_ = deepspeed.initialize(model=actor_model_ema, 186 | config=ds_config) 187 | 188 | log_init("EMA", stime=stime) 189 | return ema_engine 190 | 191 | def _init_critic(self, critic_model_name_or_path): 192 | stime = log_init("Critic") 193 | ds_config = get_train_ds_config( 194 | offload=self.args.offload, 195 | dtype=self.args.dtype, 196 | stage=self.args.critic_zero_stage, 197 | enable_tensorboard=self.args.enable_tensorboard, 198 | tb_path=self.args.tensorboard_path, 199 | tb_name="step3_critic") 200 | ds_config[ 201 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 202 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 203 | ds_config[ 204 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 205 | ) * self.args.gradient_accumulation_steps 206 | 207 | ds_eval_config = get_eval_ds_config(offload=False, 208 | dtype=self.args.dtype, 209 | stage=self.args.critic_zero_stage) 210 | # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 211 | ds_eval_config[ 212 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 213 | ds_eval_config[ 214 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 215 | ) * self.args.gradient_accumulation_steps 216 | 217 | # Model 218 | critic_model = create_critic_model( 219 | model_name_or_path=critic_model_name_or_path, 220 | tokenizer=self.tokenizer, 221 | ds_config=ds_eval_config, 222 | num_padding_at_beginning=self.args.num_padding_at_beginning, 223 | rlhf_training=True, 224 | dropout=self.args.critic_dropout, 225 | zero_stage=self.args.critic_zero_stage) 226 | 227 | # LoRA 228 | if self.args.critic_lora_dim > 0: 229 | critic_model = convert_linear_layer_to_lora( 230 | critic_model, self.args.critic_lora_module_name, 231 | self.args.critic_lora_dim) 232 | if self.args.only_optimize_lora: 233 | critic_model = only_optimize_lora_parameters(critic_model) 234 | critic_model = make_model_gradient_checkpointing_compatible( 235 | critic_model) 236 | 237 | # Optimizer 238 | AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam 239 | optim_params = get_optimizer_grouped_parameters( 240 | critic_model, self.args.critic_weight_decay, 241 | self.args.critic_lora_learning_rate) 242 | optim = AdamOptimizer(optim_params, 243 | lr=self.args.critic_learning_rate, 244 | betas=(0.9, 0.95)) 245 | 246 | # LR Scheduler 247 | lr_scheduler = get_scheduler( 248 | name=self.args.lr_scheduler_type, 249 | optimizer=optim, 250 | num_warmup_steps=self.args.num_warmup_steps, 251 | num_training_steps=self.num_total_iters, 252 | ) 253 | 254 | # DeepSpeed Engine 255 | critic_engine, *_ = deepspeed.initialize(model=critic_model, 256 | optimizer=optim, 257 | lr_scheduler=lr_scheduler, 258 | config=ds_config) 259 | 260 | log_init("Critic", stime=stime) 261 | return critic_engine 262 | 263 | def _init_reward(self, critic_model_name_or_path): 264 | stime = log_init("Reward") 265 | # DS Config 266 | zero_stage = self.args.critic_zero_stage 267 | if zero_stage != 3: 268 | # If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory 269 | zero_stage = 0 270 | 
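        # The reward model only runs forward passes (scoring) during RLHF training,
        # so it uses the lighter eval config below rather than a full training config.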
271 | ds_config = get_eval_ds_config(offload=self.args.offload_reward_model, 272 | dtype=self.args.dtype, 273 | stage=zero_stage) 274 | 275 | # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 276 | ds_config[ 277 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 278 | ds_config[ 279 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 280 | ) * self.args.gradient_accumulation_steps 281 | 282 | # Model 283 | reward_model = create_critic_model( 284 | model_name_or_path=critic_model_name_or_path, 285 | tokenizer=self.tokenizer, 286 | ds_config=ds_config, 287 | num_padding_at_beginning=self.args.num_padding_at_beginning, 288 | rlhf_training=True, 289 | dropout=self.args.critic_dropout, 290 | zero_stage=zero_stage) 291 | 292 | reward_engine, *_ = deepspeed.initialize(model=reward_model, 293 | config=ds_config) 294 | 295 | log_init("Reward", stime=stime) 296 | return reward_engine 297 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/ds_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import deepspeed.comm as dist 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | GLOBAL_BATCH_SIZE = 32 10 | MICRO_BATCH_SIZE = 4 11 | 12 | 13 | def get_train_ds_config(offload, 14 | dtype, 15 | stage=2, 16 | enable_hybrid_engine=False, 17 | inference_tp_size=1, 18 | release_inference_cache=False, 19 | pin_parameters=True, 20 | tp_gather_partition_size=8, 21 | max_out_tokens=512, 22 | enable_tensorboard=False, 23 | enable_mixed_precision_lora=False, 24 | tb_path="", 25 | tb_name=""): 26 | 27 | device = "cpu" if offload else "none" 28 | if dtype == "fp16": 29 | data_type = "fp16" 30 | dtype_config = {"enabled": True, "loss_scale_window": 100} 31 | elif dtype == "bf16": 32 | data_type = "bfloat16" 33 | dtype_config = {"enabled": True} 34 | zero_opt_dict = { 35 | "stage": stage, 36 | "overlap_comm": True, 37 | "offload_param": { 38 | "device": device 39 | }, 40 | "offload_optimizer": { 41 | "device": device 42 | }, 43 | "stage3_param_persistence_threshold": 1e4, 44 | "stage3_max_live_parameters": 3e7, 45 | "stage3_prefetch_bucket_size": 3e7, 46 | "memory_efficient_linear": False 47 | } 48 | if enable_mixed_precision_lora: 49 | zero_opt_dict["zero_quantized_nontrainable_weights"] = True 50 | if dist.get_world_size() != get_accelerator().device_count(): 51 | zero_opt_dict["zero_hpz_partition_size"] = get_accelerator( 52 | ).device_count() 53 | return { 54 | "train_batch_size": GLOBAL_BATCH_SIZE, 55 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 56 | "steps_per_print": 10, 57 | "zero_optimization": zero_opt_dict, 58 | data_type: dtype_config, 59 | "gradient_clipping": 1.0, 60 | "prescale_gradients": False, 61 | "wall_clock_breakdown": False, 62 | "hybrid_engine": { 63 | "enabled": enable_hybrid_engine, 64 | "max_out_tokens": max_out_tokens, 65 | "inference_tp_size": inference_tp_size, 66 | "release_inference_cache": release_inference_cache, 67 | "pin_parameters": pin_parameters, 68 | "tp_gather_partition_size": tp_gather_partition_size, 69 | }, 70 | "tensorboard": { 71 | "enabled": enable_tensorboard, 72 | "output_path": f"{tb_path}/ds_tensorboard_logs/", 73 | "job_name": f"{tb_name}_tensorboard" 74 | } 75 | } 76 | 77 | 78 | def 
get_eval_ds_config(offload, dtype, stage=0): 79 | device = "cpu" if offload else "none" 80 | if dtype == "fp16": 81 | data_type = "fp16" 82 | dtype_config = { 83 | "enabled": True, 84 | } 85 | elif dtype == "bf16": 86 | data_type = "bfloat16" 87 | dtype_config = {"enabled": True} 88 | zero_opt_dict = { 89 | "stage": stage, 90 | "stage3_param_persistence_threshold": 1e4, 91 | "offload_param": { 92 | "device": device 93 | }, 94 | "memory_efficient_linear": False 95 | } 96 | return { 97 | "train_batch_size": GLOBAL_BATCH_SIZE, 98 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 99 | "steps_per_print": 10, 100 | "zero_optimization": zero_opt_dict, 101 | data_type: dtype_config, 102 | "gradient_clipping": 1.0, 103 | "prescale_gradients": False, 104 | "wall_clock_breakdown": False 105 | } 106 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/model/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import math 7 | import torch 8 | from transformers import ( 9 | AutoConfig, 10 | AutoModel, 11 | ) 12 | from huggingface_hub import snapshot_download 13 | from transformers.integrations.deepspeed import HfDeepSpeedConfig 14 | 15 | from dschat.utils.model.reward_model import RewardModel 16 | from dschat.utils.utils import load_state_dict_into_model, print_rank_0 17 | 18 | 19 | def configure_dropout(model_config, dropout): 20 | if dropout is not None: 21 | for key in ('dropout', 'attention_dropout', 'hidden_dropout', 22 | 'activation_dropout'): 23 | if hasattr(model_config, key): 24 | print(f"Setting model_config.{key} to {dropout}") 25 | setattr(model_config, key, dropout) 26 | 27 | 28 | def causal_lm_model_to_fp32_loss(model): 29 | """ Convert CausalLM model to calculate loss in fp32 """ 30 | 31 | def causal_lm_forward( 32 | input_ids=None, 33 | past_key_values=None, 34 | attention_mask=None, 35 | head_mask=None, 36 | inputs_embeds=None, 37 | labels=None, 38 | use_cache=None, 39 | output_attentions=None, 40 | output_hidden_states=None, 41 | return_dict=None, 42 | **deprecated_arguments, 43 | ): 44 | kwargs = dict() if model.config.model_type == "llama" else dict( 45 | head_mask=head_mask) 46 | output = model.__original_forward__( 47 | input_ids=input_ids, 48 | past_key_values=past_key_values, 49 | attention_mask=attention_mask, 50 | inputs_embeds=inputs_embeds, 51 | labels=None, 52 | use_cache=use_cache, 53 | output_attentions=output_attentions, 54 | output_hidden_states=output_hidden_states, 55 | return_dict=return_dict, 56 | **kwargs) 57 | 58 | return_dict = isinstance(output, dict) 59 | lm_logits = output.logits if return_dict else output[0] 60 | loss = None 61 | if labels is not None: 62 | # move labels to correct device to enable model parallelism 63 | labels = labels.to(lm_logits.device) 64 | # Shift so that tokens < n predict n 65 | shift_logits = lm_logits[..., :-1, :].float().contiguous() 66 | shift_labels = labels[..., 1:].contiguous() 67 | batch_size, seq_length, vocab_size = shift_logits.shape 68 | # Flatten the tokens 69 | loss_fct = torch.nn.CrossEntropyLoss() 70 | loss = loss_fct( 71 | shift_logits.view(batch_size * seq_length, vocab_size), 72 | shift_labels.view(batch_size * seq_length)) 73 | 74 | if not return_dict: 75 | # re-pack output with fp32 loss 76 | return ((loss, ) + output) if loss is not None else output 77 | 78 | output.loss = loss 79 
| return output 80 | 81 | model.__original_forward__ = model.forward 82 | model.forward = causal_lm_forward 83 | 84 | 85 | def create_hf_model(model_class, 86 | model_name_or_path, 87 | tokenizer, 88 | ds_config=None, 89 | rlhf_training=False, 90 | dropout=None): 91 | model_config = AutoConfig.from_pretrained(model_name_or_path) 92 | configure_dropout(model_config, dropout) 93 | 94 | # Note: dschf is defined in function scope to avoid global effects 95 | # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration 96 | if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: 97 | dschf = HfDeepSpeedConfig(ds_config) 98 | else: 99 | dschf = None 100 | if rlhf_training: 101 | # the weight loading is handled by create critic model 102 | model = model_class.from_config(model_config) 103 | else: 104 | model = model_class.from_pretrained( 105 | model_name_or_path, 106 | from_tf=bool(".ckpt" in model_name_or_path), 107 | config=model_config) 108 | 109 | model.config.end_token_id = tokenizer.eos_token_id 110 | model.config.pad_token_id = model.config.eos_token_id 111 | model.resize_token_embeddings(int( 112 | 8 * 113 | math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8 114 | 115 | return model 116 | 117 | 118 | def create_critic_model(model_name_or_path, 119 | tokenizer, 120 | ds_config, 121 | num_padding_at_beginning=0, 122 | rlhf_training=False, 123 | dropout=None, 124 | zero_stage=0, 125 | compute_fp32_loss=False): 126 | # OPT model family always put a padding token at the beginning of the sequence, 127 | # we did not see this in other models but not sure if it is a general rule 128 | 129 | import time 130 | 131 | start = time.time() 132 | critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer, 133 | ds_config, rlhf_training, dropout) 134 | end = time.time() 135 | print_rank_0(f">Creating model from_config took {end - start} seconds", 136 | None) 137 | 138 | critic_model = RewardModel( 139 | critic_model, 140 | tokenizer, 141 | num_padding_at_beginning=num_padding_at_beginning, 142 | compute_fp32_loss=compute_fp32_loss) 143 | 144 | if rlhf_training: 145 | # load critic model from checkpoint 146 | 147 | if not os.path.isdir(model_name_or_path): 148 | model_name_or_path = snapshot_download(model_name_or_path) 149 | model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin') 150 | assert os.path.exists( 151 | model_ckpt_path 152 | ), f"Cannot find model checkpoint at {model_ckpt_path}" 153 | 154 | start = time.time() 155 | model_ckpt_state_dict = torch.load(model_ckpt_path, map_location='cpu') 156 | end = time.time() 157 | print_rank_0(f">Creating model from_config took {end - start} seconds", 158 | None) 159 | 160 | # load critic model from checkpoint with zero-stage 3 compatibility 161 | # this functionality may be moved to DS checkpoint load API in future 162 | start = time.time() 163 | load_state_dict_into_model(critic_model, 164 | model_ckpt_state_dict, 165 | "", 166 | zero_stage=zero_stage) 167 | end = time.time() 168 | 169 | print_rank_0(f">Creating model from_config took {end - start} seconds", 170 | None) 171 | 172 | return critic_model 173 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/model/reward_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import torch 6 | from torch import nn 7 | 8 | 9 | ## Note that the following code is modified from 10 | ## https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py 11 | class RewardModel(nn.Module): 12 | 13 | def __init__(self, 14 | base_model, 15 | tokenizer, 16 | num_padding_at_beginning=0, 17 | compute_fp32_loss=False): 18 | super().__init__() 19 | self.config = base_model.config 20 | self.num_padding_at_beginning = num_padding_at_beginning 21 | if hasattr(self.config, "word_embed_proj_dim"): 22 | # `OPT` models use word_embed_proj_dim as final output 23 | # https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L497 24 | self.v_head = nn.Linear(self.config.word_embed_proj_dim, 25 | 1, 26 | bias=False) 27 | else: 28 | # `gpt-neo(x)` models use `hidden_size` attribute names instead of `n_embd`` 29 | self.config.n_embd = self.config.hidden_size if hasattr( 30 | self.config, "hidden_size") else self.config.n_embd 31 | self.v_head = nn.Linear(self.config.n_embd, 1, bias=False) 32 | self.rwtransformer = base_model 33 | self.PAD_ID = tokenizer.pad_token_id 34 | self.compute_fp32_loss = compute_fp32_loss 35 | 36 | def gradient_checkpointing_enable(self): 37 | self.rwtransformer.gradient_checkpointing_enable() 38 | 39 | def gradient_checkpointing_disable(self): 40 | self.rwtransformer.gradient_checkpointing_disable() 41 | 42 | def forward(self, 43 | input_ids=None, 44 | past_key_values=None, 45 | attention_mask=None, 46 | position_ids=None, 47 | head_mask=None, 48 | inputs_embeds=None, 49 | use_cache=False): 50 | loss = None 51 | 52 | if self.config.model_type == "llama": 53 | kwargs = dict() 54 | else: 55 | kwargs = dict(head_mask=head_mask) 56 | 57 | transformer_outputs = self.rwtransformer( 58 | input_ids, 59 | past_key_values=past_key_values, 60 | attention_mask=attention_mask, 61 | inputs_embeds=inputs_embeds, 62 | use_cache=use_cache, 63 | **kwargs) 64 | 65 | hidden_states = transformer_outputs[0] 66 | rewards = self.v_head(hidden_states).squeeze(-1) 67 | chosen_mean_scores = [] 68 | rejected_mean_scores = [] 69 | 70 | # Split the inputs and rewards into two parts, chosen and rejected 71 | assert len(input_ids.shape) == 2 72 | bs = input_ids.shape[0] // 2 73 | seq_len = input_ids.shape[1] 74 | 75 | chosen_ids = input_ids[:bs] # bs x seq x 1 76 | rejected_ids = input_ids[bs:] 77 | chosen_rewards = rewards[:bs] 78 | rejected_rewards = rewards[bs:] 79 | 80 | # Compute pairwise loss. Only backprop on the different tokens before padding 81 | loss = 0. 
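        # The loop below implements the pairwise ranking objective commonly used for
        # reward-model training: for each chosen/rejected pair it finds the first token
        # where the two sequences diverge, truncates both reward streams from that index
        # up to the later of their end positions (first padding token after the leading
        # pads, or seq_len), and accumulates
        #     loss_i = -mean(log(sigmoid(r_chosen - r_rejected)))
        # over that span, pushing the chosen response toward a higher reward. The score
        # reported per sequence is the reward at its last non-padding token.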
82 | for i in range(bs): 83 | chosen_id = chosen_ids[i] 84 | rejected_id = rejected_ids[i] 85 | chosen_reward = chosen_rewards[i] 86 | rejected_reward = rejected_rewards[i] 87 | 88 | c_inds = (chosen_id == self.PAD_ID).nonzero() 89 | c_ind = c_inds[self.num_padding_at_beginning].item() if len( 90 | c_inds 91 | ) > self.num_padding_at_beginning else seq_len # OPT model pads the first token, so we need to use the second padding token as the end of the sequence 92 | check_divergence = (chosen_id != rejected_id).nonzero() 93 | 94 | if len(check_divergence) == 0: 95 | end_ind = rejected_reward.size(-1) 96 | divergence_ind = end_ind - 1 97 | r_ind = c_ind 98 | else: 99 | # Check if there is any padding otherwise take length of sequence 100 | r_inds = (rejected_id == self.PAD_ID).nonzero() 101 | r_ind = r_inds[self.num_padding_at_beginning].item( 102 | ) if len(r_inds) > self.num_padding_at_beginning else seq_len 103 | end_ind = max(c_ind, r_ind) 104 | divergence_ind = check_divergence[0] 105 | assert divergence_ind > 0 106 | c_truncated_reward = chosen_reward[divergence_ind:end_ind] 107 | r_truncated_reward = rejected_reward[divergence_ind:end_ind] 108 | chosen_mean_scores.append( 109 | chosen_reward[c_ind - 1]) #use the end score for reference 110 | rejected_mean_scores.append(rejected_reward[r_ind - 1]) 111 | 112 | if self.compute_fp32_loss: 113 | c_truncated_reward = c_truncated_reward.float() 114 | r_truncated_reward = r_truncated_reward.float() 115 | loss += -torch.nn.functional.logsigmoid(c_truncated_reward - 116 | r_truncated_reward).mean() 117 | 118 | loss = loss / bs 119 | chosen_mean_scores = torch.stack(chosen_mean_scores) 120 | rejected_mean_scores = torch.stack(rejected_mean_scores) 121 | return { 122 | "loss": loss, 123 | "chosen_mean_scores": chosen_mean_scores, 124 | "rejected_mean_scores": rejected_mean_scores, 125 | } 126 | 127 | def forward_value(self, 128 | input_ids=None, 129 | attention_mask=None, 130 | past_key_values=None, 131 | position_ids=None, 132 | head_mask=None, 133 | inputs_embeds=None, 134 | return_value_only=False, 135 | prompt_length=0, 136 | use_cache=False): 137 | 138 | if self.config.model_type == "llama": 139 | kwargs = dict() 140 | else: 141 | kwargs = dict(head_mask=head_mask) 142 | 143 | transformer_outputs = self.rwtransformer( 144 | input_ids, 145 | past_key_values=past_key_values, 146 | attention_mask=attention_mask, 147 | inputs_embeds=inputs_embeds, 148 | use_cache=use_cache, 149 | **kwargs) 150 | hidden_states = transformer_outputs[0] 151 | values = self.v_head(hidden_states).squeeze(-1) 152 | if return_value_only: 153 | return values 154 | else: 155 | # [0 0 0 0 prompt, answer, 0 0 0 0 ] for step 3, we have padding at the beginning 156 | # [prompt, answer, 0, 0, 0, 0] this is normal 157 | assert prompt_length > 1, "prompt_length must be greater than 1 to help select the end score" 158 | bs = values.size(0) 159 | seq_len = input_ids.shape[1] 160 | chosen_end_scores = [ 161 | ] # we use this name for consistency with the original forward function 162 | for i in range(bs): 163 | input_id = input_ids[i] 164 | value = values[i] 165 | 166 | c_inds = (input_id[prompt_length:] == self.PAD_ID).nonzero() 167 | # here we only use the answer part of the sequence so we do not need to care about the padding at the beginning 168 | c_ind = c_inds[0].item() + prompt_length if len( 169 | c_inds) > 0 else seq_len 170 | chosen_end_scores.append(value[c_ind - 1]) 171 | return { 172 | "values": values, 173 | "chosen_end_scores": torch.stack(chosen_end_scores), 174 | } 
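# A minimal usage sketch of RewardModel.forward for reference. The names below
# (`base_model`, `tokenizer`, `chosen_ids`, `rejected_ids`, `mask`) are hypothetical
# placeholders, not part of this file:
#
#   rm = RewardModel(base_model, tokenizer, num_padding_at_beginning=1)
#   # forward() expects chosen and rejected sequences concatenated along the batch dim,
#   # i.e. input_ids has shape (2 * bs, seq_len) with the chosen half first
#   out = rm(input_ids=torch.cat([chosen_ids, rejected_ids], dim=0),
#            attention_mask=torch.cat([mask, mask], dim=0))
#   out["loss"]                # pairwise ranking loss averaged over the batch
#   out["chosen_mean_scores"]  # reward at the last non-pad token of each chosen sequence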
175 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/module/lora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import math 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from deepspeed.compression.helper import recursive_getattr, recursive_setattr 10 | import deepspeed 11 | 12 | 13 | class LinearLayer_LoRA(nn.Module): 14 | # an simple implementation of LoRA 15 | # for now only support Linear Layer 16 | def __init__(self, 17 | weight, 18 | lora_dim=0, 19 | lora_scaling=1, 20 | lora_droppout=0, 21 | bias=None): 22 | super(LinearLayer_LoRA, self).__init__() 23 | self.weight = weight 24 | self.bias = bias 25 | 26 | if lora_dim <= 0: 27 | raise ValueError( 28 | "You are training to use LoRA, whose reduced dim should be larger than 1" 29 | ) 30 | 31 | try: 32 | # for zero stage 3 33 | rows, columns = weight.ds_shape 34 | except: 35 | rows, columns = weight.shape 36 | self.lora_right_weight = nn.Parameter(torch.zeros( 37 | columns, 38 | lora_dim)) # apply transpose so in forward we do not need to 39 | self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows)) 40 | self.lora_scaling = lora_scaling / lora_dim 41 | 42 | if lora_droppout > 0: 43 | self.lora_dropout = nn.Dropout(lora_droppout) 44 | else: 45 | self.lora_dropout = nn.Identity() 46 | 47 | self.reset_parameters() 48 | # disable the original weight gradient 49 | self.weight.requires_grad = False 50 | # fuse LoRA to the original weight 51 | self.fuse_lora = False 52 | 53 | def eval(self): 54 | self.lora_dropout.eval() 55 | 56 | # self.fuse_lora_weight() 57 | 58 | def train(self, mode=True): 59 | self.lora_dropout.train(mode) 60 | # self.unfuse_lora_weight() 61 | 62 | def reset_parameters(self): 63 | nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5)) 64 | nn.init.zeros_(self.lora_left_weight) 65 | 66 | def fuse_lora_weight(self): 67 | if not self.fuse_lora: 68 | self.weight.data += self.lora_scaling * torch.matmul( 69 | self.lora_left_weight.t(), self.lora_right_weight.t()) 70 | self.fuse_lora = True 71 | 72 | def unfuse_lora_weight(self): 73 | if self.fuse_lora: 74 | self.weight.data -= self.lora_scaling * torch.matmul( 75 | self.lora_left_weight.t(), self.lora_right_weight.t()) 76 | self.fuse_lora = False 77 | 78 | def forward(self, input): 79 | if self.fuse_lora: 80 | return F.linear(input, self.weight, self.bias) 81 | else: 82 | return F.linear( 83 | input, self.weight, 84 | self.bias) + (self.lora_dropout(input) @ self.lora_right_weight 85 | @ self.lora_left_weight) * self.lora_scaling 86 | 87 | 88 | # convert the linear layer to LoRA 89 | def convert_linear_layer_to_lora(model, 90 | part_module_name, 91 | lora_dim=0, 92 | lora_scaling=1, 93 | lora_droppout=0): 94 | replace_name = [] 95 | for name, module in model.named_modules(): 96 | if isinstance(module, nn.Linear) and part_module_name in name: 97 | replace_name.append(name) 98 | for name in replace_name: 99 | module = recursive_getattr(model, name) 100 | tmp = LinearLayer_LoRA( 101 | module.weight, lora_dim, lora_scaling, lora_droppout, 102 | module.bias).to(module.weight.device).to(module.weight.dtype) 103 | recursive_setattr(model, name, tmp) 104 | return model 105 | 106 | 107 | def _z3_params_to_fetch(param_list): 108 | return [ 109 | p for p in param_list 110 | if hasattr(p, 'ds_id') and p.ds_status == 
deepspeed.runtime.zero. 111 | partition_parameters.ZeroParamStatus.NOT_AVAILABLE 112 | ] 113 | 114 | 115 | # convert the LoRA layer to linear layer 116 | def convert_lora_to_linear_layer(model): 117 | replace_name = [] 118 | for name, module in model.named_modules(): 119 | if isinstance(module, LinearLayer_LoRA): 120 | replace_name.append(name) 121 | for name in replace_name: 122 | module = recursive_getattr(model, name) 123 | zero_stage_3 = hasattr(module.weight, 'ds_id') 124 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([ 125 | module.weight, module.bias, module.lora_left_weight, 126 | module.lora_right_weight 127 | ]), 128 | modifier_rank=0, 129 | enabled=zero_stage_3): 130 | module.fuse_lora_weight() 131 | return model 132 | 133 | 134 | def only_optimize_lora_parameters(model, force_optimize_params=[]): 135 | # turn off the gradient of all the parameters except the LoRA parameters 136 | for name, param in model.named_parameters(): 137 | if "lora_right_weight" in name or "lora_left_weight" in name or name in force_optimize_params: 138 | param.requires_grad = True 139 | else: 140 | param.requires_grad = False 141 | return model 142 | 143 | 144 | def make_model_gradient_checkpointing_compatible(model): 145 | # Higgingface added this enable input require grads function to make gradient checkpointing work for lora-only optimization 146 | if hasattr(model, "enable_input_require_grads"): 147 | model.enable_input_require_grads() 148 | elif hasattr(model, "get_input_embeddings"): 149 | 150 | def make_inputs_require_grad(module, input, output): 151 | output.requires_grad_(True) 152 | 153 | model.get_input_embeddings().register_forward_hook( 154 | make_inputs_require_grad) 155 | return model 156 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/perf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import torch 7 | 8 | 9 | # This function can be used to print throughput for Step 1 and 2 only 10 | def print_throughput(hf_model, args, e2e_time, rank=0): 11 | if rank <= 0: 12 | hf_config = hf_model.config 13 | num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) 14 | 15 | gpus_per_model = torch.distributed.get_world_size() 16 | seq_length = args.max_seq_len 17 | batch_size = args.per_device_train_batch_size 18 | samples_per_second = batch_size / e2e_time 19 | checkpoint_activations_factor = 4 if args.gradient_checkpointing else 3 20 | if args.lora_dim > 0: 21 | k = args.lora_dim * 2 / hidden_size 22 | checkpoint_activations_factor -= (1 - k) 23 | 24 | hf_model._num_params = sum([ 25 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 26 | for p in hf_model.parameters() 27 | ]) 28 | params_in_billions = hf_model._num_params / (1e9) 29 | 30 | # Megatron paper's formula to calculate training flops 31 | train_flops_per_iteration = calculate_flops( 32 | checkpoint_activations_factor, batch_size, seq_length, hf_config) 33 | 34 | train_tflops = train_flops_per_iteration / (e2e_time * gpus_per_model * 35 | (10**12)) 36 | 37 | param_string = f"{params_in_billions:.3f} B" if params_in_billions != 0 else "NA" 38 | print( 39 | f"Model Parameters: {param_string}, Latency: {e2e_time:.2f}s, TFLOPs: {train_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Sequence Length: {seq_length}" 40 | ) 41 | 42 | 43 | # Enhanced version of the function above that provides calculations and printing for Step 3 44 | def print_throughput_step3(actor_model, 45 | critic_model, 46 | args, 47 | e2e_time, 48 | gen_exp_time, 49 | train_time, 50 | rank=0): 51 | if rank <= 0: 52 | # Actor model passed here is a HF model. 53 | actor_hf_config = actor_model.config 54 | # Critic model passed here is a DeepSpeed Engine. The module inside is the Reward model (that wraps a HF model). 
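        # Cost model used below (same convention as print_throughput above):
        #   - checkpoint_activations_factor is 3 for a plain step (one forward pass plus a
        #     backward pass costing roughly two forwards) and 4 when activation checkpointing
        #     recomputes the forward pass;
        #   - with LoRA, roughly a (2 * lora_dim / hidden_size) fraction of each weight matrix
        #     is trainable, so the factor is reduced by (1 - 2 * lora_dim / hidden_size);
        #   - per-iteration training FLOPs then follow the Megatron-LM estimate
        #     24 * factor * batch * seq * layers * hidden^2
        #       * (1 + seq / (6 * hidden) + vocab / (16 * layers * hidden)).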
55 | critic_hf_config = critic_model.module.config 56 | 57 | actor_num_layers, actor_hidden_size, actor_vocab_size = get_hf_configs( 58 | actor_hf_config) 59 | critic_num_layers, critic_hidden_size, critic_vocab_size = get_hf_configs( 60 | critic_hf_config) 61 | 62 | gpus_per_model = torch.distributed.get_world_size() 63 | seq_length = args.max_answer_seq_len + args.max_prompt_seq_len 64 | batch_size = args.per_device_generation_batch_size * args.generation_batches * args.ppo_epochs * gpus_per_model * 1 if args.unsupervised_dataset_name is None else 2 65 | samples_per_second = batch_size / e2e_time 66 | 67 | actor_checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3 68 | critic_checkpoint_activations_factor = 4 if args.critic_gradient_checkpointing else 3 69 | if args.actor_lora_dim > 0: 70 | k = args.actor_lora_dim * 2 / actor_hidden_size 71 | actor_checkpoint_activations_factor -= (1 - k) 72 | if args.critic_lora_dim > 0: 73 | k = args.critic_lora_dim * 2 / critic_hidden_size 74 | critic_checkpoint_activations_factor -= (1 - k) 75 | 76 | actor_model._num_params = sum([ 77 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 78 | for p in actor_model.parameters() 79 | ]) 80 | actor_params_in_billions = actor_model._num_params / (1e9) 81 | 82 | critic_model._num_params = sum([ 83 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 84 | for p in critic_model.parameters() 85 | ]) 86 | critic_params_in_billions = critic_model._num_params / (1e9) 87 | 88 | # Megatron paper's formula to calculate training flops 89 | 90 | actor_train_flops_per_iteration = calculate_flops( 91 | actor_checkpoint_activations_factor, batch_size, seq_length, 92 | actor_hf_config) 93 | critic_train_flops_per_iteration = calculate_flops( 94 | critic_checkpoint_activations_factor, batch_size, seq_length, 95 | critic_hf_config) 96 | 97 | total_train_flops = actor_train_flops_per_iteration + critic_train_flops_per_iteration 98 | train_tflops = total_train_flops / (train_time * gpus_per_model * 99 | (10**12)) 100 | 101 | gen_bs = args.per_device_generation_batch_size * gpus_per_model 102 | 103 | # Modified formula for calculating flops in the forward pass only 104 | gen_flops_per_iteration = ( 105 | 24 * gen_bs * seq_length * actor_num_layers * 106 | (actor_hidden_size**2)) * ( 107 | 1.0 + (seq_length / (6.0 * actor_hidden_size)) + 108 | (actor_vocab_size / 109 | (16.0 * actor_num_layers * actor_hidden_size))) 110 | 111 | gen_tflops = gen_flops_per_iteration / (gen_exp_time * gpus_per_model * 112 | (10**12)) 113 | 114 | if actor_hf_config.torch_dtype == torch.float16: 115 | num_bytes = 2 116 | elif actor_hf_config.torch_dtype == torch.float32: 117 | num_bytes = 4 118 | else: 119 | num_bytes = -1 120 | 121 | pertok_lat = gen_exp_time / args.max_answer_seq_len 122 | gen_bw = 1 / pertok_lat * actor_model._num_params * num_bytes / 1e9 123 | 124 | total_flops_per_iteration = total_train_flops + gen_flops_per_iteration * args.generation_batches 125 | total_tflops = total_flops_per_iteration / (e2e_time * gpus_per_model * 126 | (10**12)) 127 | 128 | print( 129 | f"End-to-End => Latency: {e2e_time:.2f}s, TFLOPs: {total_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Total Seq. Length: {seq_length}" 130 | ) 131 | print( 132 | f"Generation => Latency: {gen_exp_time:.2f}s, Per-token Latency {pertok_lat*1000:.2f} ms, TFLOPs: {gen_tflops:.2f}, BW: {gen_bw if num_bytes > 0 else num_bytes:.2f} GB/sec, Answer Seq. 
Length: {args.max_answer_seq_len}" 133 | ) 134 | print( 135 | f"Training => Latency: {train_time:.2f}s, TFLOPs: {train_tflops:.2f}" 136 | ) 137 | actor_param_string = f"{actor_params_in_billions:.3f} B" if actor_params_in_billions != 0 else "NA" 138 | critic_param_string = f"{critic_params_in_billions:.3f} B" if critic_params_in_billions != 0 else "NA" 139 | print( 140 | f"Actor Model Parameters => {actor_param_string}, Critic Model Parameters => {critic_param_string}" 141 | ) 142 | 143 | 144 | # Helper function to calculate FLOPs using the Megatron-LM paper's formula 145 | def calculate_flops(checkpoint_activations_factor, batch_size, seq_length, 146 | hf_config): 147 | num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) 148 | flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * 149 | seq_length * num_layers * (hidden_size**2)) * ( 150 | 1.0 + (seq_length / (6.0 * hidden_size)) + 151 | (vocab_size / 152 | (16.0 * num_layers * hidden_size))) 153 | return flops_per_iteration 154 | 155 | 156 | def get_hf_configs(hf_config): 157 | num_layers = getattr(hf_config, "num_hidden_layers", 158 | getattr(hf_config, "n_layer", None)) 159 | hidden_size = getattr(hf_config, "hidden_size", 160 | getattr(hf_config, "n_embd", None)) 161 | vocab_size = getattr(hf_config, "vocab_size", None) 162 | assert all( 163 | (num_layers, hidden_size, vocab_size) 164 | ), "Could not determine number of layers, hidden size, and vocab size of the model" 165 | 166 | return num_layers, hidden_size, vocab_size 167 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import random 8 | import numpy as np 9 | from transformers import set_seed, AutoTokenizer 10 | import json 11 | import deepspeed 12 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 13 | from deepspeed.accelerator import get_accelerator 14 | import torch.nn as nn 15 | 16 | 17 | def print_rank_0(msg, rank=None): 18 | if rank is not None and rank <= 0: 19 | print(msg) 20 | elif is_rank_0(): 21 | print(msg) 22 | 23 | 24 | def is_rank_0(): 25 | """Check whether it is rank 0.""" 26 | if torch.distributed.is_initialized(): 27 | if torch.distributed.get_rank() == 0: 28 | return True 29 | else: 30 | return False 31 | else: 32 | return True 33 | 34 | 35 | def to_device(batch, device): 36 | output = {} 37 | for k, v in batch.items(): 38 | try: 39 | output[k] = v.to(device) 40 | except: 41 | output[k] = v 42 | return output 43 | 44 | 45 | class MovingAverage: 46 | 47 | def __init__(self): 48 | self.count = 0 49 | self.total = 0 50 | self.mean = 0 51 | 52 | def update(self, num): 53 | self.total += num 54 | self.count += 1 55 | self.mean = self.total / self.count 56 | 57 | return self.mean 58 | 59 | 60 | class ExponentialMovingAverage: 61 | 62 | def __init__(self, alpha=0.9): 63 | self.alpha = alpha 64 | self.ema = None 65 | 66 | def update(self, num): 67 | prev_ema = num if self.ema is None else self.ema 68 | self.ema = self.alpha * prev_ema + (1.0 - self.alpha) * num 69 | return self.ema 70 | 71 | def get(self): 72 | return self.ema if self.ema is not None else 0. 
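# Worked example of the smoothing above (values computed by hand for alpha = 0.9):
#   ema = ExponentialMovingAverage(alpha=0.9)
#   ema.update(1.0)  # -> 1.0, the first sample seeds the average
#   ema.update(2.0)  # -> 0.9 * 1.0 + 0.1 * 2.0 = 1.1
#   ema.update(2.0)  # -> 0.9 * 1.1 + 0.1 * 2.0 = 1.19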
73 | 74 | 75 | def get_tokenizer(model_name_or_path, fast_tokenizer=True): 76 | if "llama" in model_name_or_path: 77 | from transformers.models.llama import LlamaTokenizer 78 | tokenizer = LlamaTokenizer.from_pretrained( 79 | model_name_or_path, fast_tokenizer=fast_tokenizer) 80 | if tokenizer.pad_token is None: 81 | # assert tokenizer.eos_token is not None 82 | # tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) 83 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 84 | tokenizer.padding_side = 'right' 85 | else: 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | model_name_or_path, fast_tokenizer=fast_tokenizer) 88 | tokenizer.pad_token = tokenizer.eos_token 89 | # make sure tokenizer is right pad in our logic 90 | tokenizer.padding_side = 'right' 91 | return tokenizer 92 | 93 | 94 | def load_hf_tokenizer(model_name_or_path, 95 | fast_tokenizer=True, 96 | add_special_tokens=None): 97 | if os.path.exists(model_name_or_path): 98 | # Locally tokenizer loading has some issue, so we need to force download 99 | model_json = os.path.join(model_name_or_path, "config.json") 100 | if os.path.exists(model_json): 101 | model_json_file = json.load(open(model_json)) 102 | model_name = model_json_file.get("_name_or_path", 103 | model_name_or_path) 104 | tokenizer = get_tokenizer(model_name, 105 | fast_tokenizer=fast_tokenizer) 106 | else: 107 | tokenizer = get_tokenizer(model_name_or_path, 108 | fast_tokenizer=fast_tokenizer) 109 | 110 | if add_special_tokens is not None: 111 | add_special_tokens = [add_special_tokens] if isinstance(add_special_tokens, str) \ 112 | else add_special_tokens 113 | tokenizer.add_special_tokens( 114 | {'additional_special_tokens': add_special_tokens}) 115 | 116 | return tokenizer 117 | 118 | 119 | def save_hf_format(model, tokenizer, args, sub_folder=""): 120 | # used to save huggingface format, so we can use it for hf.from_pretrained 121 | model_to_save = model.module if hasattr(model, 'module') else model 122 | CONFIG_NAME = "config.json" 123 | WEIGHTS_NAME = "pytorch_model.bin" 124 | output_dir = os.path.join(args.output_dir, sub_folder) 125 | os.makedirs(output_dir, exist_ok=True) 126 | output_model_file = os.path.join(output_dir, WEIGHTS_NAME) 127 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 128 | save_dict = model_to_save.state_dict() 129 | for key in list(save_dict.keys()): 130 | if "lora" in key: 131 | del save_dict[key] 132 | torch.save(save_dict, output_model_file) 133 | model_to_save.config.to_json_file(output_config_file) 134 | tokenizer.save_vocabulary(output_dir) 135 | 136 | 137 | def set_random_seed(seed): 138 | if seed is not None: 139 | set_seed(seed) 140 | random.seed(seed) 141 | np.random.seed(seed) 142 | torch.manual_seed(seed) 143 | get_accelerator().manual_seed_all(seed) 144 | 145 | 146 | def get_all_reduce_mean(tensor): 147 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) 148 | tensor = tensor / torch.distributed.get_world_size() 149 | return tensor 150 | 151 | 152 | # This function is a modified version of code available in the from_pretrained API of HuggingFace Transformers 153 | # The code is copied and modified from: https://github.com/huggingface/transformers/blob/5ee9693a1c77c617ebc43ef20194b6d3b674318e/src/transformers/modeling_utils.py#L498 154 | # This function helps load a HF format checkpoint into a DeepSpeed wrapped model that has been sharded using ZeRO Stage 3 155 | def load_state_dict_into_model(model_to_load=None, 156 | state_dict=None, 157 | start_prefix="", 158 | 
zero_stage=0): 159 | 160 | # copy state_dict so _load_from_state_dict can modify it 161 | metadata = getattr(state_dict, "_metadata", None) 162 | state_dict = state_dict.copy() 163 | if metadata is not None: 164 | state_dict._metadata = metadata 165 | 166 | error_msgs = [] 167 | 168 | # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants 169 | # so we need to apply the function recursively. 170 | def load(module: nn.Module, state_dict, prefix=""): 171 | local_metadata = {} if metadata is None else metadata.get( 172 | prefix[:-1], {}) 173 | args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) 174 | # Parameters of module and children will start with prefix. We can exit early if there are none in this 175 | # state_dict 176 | if len([key for key in state_dict if key.startswith(prefix)]) > 0: 177 | if zero_stage == 3: 178 | # In sharded models, each shard has only part of the full state_dict, so only gather 179 | # parameters that are in the current state_dict. 180 | named_parameters = dict( 181 | module.named_parameters(prefix=prefix[:-1], recurse=False)) 182 | params_to_gather = [ 183 | named_parameters[k] for k in state_dict.keys() 184 | if k in named_parameters 185 | ] 186 | if len(params_to_gather) > 0: 187 | # because zero3 puts placeholders in model params, this context 188 | # manager gathers (unpartitions) the params of the current layer, then loads from 189 | # the state dict and then re-partitions them again 190 | with deepspeed.zero.GatheredParameters(params_to_gather, 191 | modifier_rank=0): 192 | if torch.distributed.get_rank() == 0: 193 | module._load_from_state_dict(*args) 194 | else: 195 | module._load_from_state_dict(*args) 196 | 197 | for name, child in module._modules.items(): 198 | if child is not None: 199 | load(child, state_dict, prefix + name + ".") 200 | 201 | load(model_to_load, state_dict, prefix=start_prefix) 202 | # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so 203 | # it's safe to delete it. 
204 | del state_dict 205 | 206 | return error_msgs 207 | 208 | 209 | def get_optimizer_grouped_parameters( 210 | model, 211 | weight_decay, 212 | lora_lr=5e-4, 213 | no_decay_name_list=[ 214 | "bias", "layer_norm.weight", "layernorm.weight", "norm.weight", 215 | "ln_f.weight" 216 | ], 217 | lora_name_list=["lora_right_weight", "lora_left_weight"], 218 | ): 219 | optimizer_grouped_parameters = [ 220 | { 221 | "params": [ 222 | p for n, p in model.named_parameters() 223 | if (not any(nd in n.lower() for nd in no_decay_name_list) 224 | and p.requires_grad and not any(nd in n.lower() 225 | for nd in lora_name_list)) 226 | ], 227 | "weight_decay": 228 | weight_decay, 229 | }, 230 | { 231 | "params": [ 232 | p for n, p in model.named_parameters() 233 | if (not any(nd in n.lower() for nd in no_decay_name_list) 234 | and p.requires_grad and any(nd in n.lower() 235 | for nd in lora_name_list)) 236 | ], 237 | "weight_decay": 238 | weight_decay, 239 | "lr": 240 | lora_lr 241 | }, 242 | { 243 | "params": [ 244 | p for n, p in model.named_parameters() 245 | if (any(nd in n.lower() 246 | for nd in no_decay_name_list) and p.requires_grad) 247 | ], 248 | "weight_decay": 249 | 0.0, 250 | }, 251 | ] 252 | 253 | non_empty_groups = [] 254 | for group in optimizer_grouped_parameters: 255 | if group["params"]: 256 | non_empty_groups.append(group) 257 | return non_empty_groups 258 | 259 | 260 | def _z3_params_to_fetch(param_list): 261 | return [ 262 | p for p in param_list 263 | if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE 264 | ] 265 | 266 | 267 | def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): 268 | zero_stage_3 = (zero_stage == 3) 269 | with torch.no_grad(): 270 | for param, param_ema in zip(model.parameters(), 271 | model_ema.parameters()): 272 | # TODO: use prefiltering for efficiency 273 | params_to_fetch = _z3_params_to_fetch([param, param_ema 274 | ]) if zero_stage_3 else [] 275 | should_gather_param = len(params_to_fetch) > 0 276 | with deepspeed.zero.GatheredParameters( 277 | params_to_fetch, enabled=should_gather_param): 278 | data = param.data 279 | if device is not None: 280 | data = data.to(device) 281 | param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) 282 | 283 | 284 | def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): 285 | zero_stage_3 = (zero_stage == 3) 286 | os.makedirs(save_dir, exist_ok=True) 287 | WEIGHTS_NAME = "pytorch_model.bin" 288 | output_model_file = os.path.join(save_dir, WEIGHTS_NAME) 289 | 290 | model_to_save = model_ema.module if hasattr(model_ema, 291 | 'module') else model_ema 292 | if not zero_stage_3: 293 | if global_rank == 0: 294 | torch.save(model_to_save.state_dict(), output_model_file) 295 | else: 296 | output_state_dict = {} 297 | for k, v in model_to_save.named_parameters(): 298 | 299 | if hasattr(v, 'ds_id'): 300 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v 301 | ]), 302 | enabled=zero_stage_3): 303 | v_p = v.data.cpu() 304 | else: 305 | v_p = v.cpu() 306 | if global_rank == 0 and "lora" not in k: 307 | output_state_dict[k] = v_p 308 | if global_rank == 0: 309 | torch.save(output_state_dict, output_model_file) 310 | del output_state_dict 311 | -------------------------------------------------------------------------------- /deepspeed_example/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | 
deepspeed>=0.9.0 7 | transformers>=4.31.0,!=4.33.2 8 | tensorboard 9 | -------------------------------------------------------------------------------- /deepspeed_example/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | ZERO_STAGE=$1 7 | OUTPUT=./output_llama2_7b 8 | if [ "$ZERO_STAGE" == "" ]; then 9 | ZERO_STAGE=3 10 | fi 11 | mkdir -p $OUTPUT 12 | 13 | deepspeed main.py \ 14 | --data_split 2,4,4 \ 15 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 16 | --per_device_train_batch_size 1 \ 17 | --per_device_eval_batch_size 4 \ 18 | --max_seq_len 512 \ 19 | --learning_rate 9.65e-6 \ 20 | --weight_decay 0. \ 21 | --num_train_epochs 3 \ 22 | --gradient_accumulation_steps 4 \ 23 | --lr_scheduler_type cosine \ 24 | --num_warmup_steps 0 \ 25 | --seed 1234 \ 26 | --gradient_checkpointing \ 27 | --dtype bf16 \ 28 | --zero_stage $ZERO_STAGE \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT \ 31 | #&> $OUTPUT/training.log 32 | -------------------------------------------------------------------------------- /deepspeed_example/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # setup.py: install script for deepspeed_chat 8 | """ 9 | to install deepspeed_chat and its dependencies for development work, 10 | run this cmd from the root directory: 11 | pip install -e . 12 | """ 13 | import setuptools 14 | 15 | setuptools.setup( 16 | name="deepspeed-chat", 17 | version="0.1", 18 | url= 19 | "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", 20 | include_package_data=True, 21 | packages=setuptools.find_packages(include=['dschat']), 22 | install_requires=[ 23 | "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", 24 | "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", 25 | "transformers>=4.31.0,!=4.33.2", "tensorboard" 26 | ], 27 | extras_require={ 28 | "azureml": [ 29 | "azure-ml-component", 30 | "azureml-core", 31 | ], 32 | }) 33 | -------------------------------------------------------------------------------- /imgs/broadcast_matrix_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_matrix_matrix.png -------------------------------------------------------------------------------- /imgs/broadcast_matrix_vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_matrix_vector.png -------------------------------------------------------------------------------- /imgs/broadcast_mv_scalar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_mv_scalar.png -------------------------------------------------------------------------------- /imgs/broadcast_rule.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_rule.png -------------------------------------------------------------------------------- /imgs/data_storage_operators.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/data_storage_operators.png -------------------------------------------------------------------------------- /imgs/high_level_abstraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/high_level_abstraction.png -------------------------------------------------------------------------------- /imgs/reduce.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/reduce.jpg -------------------------------------------------------------------------------- /imgs/strides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/strides.png -------------------------------------------------------------------------------- /mini_tensorflow/mini_tensorflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "class Graph():\n", 12 | " \"\"\" Define a computation graph, which contains\n", 13 | " placeholders: to store the input data\n", 14 | " variables: to store the network parameters\n", 15 | " constants: some static data\n", 16 | " operations: the mathmatical operations for each neural network layer\n", 17 | " \"\"\"\n", 18 | " def __init__(self):\n", 19 | " \"\"\"\n", 20 | " Please define attributes for placeholders, variables, constants, operations,\n", 21 | " using lists.\n", 22 | " \"\"\"\n", 23 | " [implement]\n", 24 | "\n", 25 | " def as_default(self):\n", 26 | " \"\"\"\n", 27 | " define a global default computation graph\n", 28 | " \"\"\"\n", 29 | " global _default_graph\n", 30 | " _default_graph = [implement]\n", 31 | " \n", 32 | " def add_operation(self, op):\n", 33 | " \"\"\"\n", 34 | " add op to operations\n", 35 | " \"\"\"\n", 36 | " [implement]\n", 37 | "\n", 38 | " def add_placeholder(self, holder):\n", 39 | " \"\"\"\n", 40 | " add holder to placeholders\n", 41 | " \"\"\"\n", 42 | " [implement]\n", 43 | " \n", 44 | " def add_variable(self, var):\n", 45 | " \"\"\"\n", 46 | " add var to variables\n", 47 | " \"\"\"\n", 48 | " [implement]\n", 49 | " \n", 50 | " def add_constant(self, c):\n", 51 | " \"\"\"\n", 52 | " add c to constants\n", 53 | " \"\"\"\n", 54 | " [implement]\n", 55 | "\n", 56 | "class Operation():\n", 57 | " \"\"\"\n", 58 | " Define network operation node. 
It should contain input nodes and one output node.\n", 59 | " \"\"\"\n", 60 | " def __init__(self, input_nodes=None):\n", 61 | " \"\"\"\n", 62 | " Define two attributes here: \n", 63 | " input_nodes\n", 64 | " output\n", 65 | " Add the current Operation node to _default_graph's operations\n", 66 | " \"\"\"\n", 67 | " [implement]\n", 68 | " \n", 69 | " # Append operation to the list of operations of the default graph\n", 70 | " [implement]\n", 71 | "\n", 72 | " def forward(self):\n", 73 | " pass\n", 74 | "\n", 75 | " def backward(self):\n", 76 | " pass\n", 77 | "\n", 78 | "\n", 79 | "class BinaryOperation(Operation):\n", 80 | " \"\"\"\n", 81 | " define binary operations\n", 82 | " \"\"\"\n", 83 | " def __init__(self, a, b):\n", 84 | " \"\"\"\n", 85 | " a, b are input nodes. please initialize\n", 86 | " \"\"\"\n", 87 | " [implement]\n", 88 | "\n", 89 | "class add(BinaryOperation):\n", 90 | " \"\"\"\n", 91 | " Computes a + b, element-wise\n", 92 | " \"\"\"\n", 93 | " def forward(self, a, b):\n", 94 | " [implement]\n", 95 | "\n", 96 | " def backward(self, upstream_grad):\n", 97 | " raise NotImplementedError\n", 98 | "\n", 99 | "class multiply(BinaryOperation):\n", 100 | " \"\"\"\n", 101 | " Computes a * b, element-wise\n", 102 | " \"\"\"\n", 103 | " def forward(self, a, b):\n", 104 | " [implement]\n", 105 | "\n", 106 | " def backward(self, upstream_grad):\n", 107 | " raise NotImplementedError\n", 108 | "\n", 109 | "class divide(BinaryOperation):\n", 110 | " \"\"\"\n", 111 | " Returns the true division of the inputs, element-wise\n", 112 | " \"\"\"\n", 113 | " def forward(self, a, b):\n", 114 | " return np.true_divide(a, b)\n", 115 | "\n", 116 | " def backward(self, upstream_grad):\n", 117 | " raise NotImplementedError\n", 118 | "\n", 119 | "class matmul(BinaryOperation):\n", 120 | " \"\"\"\n", 121 | " Multiplies matrix a by matrix b, producing a * b\n", 122 | " \"\"\"\n", 123 | " def forward(self, a, b):\n", 124 | " \"\"\"\n", 125 | " using numpy.dot to perform matrix multiplication on a and b\n", 126 | " \"\"\"\n", 127 | " [implement]\n", 128 | "\n", 129 | " def backward(self, upstream_grad):\n", 130 | " raise NotImplementedError\n", 131 | " \n", 132 | "class Placeholder():\n", 133 | " \"\"\"\n", 134 | " define placeholder. 
It should contain a value attribute to store value\n", 135 | " \"\"\"\n", 136 | " def __init__(self):\n", 137 | " \"\"\"\n", 138 | " initialize the value to None, add the current node to default graph's placeholder\n", 139 | " \"\"\"\n", 140 | " [implement]\n", 141 | "\n", 142 | "class Constant():\n", 143 | " \"\"\"\n", 144 | " Define a constant node\n", 145 | " \"\"\"\n", 146 | " def __init__(self, value=None):\n", 147 | " \"\"\"\n", 148 | " define internal __value to store the value, add the current node to graph's constant\n", 149 | " \"\"\"\n", 150 | " [implement]\n", 151 | "\n", 152 | " @property\n", 153 | " def value(self):\n", 154 | " \"\"\"\n", 155 | " return the internal value\n", 156 | " \"\"\"\n", 157 | " [implement]\n", 158 | "\n", 159 | " @value.setter\n", 160 | " def value(self, value):\n", 161 | " raise ValueError(\"Cannot reassign value.\")\n", 162 | " \n", 163 | "class Variable():\n", 164 | " \"\"\"\n", 165 | " define a variable node (for parameter) with initial_value\n", 166 | " \"\"\"\n", 167 | " def __init__(self, initial_value=None):\n", 168 | " \"\"\"\n", 169 | " assign initial_value to value, add the current node to graph's variables\n", 170 | " \"\"\"\n", 171 | " [implement]\n", 172 | "\n", 173 | "def topology_sort(operation):\n", 174 | " \"\"\"\n", 175 | " implement topological sort to order the operations, starting from current node\n", 176 | " \"\"\"\n", 177 | " ordering = []\n", 178 | " visited_nodes = set()\n", 179 | "\n", 180 | " def recursive_helper(node):\n", 181 | " \"\"\"\n", 182 | " for each Operation node (using isinstance)\n", 183 | " recursively find the incoming nodes, visit them first and add node to visited nodes. \n", 184 | " \"\"\"\n", 185 | " [implement]\n", 186 | "\n", 187 | " # start recursive depth-first search\n", 188 | " [implement]\n", 189 | "\n", 190 | " return ordering\n", 191 | "\n", 192 | "# session = Session()\n", 193 | "# output = session.run(some_operation, {\n", 194 | "# X: train_X # [1,2,...,n_features]\n", 195 | "# })\n", 196 | "\n", 197 | "class Session():\n", 198 | " \"\"\"\n", 199 | " A session provides a context to run the computation graph\n", 200 | " \"\"\"\n", 201 | " def run(self, operation, feed_dict={}):\n", 202 | " \"\"\"\n", 203 | " apply topological sort on the computation graph starting from the operation node\n", 204 | " operation is the final operation node\n", 205 | " feed_dict: a dictionary that maps Placeholder to actual data value (in numpy)\n", 206 | " if a node is a placeholder, it should take value from feed_dict, \n", 207 | " if a node is variable or constant, it just use the node's value\n", 208 | " it a node is an operation, it should get the node's input_nodes, and then apply forward\n", 209 | " \"\"\"\n", 210 | " nodes_sorted = topology_sort(operation)\n", 211 | "\n", 212 | " for node in nodes_sorted:\n", 213 | " [implement]\n", 214 | "\n", 215 | " return operation.output" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 8, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "2.7\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# create default graph\n", 233 | "Graph().as_default()\n", 234 | "\n", 235 | "# construct computational graph by creating some nodes\n", 236 | "# implement a simple network for y = a * x + b\n", 237 | "# a and b are constant\n", 238 | "# x is input\n", 239 | "[implement]\n", 240 | "\n", 241 | "\n", 242 | "# create a session object\n", 243 | "[implement]\n", 244 | "\n", 245 | 
"# run computational graph to compute the output for 'res'\n", 246 | "[implement]\n" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "base", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.9.18" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 2 271 | } 272 | -------------------------------------------------------------------------------- /mini_tensorflow/mini_tensorflow_full.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "class Graph():\n", 12 | " \"\"\" Define a computation graph, which contains\n", 13 | " placeholders: to store the input data\n", 14 | " variables: to store the network parameters\n", 15 | " constants: some static data\n", 16 | " operations: the mathmatical operations for each neural network layer\n", 17 | " \"\"\"\n", 18 | " def __init__(self):\n", 19 | " \"\"\"\n", 20 | " Please define attributes for placeholders, variables, constants, operations,\n", 21 | " using lists.\n", 22 | " \"\"\"\n", 23 | " self.operations = []\n", 24 | " self.placeholders = []\n", 25 | " self.variables = []\n", 26 | " self.constants = []\n", 27 | "\n", 28 | " def as_default(self):\n", 29 | " \"\"\"\n", 30 | " define a global default computation graph\n", 31 | " \"\"\"\n", 32 | " global _default_graph\n", 33 | " _default_graph = self\n", 34 | " \n", 35 | " def add_operation(self, op):\n", 36 | " \"\"\"\n", 37 | " add op to operations\n", 38 | " \"\"\"\n", 39 | " self.operations.append(op)\n", 40 | "\n", 41 | " def add_placeholder(self, holder):\n", 42 | " \"\"\"\n", 43 | " add holder to placeholders\n", 44 | " \"\"\"\n", 45 | " self.placeholders.append(holder)\n", 46 | " \n", 47 | " def add_variable(self, var):\n", 48 | " \"\"\"\n", 49 | " add var to variables\n", 50 | " \"\"\"\n", 51 | " self.variables.append(var)\n", 52 | " \n", 53 | " def add_constant(self, c):\n", 54 | " \"\"\"\n", 55 | " add c to constants\n", 56 | " \"\"\"\n", 57 | " self.constants.append(c)\n", 58 | "\n", 59 | "class Operation():\n", 60 | " \"\"\"\n", 61 | " Define network operation node. It should contain input nodes and one output node.\n", 62 | " \"\"\"\n", 63 | " def __init__(self, input_nodes=None):\n", 64 | " \"\"\"\n", 65 | " Define two attributes here: \n", 66 | " input_nodes\n", 67 | " output\n", 68 | " Add the current Operation node to _default_graph's operations\n", 69 | " \"\"\"\n", 70 | " self.input_nodes = input_nodes\n", 71 | " self.output = None\n", 72 | " \n", 73 | " # Append operation to the list of operations of the default graph\n", 74 | " _default_graph.add_operation(self)\n", 75 | "\n", 76 | " def forward(self):\n", 77 | " pass\n", 78 | "\n", 79 | " def backward(self):\n", 80 | " pass\n", 81 | "\n", 82 | "\n", 83 | "class BinaryOperation(Operation):\n", 84 | " \"\"\"\n", 85 | " define binary operations\n", 86 | " \"\"\"\n", 87 | " def __init__(self, a, b):\n", 88 | " \"\"\"\n", 89 | " a, b are input nodes. 
please initialize\n", 90 | " \"\"\"\n", 91 | " super().__init__([a, b])\n", 92 | "\n", 93 | "class add(BinaryOperation):\n", 94 | " \"\"\"\n", 95 | " Computes a + b, element-wise\n", 96 | " \"\"\"\n", 97 | " def forward(self, a, b):\n", 98 | " return a + b\n", 99 | "\n", 100 | " def backward(self, upstream_grad):\n", 101 | " raise NotImplementedError\n", 102 | "\n", 103 | "class multiply(BinaryOperation):\n", 104 | " \"\"\"\n", 105 | " Computes a * b, element-wise\n", 106 | " \"\"\"\n", 107 | " def forward(self, a, b):\n", 108 | " return a * b\n", 109 | "\n", 110 | " def backward(self, upstream_grad):\n", 111 | " raise NotImplementedError\n", 112 | "\n", 113 | "class divide(BinaryOperation):\n", 114 | " \"\"\"\n", 115 | " Returns the true division of the inputs, element-wise\n", 116 | " \"\"\"\n", 117 | " def forward(self, a, b):\n", 118 | " return np.true_divide(a, b)\n", 119 | "\n", 120 | " def backward(self, upstream_grad):\n", 121 | " raise NotImplementedError\n", 122 | "\n", 123 | "class matmul(BinaryOperation):\n", 124 | " \"\"\"\n", 125 | " Multiplies matrix a by matrix b, producing a * b\n", 126 | " \"\"\"\n", 127 | " def forward(self, a, b):\n", 128 | " \"\"\"\n", 129 | " using numpy.dot to perform matrix multiplication on a and b\n", 130 | " \"\"\" \n", 131 | " return a.dot(b)\n", 132 | "\n", 133 | " def backward(self, upstream_grad):\n", 134 | " raise NotImplementedError\n", 135 | " \n", 136 | "class Placeholder():\n", 137 | " \"\"\"\n", 138 | " define placeholder. It should contain a value attribute to store value\n", 139 | " \"\"\"\n", 140 | " def __init__(self):\n", 141 | " \"\"\"\n", 142 | " initialize the value to None, add the current node to default graph's placeholder\n", 143 | " \"\"\" \n", 144 | " self.value = None\n", 145 | " _default_graph.add_placeholder(self)\n", 146 | "\n", 147 | "class Constant():\n", 148 | " \"\"\"\n", 149 | " Define a constant node\n", 150 | " \"\"\"\n", 151 | " def __init__(self, value=None):\n", 152 | " \"\"\"\n", 153 | " define internal __value to store the value, add the current node to graph's constant\n", 154 | " \"\"\" \n", 155 | " self.__value = value\n", 156 | " _default_graph.add_constant(self)\n", 157 | "\n", 158 | " @property\n", 159 | " def value(self):\n", 160 | " \"\"\"\n", 161 | " return the internal value\n", 162 | " \"\"\"\n", 163 | " return self.__value\n", 164 | "\n", 165 | " @value.setter\n", 166 | " def value(self, value):\n", 167 | " raise ValueError(\"Cannot reassign value.\")\n", 168 | " \n", 169 | "class Variable():\n", 170 | " \"\"\"\n", 171 | " define a variable node (for parameter) with initial_value\n", 172 | " \"\"\"\n", 173 | " def __init__(self, initial_value=None):\n", 174 | " \"\"\"\n", 175 | " assign initial_value to value, add the current node to graph's variables\n", 176 | " \"\"\" \n", 177 | " self.value = initial_value\n", 178 | " _default_graph.add_variable(self)\n", 179 | "\n", 180 | "def topology_sort(operation):\n", 181 | " \"\"\"\n", 182 | " implement topological sort to order the operations, starting from current node\n", 183 | " \"\"\"\n", 184 | " ordering = []\n", 185 | " visited_nodes = set()\n", 186 | "\n", 187 | " def recursive_helper(node):\n", 188 | " \"\"\"\n", 189 | " for each Operation node (using isinstance)\n", 190 | " recursively find the incoming nodes, visit them first and add node to visited nodes. 
\n", 191 | " \"\"\" \n", 192 | " if isinstance(node, Operation):\n", 193 | " for input_node in node.input_nodes:\n", 194 | " if input_node not in visited_nodes:\n", 195 | " recursive_helper(input_node)\n", 196 | "\n", 197 | " visited_nodes.add(node)\n", 198 | " ordering.append(node)\n", 199 | "\n", 200 | " # start recursive depth-first search\n", 201 | " recursive_helper(operation)\n", 202 | "\n", 203 | " return ordering\n", 204 | "\n", 205 | "# session = Session()\n", 206 | "# output = session.run(some_operation, {\n", 207 | "# X: train_X # [1,2,...,n_features]\n", 208 | "# })\n", 209 | "\n", 210 | "class Session():\n", 211 | " \"\"\"\n", 212 | " A session provides a context to run the computation graph\n", 213 | " \"\"\"\n", 214 | " def run(self, operation, feed_dict={}):\n", 215 | " \"\"\"\n", 216 | " apply topological sort on the computation graph starting from the operation node\n", 217 | " operation is the final operation node\n", 218 | " feed_dict: a dictionary that maps Placeholder to actual data value (in numpy)\n", 219 | " if a node is a placeholder, it should take value from feed_dict, \n", 220 | " if a node is variable or constant, it just use the node's value\n", 221 | " it a node is an operation, it should get the node's input_nodes, and then apply forward\n", 222 | " \"\"\"\n", 223 | " nodes_sorted = topology_sort(operation)\n", 224 | "\n", 225 | " for node in nodes_sorted:\n", 226 | " if type(node) == Placeholder:\n", 227 | " node.output = feed_dict[node]\n", 228 | " elif type(node) == Variable or type(node) == Constant:\n", 229 | " node.output = node.value\n", 230 | " else:\n", 231 | " inputs = [node.output for node in node.input_nodes]\n", 232 | " node.output = node.forward(*inputs)\n", 233 | "\n", 234 | " return operation.output" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "2.7\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# create default graph\n", 252 | "Graph().as_default()\n", 253 | "\n", 254 | "# construct computational graph by creating some nodes\n", 255 | "# implement a simple network for y = a * x + b\n", 256 | "a = Constant(np.array([2.0, 1.5]))\n", 257 | "b = Constant(0.5)\n", 258 | "x = Placeholder()\n", 259 | "x2 = matmul(a, x)\n", 260 | "y = add(x2, b)\n", 261 | "\n", 262 | "x_data = np.array([0.5, 0.8])\n", 263 | "input_data = {x: x_data}\n", 264 | "\n", 265 | "# create a session object\n", 266 | "session = Session()\n", 267 | "\n", 268 | "# run computational graph to compute the output for 'res'\n", 269 | "out = session.run(y, input_data)\n", 270 | "print(out)" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "base", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.9.18" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_matadd.cu: -------------------------------------------------------------------------------- 1 | // This program computes a simple version of matrix multiplication 2 | #include 3 | #include 4 | #include 5 | 
#include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using std::generate; 14 | using std::vector; 15 | 16 | 17 | __global__ void matrixAdd(const int * a, const int * b, 18 | int * c, int N) { 19 | // Compute each thread's global row and column index 20 | int row = blockIdx.x * blockDim.x + threadIdx.x; 21 | int col = blockIdx.y * blockDim.y + threadIdx.y; 22 | 23 | // Iterate over row, and down column 24 | if (row < N && col < N) { 25 | c[row * N + col] = a[row * N + col] + b[row * N + col]; 26 | } 27 | } 28 | 29 | 30 | 31 | // Check result on the CPU 32 | void verify_result_add(vector &a, vector &b, vector &c, int N) { 33 | // For every row... 34 | for (int i = 0; i < N; i++) { 35 | // For every column... 36 | for (int j = 0; j < N; j++) { 37 | // For every element in the row-column pair 38 | // Check against the CPU result 39 | if (a[i * N + j] + b[i * N + j] != c[i * N + j]) { 40 | printf("Error in (%d, %d): %d + %d != %d\n", i, j, a[i * N + j], b[i * N + j], c[i * N + j]); 41 | } 42 | assert(a[i * N + j] + b[i * N + j] == c[i * N + j]); 43 | } 44 | } 45 | } 46 | 47 | int main() { 48 | // Matrix size of 1024 x 1024; 49 | int N = 1 << 10; 50 | 51 | // Size (in bytes) of matrix 52 | size_t bytes = N * N * sizeof(int); 53 | 54 | // Host vectors 55 | vector h_a(N * N); 56 | vector h_b(N * N); 57 | vector h_c(N * N); 58 | 59 | // Initialize matrices 60 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 61 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 62 | 63 | 64 | // Allocate device memory 65 | int *d_a, *d_b, *d_c; 66 | cudaMalloc(&d_a, bytes); 67 | cudaMalloc(&d_b, bytes); 68 | cudaMalloc(&d_c, bytes); 69 | 70 | 71 | // Copy data to the device 72 | cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice); 73 | cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice); 74 | 75 | 76 | // Threads per CTA dimension 77 | int THREADS = 32; 78 | 79 | // Blocks per grid dimension (assumes THREADS divides N evenly) 80 | int BLOCKS = N / THREADS; 81 | 82 | // Use dim3 structs for block and grid dimensions 83 | dim3 threads(THREADS, THREADS); // should be <= 1024 84 | dim3 blocks(BLOCKS, BLOCKS); 85 | 86 | // Launch kernel 87 | matrixAdd<<>>(d_a, d_b, d_c, N); 88 | 89 | // Copy back to the host 90 | cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost); 91 | 92 | cudaDeviceSynchronize(); 93 | 94 | // Check CUDA execution 95 | cudaError_t err = cudaGetLastError(); 96 | if (err != cudaSuccess) { 97 | fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); 98 | // Handle the error (e.g., by exiting the program) 99 | } 100 | 101 | // Check result 102 | verify_result_add(h_a, h_b, h_c, N); 103 | 104 | cout << "COMPLETED SUCCESSFULLY\n"; 105 | 106 | // Free memory on device 107 | cudaFree(d_a); 108 | cudaFree(d_b); 109 | cudaFree(d_c); 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_matmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | __global__ void MatmulKernel(const float* a, const float* b, float* out, 9 | int M, int N, int P) { 10 | // Calculate the global thread index and the row and column it corresponds to 11 | // Every thread will compute one element of the output matrix 12 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 13 | int row = idx / P; 14 | int col = idx % P; 15 | // Compute the 
summation of the dot product of the row of a and the column of b 16 | if (row < M && col < P) { 17 | float sum = 0.0; 18 | for (int i = 0; i < N; i++) { 19 | sum += a[row * N + i] * b[i * P + col]; 20 | } 21 | out[row * P + col] = sum; 22 | } 23 | } 24 | 25 | extern "C" { 26 | 27 | // This function takes in arrays which are already on the GPU 28 | // and will return arrays which are also on the GPU 29 | // Copying values between the device memory and host memory is done in the Python code 30 | 31 | void Matmul(const float* a, const float* b, float* c, int M, int N, int P) { 32 | int n = M * P; 33 | int threads_per_block = 256; 34 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 35 | MatmulKernel<<<num_blocks, threads_per_block>>>(a, b, c, M, N, P); 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /simple_cuda_demo/example_matmul2.cu: -------------------------------------------------------------------------------- 1 | // This program computes a simple version of matrix multiplication 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::cout; 11 | using std::generate; 12 | using std::vector; 13 | 14 | __global__ void matrixMul(const int *a, const int *b, int *c, int M, int N, int P) { 15 | // Compute each thread's global row and column index 16 | int row = blockIdx.x * blockDim.x + threadIdx.x; 17 | int col = blockIdx.y * blockDim.y + threadIdx.y; 18 | if (row >= M || col >= P) return; 19 | // Iterate over the row of a and down the column of b 20 | c[row * P + col] = 0; 21 | for (int k = 0; k < N; k++) { 22 | // Accumulate results for a single element 23 | c[row * P + col] += a[row * N + k] * b[k * P + col]; 24 | } 25 | } 26 | 27 | // Check result on the CPU 28 | void verify_result(vector<int> &a, vector<int> &b, vector<int> &c, int M, int N, int P) { 29 | // For every row... 30 | for (int i = 0; i < M; i++) { 31 | // For every column... 
32 | for (int j = 0; j < P; j++) { 33 | // For every element in the row-column pair 34 | int tmp = 0; 35 | for (int k = 0; k < N; k++) { 36 | // Accumulate the partial results 37 | tmp += a[i * N + k] * b[k * P + j]; 38 | } 39 | 40 | // Check against the CPU result 41 | if (tmp != c[i * P + j]) { 42 | printf("Error in (%d, %d): %d != %d\n", i, j, tmp, c[i * P + j]); 43 | } 44 | assert(tmp == c[i * P + j]); 45 | } 46 | } 47 | } 48 | 49 | int main() { 50 | // Matrix size of 256 x 1024, 1024 x 512; 51 | int M = 1 << 8; 52 | int N = 1 << 10; 53 | int P = 1 << 9; 54 | 55 | 56 | // Host vectors 57 | vector h_a(M * N); 58 | vector h_b(N * P); 59 | vector h_c(M * P); 60 | 61 | // Initialize matrices 62 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 63 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 64 | 65 | // Allocate device memory 66 | int *d_a, *d_b, *d_c; 67 | cudaMalloc(&d_a, M * N * sizeof(int)); 68 | cudaMalloc(&d_b, N * P * sizeof(int)); 69 | cudaMalloc(&d_c, M * P * sizeof(int)); 70 | 71 | // Copy data to the device 72 | cudaMemcpy(d_a, h_a.data(), M * N * sizeof(int), cudaMemcpyHostToDevice); 73 | cudaMemcpy(d_b, h_b.data(), N * P * sizeof(int), cudaMemcpyHostToDevice); 74 | 75 | // Threads per CTA dimension 76 | int THREADS = 32; 77 | 78 | // Blocks per grid dimension (assumes THREADS divides N evenly) 79 | int BLOCKS_X = M / THREADS, BLOCKS_Y = P / THREADS; 80 | 81 | // Use dim3 structs for block and grid dimensions 82 | dim3 threads(THREADS, THREADS); 83 | dim3 blocks(BLOCKS_X, BLOCKS_Y); 84 | 85 | // Launch kernel 86 | matrixMul<<>>(d_a, d_b, d_c, M, N, P); 87 | 88 | // Copy back to the host 89 | cudaMemcpy(h_c.data(), d_c, M * P * sizeof(int), cudaMemcpyDeviceToHost); 90 | 91 | // Check result 92 | verify_result(h_a, h_b, h_c, M, N, P); 93 | 94 | cout << "COMPLETED SUCCESSFULLY\n"; 95 | 96 | // Free memory on device 97 | cudaFree(d_a); 98 | cudaFree(d_b); 99 | cudaFree(d_c); 100 | 101 | return 0; 102 | } 103 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_vector_add.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | using std::generate; 13 | using std::vector; 14 | 15 | __global__ void VecAddKernel(int* A, int* B, int* C, int n) { 16 | // blockDim is size of block along x-axis 17 | // blockIdx is the index of the current thread's block 18 | // threadIdx is the index of the current thread within the block 19 | // Compute the global thread ID 20 | int i = blockDim.x * blockIdx.x + threadIdx.x; 21 | if (i < n) { 22 | // Calculate the addition of the ith element of A and B 23 | C[i] = A[i] + B[i]; 24 | } 25 | } 26 | 27 | 28 | extern "C" { 29 | 30 | void VecAddCPU(int* A, int* B, int* C, int n) { 31 | for(int i = 0; i < n; ++i) { 32 | C[i] = A[i] + B[i]; 33 | } 34 | } 35 | 36 | 37 | void VecAddCUDA(int* Agpu, int* Bgpu, int* Cgpu, int n) { 38 | // In this example, we load the data into the GPU by Python codes. 39 | int threads_per_block = 256; 40 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 41 | VecAddKernel<<>>(Agpu, Bgpu, Cgpu, n); 42 | } 43 | 44 | 45 | void VecAddCUDA2(int* Acpu, int* Bcpu, int* Ccpu, int n) { 46 | // In this example, we load the data into the GPU by C++ codes. 
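    // The body below follows the standard host-managed CUDA workflow:
    // (1) allocate device buffers with cudaMalloc, (2) copy the inputs host-to-device,
    // (3) launch the kernel, (4) copy the result device-to-host, (5) free the device buffers.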
47 | int *dA, *dB, *dC; 48 | // Allocate device memory 49 | cudaMalloc(&dA, n * sizeof(int)); 50 | cudaMalloc(&dB, n * sizeof(int)); 51 | cudaMalloc(&dC, n * sizeof(int)); 52 | // Copy data from host memory to device memory 53 | cudaMemcpy(dA, Acpu, n * sizeof(int), cudaMemcpyHostToDevice); 54 | cudaMemcpy(dB, Bcpu, n * sizeof(int), cudaMemcpyHostToDevice); 55 | // Launch the CUDA kernel 56 | int threads_per_block = 256; 57 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 58 | VecAddKernel<<>>(dA, dB, dC, n); 59 | // Copy the result from device memory to host memory 60 | cudaMemcpy(Ccpu, dC, n * sizeof(int), cudaMemcpyDeviceToHost); 61 | // Free device memory 62 | cudaFree(dA); 63 | cudaFree(dB); 64 | cudaFree(dC); 65 | } 66 | 67 | } 68 | 69 | // Check result on the CPU 70 | void verify_result_vecadd(vector &a, vector &b, vector &c, int N) { 71 | // For every element... 72 | for (int i = 0; i < N; i++) { 73 | // For every element in the row-column pair 74 | // Check against the CPU result 75 | if (a[i] + b[i] != c[i]) { 76 | printf("Error in (%d): %d + %d != %d\n", i, a[i], b[i], c[i]); 77 | } 78 | assert(a[i] + b[i] == c[i]); 79 | } 80 | } 81 | 82 | int main() { 83 | // length of the vector 84 | int n = 1024; 85 | 86 | // Host vectors 87 | vector h_a(n); 88 | vector h_b(n); 89 | vector h_c(n); 90 | 91 | // Initialize matrices 92 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 93 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 94 | 95 | VecAddCUDA2(h_a.data(), h_b.data(), h_c.data(), n); 96 | 97 | cudaDeviceSynchronize(); 98 | 99 | // Check CUDA execution 100 | cudaError_t err = cudaGetLastError(); 101 | if (err != cudaSuccess) { 102 | fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); 103 | // Handle the error (e.g., by exiting the program) 104 | } 105 | 106 | // Check result 107 | verify_result_vecadd(h_a, h_b, h_c, n); 108 | 109 | cout << "Vector add verified! 
COMPLETED SUCCESSFULLY\n"; 110 | } -------------------------------------------------------------------------------- /simple_cuda_demo/example_window_sum.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #define RADIUS 2 9 | #define THREADS_PER_BLOCK 4 10 | 11 | 12 | __global__ void WindowSumSimpleKernel(float* A, float *B, int n) { 13 | // Each thread will compute one element of B, calculate the global index of the element 14 | int out_idx = blockDim.x * blockIdx.x + threadIdx.x; 15 | if (out_idx < n) { 16 | // Calculate the sum of the elements in the window of 5 17 | float sum = 0; 18 | for (int dx = -RADIUS; dx <= RADIUS; ++dx) { 19 | sum += A[dx + out_idx + RADIUS]; 20 | } 21 | B[out_idx] = sum; 22 | } 23 | } 24 | 25 | __global__ void WindowSumSharedKernel(float* A, float *B, int size_a, int size_b) { 26 | __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS]; 27 | int base = blockDim.x * blockIdx.x; 28 | int out_idx = base + threadIdx.x; 29 | // Load the elements into the shared memory 30 | if (base + threadIdx.x < size_a) { 31 | temp[threadIdx.x] = A[base + threadIdx.x]; 32 | } 33 | if (threadIdx.x < 2 * RADIUS && base + THREADS_PER_BLOCK + threadIdx.x < size_a) { 34 | temp[threadIdx.x + THREADS_PER_BLOCK] = A[base + THREADS_PER_BLOCK + threadIdx.x]; 35 | } 36 | // Wait for all threads to finish loading, after this point, all threads will have the same copy of the shared memory 37 | __syncthreads(); 38 | // Until then, start calculating the sum of the elements in the window of 5 39 | if (out_idx < size_b) { 40 | float sum = 0; 41 | for (int dx = -RADIUS; dx <= RADIUS; ++dx) { 42 | // Accessing shared memory within blocks is faster than accessing global memory 43 | sum += temp[threadIdx.x + dx + RADIUS]; 44 | } 45 | B[out_idx] = sum; 46 | } 47 | } 48 | 49 | extern "C" { 50 | 51 | // These two functions take in arrays which are already on the GPU 52 | // and will return arrays which are also on the GPU 53 | // Copying values between the device memory and host memory is done in the python codes 54 | 55 | void WindowSumSimple(float* in_array, float* out_array, int n) { 56 | int num_blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 57 | WindowSumSimpleKernel<<>>(in_array, out_array, n); 58 | } 59 | 60 | void WindowSumShared(float* in_array, float* out_array, int size_a, int size_b) { 61 | int num_blocks = (size_b + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 62 | WindowSumSharedKernel<<>>(in_array, out_array, size_a, size_b); 63 | cudaDeviceSynchronize(); 64 | } 65 | } -------------------------------------------------------------------------------- /simple_cuda_demo/test_matmul.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | 10 | # Load the shared library 11 | cur_path = os.getcwd() 12 | lib = ctypes.CDLL(os.path.join(cur_path, "matmul.so")) 13 | 14 | m, n, p = 4, 4, 2 15 | np.random.seed(0) 16 | a = np.random.randint(1, 3, [m, n]).astype(np.float32) 17 | b = np.random.randint(1, 3, [n, p]).astype(np.float32) 18 | cgpu = np.zeros([m, p], dtype=np.float32) 19 | cgputile = np.zeros([m, p], dtype=np.float32) 20 | 21 | print(f"Input a: {a}\nInput b: {b}") 22 | 23 | print(f"Numpy matmul: {a @ b}, {type(a@b)}") 24 | 25 | # Define argtypes and 
return type of the C function 26 | lib.Matmul.argtypes = [ 27 | ctypes.POINTER(ctypes.c_float), 28 | ctypes.POINTER(ctypes.c_float), 29 | ctypes.POINTER(ctypes.c_float), 30 | ctypes.c_int, ctypes.c_int, ctypes.c_int, 31 | ] 32 | 33 | lib.Matmul.restype = None 34 | 35 | # Load the arrays to CUDA device 36 | a_gpu = gpuarray.to_gpu(a) 37 | b_gpu = gpuarray.to_gpu(b) 38 | c_gpu = gpuarray.to_gpu(cgpu) 39 | 40 | # Call the C wrapper function with CUDA kernel 41 | lib.Matmul( 42 | ctypes.cast(a_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 43 | ctypes.cast(b_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 44 | ctypes.cast(c_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 45 | ctypes.c_int(m), 46 | ctypes.c_int(n), 47 | ctypes.c_int(p) 48 | ) 49 | 50 | print(f"GPU matmul: {c_gpu}, {type(c_gpu)}") 51 | # Load the gpuarray back to an array on the host 52 | cgpu = c_gpu.get() 53 | print(f"After offload: {cgpu}, {type(cgpu)}") 54 | 55 | # Compare result 56 | ccpu = a @ b 57 | print(f"Compare result: {np.linalg.norm(ccpu - cgpu)}") -------------------------------------------------------------------------------- /simple_cuda_demo/test_vector_add.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | 10 | # Load the shared library 11 | cur_path = os.getcwd() 12 | lib = ctypes.CDLL(os.path.join(cur_path, "vector_add.so")) 13 | 14 | size = 10 15 | a = np.random.randint(1, 10, size, dtype=np.int32) 16 | b = np.random.randint(1, 10, size, dtype=np.int32) 17 | ccpu = np.zeros(size, dtype=np.int32) 18 | cgpu = np.zeros(size, dtype=np.int32) 19 | 20 | print(f"Input a: {a}\nInput b: {b}") 21 | 22 | print(f"Numpy add: {a + b}, {type(a+b)}") 23 | 24 | # Define argtypes and return types of the C functions 25 | lib.VecAddCPU.argtypes = [ 26 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 27 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 28 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 29 | ctypes.c_int 30 | ] 31 | 32 | lib.VecAddCPU.restype = None 33 | 34 | lib.VecAddCUDA.argtypes = [ 35 | ctypes.POINTER(ctypes.c_int), 36 | ctypes.POINTER(ctypes.c_int), 37 | ctypes.POINTER(ctypes.c_int), 38 | ctypes.c_int, 39 | ] 40 | 41 | lib.VecAddCUDA.restype = None 42 | 43 | # Call the C function 44 | lib.VecAddCPU(a, b, ccpu, size) 45 | 46 | print(f"CPU add: {ccpu}, {type(ccpu)}") 47 | 48 | # Load the arrays to CUDA device 49 | a_gpu = gpuarray.to_gpu(a) 50 | b_gpu = gpuarray.to_gpu(b) 51 | c_gpu = gpuarray.to_gpu(cgpu) 52 | 53 | # Call the C wrapper function with CUDA kernel 54 | lib.VecAddCUDA( 55 | ctypes.cast(a_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 56 | ctypes.cast(b_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 57 | ctypes.cast(c_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 58 | ctypes.c_int(size) 59 | ) 60 | 61 | print(f"GPU add: {c_gpu}, {type(c_gpu)}") 62 | # Load the gpuarray back to an array on the host 63 | cgpu = c_gpu.get() 64 | print(f"After offload: {cgpu}, {type(cgpu)}") 65 | 66 | 67 | 68 | lib.VecAddCUDA2.argtypes = [ 69 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 70 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 71 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 72 | ctypes.c_int 73 | ] 74 | 75 | lib.VecAddCUDA2.restype = None 76 | cgpu2 = np.zeros(size, dtype=np.int32) 77 
| lib.VecAddCUDA2(a, b, cgpu2, size) 78 | print(f"GPU add2: {cgpu2}, {type(cgpu2)}") 79 | 80 | 81 | -------------------------------------------------------------------------------- /simple_cuda_demo/test_window_sum.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | from numpy.lib.stride_tricks import sliding_window_view 10 | 11 | # Load the shared library 12 | cur_path = os.getcwd() 13 | lib = ctypes.CDLL(os.path.join(cur_path, "window_sum.so")) 14 | 15 | in_array = np.array([i+1 for i in range(12)], dtype=np.float32) 16 | simple_array = np.zeros(8, dtype=np.float32) 17 | shared_array = np.zeros(8, dtype=np.float32) 18 | 19 | print(f"Input: {in_array}") 20 | win_temp = sliding_window_view(in_array, 5) 21 | np_res = np.sum(win_temp, axis=1) 22 | print(f"Numpy window sum: {np_res}") 23 | 24 | # Define argtypes and returntypes of the C function 25 | lib.WindowSumSimple.argtypes = [ 26 | ctypes.POINTER(ctypes.c_float), 27 | ctypes.POINTER(ctypes.c_float), 28 | ctypes.c_int, 29 | ] 30 | 31 | lib.WindowSumSimple.restype = None 32 | 33 | lib.WindowSumShared.argtypes = [ 34 | ctypes.POINTER(ctypes.c_float), 35 | ctypes.POINTER(ctypes.c_float), 36 | ctypes.c_int, 37 | ctypes.c_int, 38 | ] 39 | 40 | lib.WindowSumShared.restype = None 41 | 42 | # Load the arrays to CUDA device 43 | in_gpu = gpuarray.to_gpu(in_array) 44 | simple_out_gpu = gpuarray.to_gpu(simple_array) 45 | shared_out_gpu = gpuarray.to_gpu(shared_array) 46 | 47 | # Call the C wrapper function with CUDA kernel 48 | lib.WindowSumSimple( 49 | ctypes.cast(in_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 50 | ctypes.cast(simple_out_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 51 | ctypes.c_int(len(simple_array)), 52 | ) 53 | 54 | # Load the gpuarray back to array in the host device 55 | simple_array = simple_out_gpu.get() 56 | print(f"GPU simple window sum: {simple_array}") 57 | 58 | # Call the C wrapper function with CUDA kernel 59 | lib.WindowSumShared( 60 | ctypes.cast(in_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 61 | ctypes.cast(shared_out_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 62 | ctypes.c_int(len(in_gpu)), 63 | ctypes.c_int(len(shared_array)), 64 | ) 65 | 66 | # Load the gpuarray back to array in the host device 67 | shared_array = shared_out_gpu.get() 68 | print(f"GPU shared window sum: {shared_array}") -------------------------------------------------------------------------------- /tensor_demo/miniTorch/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "venv/bin/python", 3 | "python.testing.unittestEnabled": false, 4 | "python.testing.nosetestsEnabled": false, 5 | "python.testing.pytestEnabled": true, 6 | "python.testing.pytestArgs": [ 7 | "tests" 8 | ], 9 | "restructuredtext.confPath": "${workspaceFolder}/docs/source", 10 | "python.linting.enabled": true, 11 | "python.linting.pylintEnabled": false, 12 | "python.linting.banditEnabled": false, 13 | "python.linting.flake8Enabled": true, 14 | "python.linting.mypyEnabled": false, 15 | "python.linting.flake8Args": [ 16 | "--ignore", 17 | "N801, E203, E266, E501, W503, F812, E741, N803, N802, N806" 18 | ], 19 | } 20 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sasha Rush 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: minitorch 3 | Version: 0.4 4 | License-File: LICENSE 5 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.cfg 4 | setup.py 5 | minitorch.egg-info/PKG-INFO 6 | minitorch.egg-info/SOURCES.txt 7 | minitorch.egg-info/dependency_links.txt 8 | minitorch.egg-info/top_level.txt -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import datasets # noqa: F401,F403 2 | from .tensor import * # noqa: F401,F403 3 | from .tensor_data import * # noqa: F401,F403 4 | from .tensor_functions import * # noqa: F401,F403 5 | from .tensor_ops import * # noqa: F401,F403 6 | 7 | version = "0.4" 8 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/testing.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/testing.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/autodiff.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Iterable, List, Tuple 3 | 4 | from typing_extensions import Protocol 5 | 6 | # ## Task 1.1 7 | # Central Difference calculation 8 | 9 | 10 | def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 
1e-6) -> Any: 11 | r""" 12 | Computes an approximation to the derivative of `f` with respect to one arg. 13 | 14 | See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for more details. 15 | 16 | Args: 17 | f : arbitrary function from n-scalar args to one value 18 | *vals : n-float values $x_0 \ldots x_{n-1}$ 19 | arg : the number $i$ of the arg to compute the derivative 20 | epsilon : a small constant 21 | 22 | Returns: 23 | An approximation of $f'_i(x_0, \ldots, x_{n-1})$ 24 | """ 25 | # ASSIGN1.1 26 | vals1 = [v for v in vals] 27 | vals2 = [v for v in vals] 28 | vals1[arg] = vals1[arg] + epsilon 29 | vals2[arg] = vals2[arg] - epsilon 30 | delta = f(*vals1) - f(*vals2) 31 | return delta / (2 * epsilon) 32 | # END ASSIGN1 33 | 34 | 35 | variable_count = 1 36 | 37 | 38 | class Variable(Protocol): 39 | def accumulate_derivative(self, x: Any) -> None: 40 | pass 41 | 42 | @property 43 | def unique_id(self) -> int: 44 | pass 45 | 46 | def is_leaf(self) -> bool: 47 | pass 48 | 49 | def is_constant(self) -> bool: 50 | pass 51 | 52 | @property 53 | def parents(self) -> Iterable["Variable"]: 54 | pass 55 | 56 | def chain_rule(self, d_output: Any) -> Iterable[Tuple["Variable", Any]]: 57 | pass 58 | 59 | 60 | def topological_sort(variable: Variable) -> Iterable[Variable]: 61 | """ 62 | Computes the topological order of the computation graph. 63 | 64 | Args: 65 | variable: The right-most variable 66 | 67 | Returns: 68 | Non-constant Variables in topological order starting from the right. 69 | """ 70 | order: List[Variable] = [] 71 | return order 72 | 73 | 74 | def backpropagate(variable: Variable, deriv: Any) -> None: 75 | """ 76 | Runs backpropagation on the computation graph in order to 77 | compute derivatives for the leave nodes. 78 | 79 | Args: 80 | variable: The right-most variable 81 | deriv : Its derivative that we want to propagate backward to the leaves. 82 | 83 | No return. Should write to its results to the derivative values of each leaf through `accumulate_derivative`. 84 | """ 85 | 86 | 87 | @dataclass 88 | class Context: 89 | """ 90 | Context class is used by `Function` to store information during the forward pass. 91 | """ 92 | 93 | no_grad: bool = False 94 | saved_values: Tuple[Any, ...] = () 95 | 96 | def save_for_backward(self, *values: Any) -> None: 97 | "Store the given `values` if they need to be used during backpropagation." 
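        # Values stored here are read back during the backward pass through the
        # `saved_values` / `saved_tensors` properties below; nothing is saved when `no_grad` is set.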
98 | if self.no_grad: 99 | return 100 | self.saved_values = values 101 | 102 | @property 103 | def saved_tensors(self) -> Tuple[Any, ...]: 104 | return self.saved_values 105 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/datasets.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from dataclasses import dataclass 4 | from typing import List, Tuple 5 | 6 | 7 | def make_pts(N: int) -> List[Tuple[float, float]]: 8 | X = [] 9 | for i in range(N): 10 | x_1 = random.random() 11 | x_2 = random.random() 12 | X.append((x_1, x_2)) 13 | return X 14 | 15 | 16 | @dataclass 17 | class Graph: 18 | N: int 19 | X: List[Tuple[float, float]] 20 | y: List[int] 21 | 22 | 23 | def simple(N: int) -> Graph: 24 | X = make_pts(N) 25 | y = [] 26 | for x_1, x_2 in X: 27 | y1 = 1 if x_1 < 0.5 else 0 28 | y.append(y1) 29 | return Graph(N, X, y) 30 | 31 | 32 | def diag(N: int) -> Graph: 33 | X = make_pts(N) 34 | y = [] 35 | for x_1, x_2 in X: 36 | y1 = 1 if x_1 + x_2 < 0.5 else 0 37 | y.append(y1) 38 | return Graph(N, X, y) 39 | 40 | 41 | def split(N: int) -> Graph: 42 | X = make_pts(N) 43 | y = [] 44 | for x_1, x_2 in X: 45 | y1 = 1 if x_1 < 0.2 or x_1 > 0.8 else 0 46 | y.append(y1) 47 | return Graph(N, X, y) 48 | 49 | 50 | def xor(N: int) -> Graph: 51 | X = make_pts(N) 52 | y = [] 53 | for x_1, x_2 in X: 54 | y1 = 1 if ((x_1 < 0.5 and x_2 > 0.5) or (x_1 > 0.5 and x_2 < 0.5)) else 0 55 | y.append(y1) 56 | return Graph(N, X, y) 57 | 58 | 59 | def circle(N: int) -> Graph: 60 | X = make_pts(N) 61 | y = [] 62 | for x_1, x_2 in X: 63 | x1, x2 = (x_1 - 0.5, x_2 - 0.5) 64 | y1 = 1 if x1 * x1 + x2 * x2 > 0.1 else 0 65 | y.append(y1) 66 | return Graph(N, X, y) 67 | 68 | 69 | def spiral(N: int) -> Graph: 70 | def x(t: float) -> float: 71 | return t * math.cos(t) / 20.0 72 | 73 | def y(t: float) -> float: 74 | return t * math.sin(t) / 20.0 75 | 76 | X = [ 77 | (x(10.0 * (float(i) / (N // 2))) + 0.5, y(10.0 * (float(i) / (N // 2))) + 0.5) 78 | for i in range(5 + 0, 5 + N // 2) 79 | ] 80 | X = X + [ 81 | (y(-10.0 * (float(i) / (N // 2))) + 0.5, x(-10.0 * (float(i) / (N // 2))) + 0.5) 82 | for i in range(5 + 0, 5 + N // 2) 83 | ] 84 | y2 = [0] * (N // 2) + [1] * (N // 2) 85 | return Graph(N, X, y2) 86 | 87 | 88 | datasets = { 89 | "Simple": simple, 90 | "Diag": diag, 91 | "Split": split, 92 | "Xor": xor, 93 | "Circle": circle, 94 | "Spiral": spiral, 95 | } 96 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/module.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Dict, Optional, Sequence, Tuple 4 | 5 | 6 | class Module: 7 | """ 8 | Modules form a tree that store parameters and other 9 | submodules. They make up the basis of neural network stacks. 10 | 11 | Attributes: 12 | _modules : Storage of the child modules 13 | _parameters : Storage of the module's parameters 14 | training : Whether the module is in training mode or evaluation mode 15 | 16 | """ 17 | 18 | _modules: Dict[str, Module] 19 | _parameters: Dict[str, Parameter] 20 | training: bool 21 | 22 | def __init__(self) -> None: 23 | self._modules = {} 24 | self._parameters = {} 25 | self.training = True 26 | 27 | def modules(self) -> Sequence[Module]: 28 | "Return the direct child modules of this module." 
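        # Only the direct children are returned; `train()` and `eval()` below rely on this
        # to recurse through the whole module tree.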
29 | m: Dict[str, Module] = self.__dict__["_modules"] 30 | return list(m.values()) 31 | 32 | def train(self) -> None: 33 | "Set the mode of this module and all descendent modules to `train`." 34 | # ASSIGN0.4 35 | for m in self.modules(): 36 | m.train() 37 | self.training = True 38 | # END ASSIGN0.4 39 | 40 | def eval(self) -> None: 41 | "Set the mode of this module and all descendent modules to `eval`." 42 | for m in self.modules(): 43 | m.eval() 44 | self.training = False 45 | 46 | def named_parameters(self) -> Sequence[Tuple[str, Parameter]]: 47 | """ 48 | Collect all the parameters of this module and its descendents. 49 | 50 | 51 | Returns: 52 | The name and `Parameter` of each ancestor parameter. 53 | """ 54 | 55 | # Collect our parameters and give them a name. 56 | parameters = {} 57 | for k, v in self._parameters.items(): 58 | parameters[k] = v 59 | 60 | # Recurse down to children submodules 61 | for mod_name, m in self._modules.items(): 62 | for k, v in m.named_parameters(): 63 | parameters[f"{mod_name}.{k}"] = v 64 | return list(parameters.items()) 65 | 66 | def parameters(self) -> Sequence[Parameter]: 67 | "Enumerate over all the parameters of this module and its descendents." 68 | return [j for _, j in self.named_parameters()] 69 | 70 | def add_parameter(self, k: str, v: Any) -> Parameter: 71 | """ 72 | Manually add a parameter. Useful helper for scalar parameters. 73 | 74 | Args: 75 | k: Local name of the parameter. 76 | v: Value for the parameter. 77 | 78 | Returns: 79 | Newly created parameter. 80 | """ 81 | val = Parameter(v, k) 82 | self.__dict__["_parameters"][k] = val 83 | return val 84 | 85 | def __setattr__(self, key: str, val: Parameter) -> None: 86 | if isinstance(val, Parameter): 87 | self.__dict__["_parameters"][key] = val 88 | elif isinstance(val, Module): 89 | self.__dict__["_modules"][key] = val 90 | else: 91 | super().__setattr__(key, val) 92 | 93 | def __getattr__(self, key: str) -> Any: 94 | if key in self.__dict__["_parameters"]: 95 | return self.__dict__["_parameters"][key] 96 | 97 | if key in self.__dict__["_modules"]: 98 | return self.__dict__["_modules"][key] 99 | return None 100 | 101 | def __call__(self, *args: Any, **kwargs: Any) -> Any: 102 | return self.forward(*args, **kwargs) 103 | 104 | def __repr__(self) -> str: 105 | def _addindent(s_: str, numSpaces: int) -> str: 106 | s2 = s_.split("\n") 107 | if len(s2) == 1: 108 | return s_ 109 | first = s2.pop(0) 110 | s2 = [(numSpaces * " ") + line for line in s2] 111 | s = "\n".join(s2) 112 | s = first + "\n" + s 113 | return s 114 | 115 | child_lines = [] 116 | 117 | for key, module in self._modules.items(): 118 | mod_str = repr(module) 119 | mod_str = _addindent(mod_str, 2) 120 | child_lines.append("(" + key + "): " + mod_str) 121 | lines = child_lines 122 | 123 | main_str = self.__class__.__name__ + "(" 124 | if lines: 125 | # simple one-liner info, which most builtin Modules will use 126 | main_str += "\n " + "\n ".join(lines) + "\n" 127 | 128 | main_str += ")" 129 | return main_str 130 | 131 | 132 | class Parameter: 133 | """ 134 | A Parameter is a special container stored in a `Module`. 135 | 136 | It is designed to hold a `Variable`, but we allow it to hold 137 | any value for testing. 
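    Assigning a `Parameter` to an attribute of a `Module` registers it automatically
    through `Module.__setattr__`, so it is picked up by `named_parameters()`.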
138 | """ 139 | 140 | def __init__(self, x: Any, name: Optional[str] = None) -> None: 141 | self.value = x 142 | self.name = name 143 | if hasattr(x, "requires_grad_"): 144 | self.value.requires_grad_(True) 145 | if self.name: 146 | self.value.name = self.name 147 | 148 | def update(self, x: Any) -> None: 149 | "Update the parameter value." 150 | self.value = x 151 | if hasattr(x, "requires_grad_"): 152 | self.value.requires_grad_(True) 153 | if self.name: 154 | self.value.name = self.name 155 | 156 | def __repr__(self) -> str: 157 | return repr(self.value) 158 | 159 | def __str__(self) -> str: 160 | return str(self.value) 161 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/operators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of the core mathematical operators used throughout the code base. 3 | """ 4 | 5 | import math 6 | from typing import Callable, Iterable 7 | 8 | # ## Task 0.1 9 | # 10 | # Implementation of a prelude of elementary functions. 11 | 12 | 13 | def mul(x: float, y: float) -> float: 14 | "$f(x, y) = x * y$" 15 | # ASSIGN0.1 16 | return x * y 17 | # END ASSIGN0.1 18 | 19 | 20 | def id(x: float) -> float: 21 | "$f(x) = x$" 22 | # ASSIGN0.1 23 | return x 24 | # END ASSIGN0.1 25 | 26 | 27 | def add(x: float, y: float) -> float: 28 | "$f(x, y) = x + y$" 29 | # ASSIGN0.1 30 | return x + y 31 | # END ASSIGN0.1 32 | 33 | 34 | def neg(x: float) -> float: 35 | "$f(x) = -x$" 36 | # ASSIGN0.1 37 | return -x 38 | # END ASSIGN0.1 39 | 40 | 41 | def lt(x: float, y: float) -> float: 42 | "$f(x) =$ 1.0 if x is less than y else 0.0" 43 | # ASSIGN0.1 44 | return 1.0 if x < y else 0.0 45 | # END ASSIGN0.1 46 | 47 | 48 | def eq(x: float, y: float) -> float: 49 | "$f(x) =$ 1.0 if x is equal to y else 0.0" 50 | # ASSIGN0.1 51 | return 1.0 if x == y else 0.0 52 | # END ASSIGN0.1 53 | 54 | 55 | def max(x: float, y: float) -> float: 56 | "$f(x) =$ x if x is greater than y else y" 57 | # ASSIGN0.1 58 | return x if x > y else y 59 | # END ASSIGN0.1 60 | 61 | 62 | def is_close(x: float, y: float) -> float: 63 | "$f(x) = |x - y| < 1e-2$" 64 | # ASSIGN0.1 65 | return (x - y < 1e-2) and (y - x < 1e-2) 66 | # END ASSIGN0.1 67 | 68 | 69 | def sigmoid(x: float) -> float: 70 | r""" 71 | $f(x) = \frac{1.0}{(1.0 + e^{-x})}$ 72 | 73 | (See https://en.wikipedia.org/wiki/Sigmoid_function ) 74 | 75 | Calculate as 76 | 77 | $f(x) = \frac{1.0}{(1.0 + e^{-x})}$ if x >=0 else $\frac{e^x}{(1.0 + e^{x})}$ 78 | 79 | for stability. 80 | """ 81 | # ASSIGN0.1 82 | if x >= 0: 83 | return 1.0 / (1.0 + math.exp(-x)) 84 | else: 85 | return math.exp(x) / (1.0 + math.exp(x)) 86 | # END ASSIGN0.1 87 | 88 | 89 | def relu(x: float) -> float: 90 | """ 91 | $f(x) =$ x if x is greater than 0, else 0 92 | 93 | (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .) 
94 | """ 95 | # ASSIGN0.1 96 | return x if x > 0 else 0.0 97 | # END ASSIGN0.1 98 | 99 | 100 | EPS = 1e-6 101 | 102 | 103 | def log(x: float) -> float: 104 | "$f(x) = log(x)$" 105 | return math.log(x + EPS) 106 | 107 | 108 | def exp(x: float) -> float: 109 | "$f(x) = e^{x}$" 110 | return math.exp(x) 111 | 112 | 113 | def log_back(x: float, d: float) -> float: 114 | r"If $f = log$ as above, compute $d \times f'(x)$" 115 | # ASSIGN0.1 116 | return d / (x + EPS) 117 | # END ASSIGN0.1 118 | 119 | 120 | def inv(x: float) -> float: 121 | "$f(x) = 1/x$" 122 | # ASSIGN0.1 123 | return 1.0 / x 124 | # END ASSIGN0.1 125 | 126 | 127 | def inv_back(x: float, d: float) -> float: 128 | r"If $f(x) = 1/x$ compute $d \times f'(x)$" 129 | # ASSIGN0.1 130 | return -(1.0 / x**2) * d 131 | # END ASSIGN0.1 132 | 133 | 134 | def relu_back(x: float, d: float) -> float: 135 | r"If $f = relu$ compute $d \times f'(x)$" 136 | # ASSIGN0.1 137 | return d if x > 0 else 0.0 138 | # END ASSIGN0.1 139 | 140 | 141 | # ## Task 0.3 142 | 143 | # Small practice library of elementary higher-order functions. 144 | 145 | 146 | def map(fn: Callable[[float], float]) -> Callable[[Iterable[float]], Iterable[float]]: 147 | """ 148 | Higher-order map. 149 | 150 | See https://en.wikipedia.org/wiki/Map_(higher-order_function) 151 | 152 | Args: 153 | fn: Function from one value to one value. 154 | 155 | Returns: 156 | A function that takes a list, applies `fn` to each element, and returns a 157 | new list 158 | """ 159 | # ASSIGN0.3 160 | def _map(ls: Iterable[float]) -> Iterable[float]: 161 | ret = [] 162 | for x in ls: 163 | ret.append(fn(x)) 164 | return ret 165 | 166 | return _map 167 | # END ASSIGN0.3 168 | 169 | 170 | def negList(ls: Iterable[float]) -> Iterable[float]: 171 | "Use `map` and `neg` to negate each element in `ls`" 172 | # ASSIGN0.3 173 | return map(neg)(ls) 174 | # END ASSIGN0.3 175 | 176 | 177 | def zipWith( 178 | fn: Callable[[float, float], float] 179 | ) -> Callable[[Iterable[float], Iterable[float]], Iterable[float]]: 180 | """ 181 | Higher-order zipwith (or map2). 182 | 183 | See https://en.wikipedia.org/wiki/Map_(higher-order_function) 184 | 185 | Args: 186 | fn: combine two values 187 | 188 | Returns: 189 | Function that takes two equally sized lists `ls1` and `ls2`, produce a new list by 190 | applying fn(x, y) on each pair of elements. 191 | 192 | """ 193 | # ASSIGN0.3 194 | def _zipWith(ls1: Iterable[float], ls2: Iterable[float]) -> Iterable[float]: 195 | ret = [] 196 | for x, y in zip(ls1, ls2): 197 | ret.append(fn(x, y)) 198 | return ret 199 | 200 | return _zipWith 201 | # END ASSIGN0.3 202 | 203 | 204 | def addLists(ls1: Iterable[float], ls2: Iterable[float]) -> Iterable[float]: 205 | "Add the elements of `ls1` and `ls2` using `zipWith` and `add`" 206 | # ASSIGN0.3 207 | return zipWith(add)(ls1, ls2) 208 | # END ASSIGN0.3 209 | 210 | 211 | def reduce( 212 | fn: Callable[[float, float], float], start: float 213 | ) -> Callable[[Iterable[float]], float]: 214 | r""" 215 | Higher-order reduce. 
216 | 217 | Args: 218 | fn: combine two values 219 | start: start value $x_0$ 220 | 221 | Returns: 222 | Function that takes a list `ls` of elements 223 | $x_1 \ldots x_n$ and computes the reduction :math:`fn(x_3, fn(x_2, 224 | fn(x_1, x_0)))` 225 | """ 226 | # ASSIGN0.3 227 | def _reduce(ls: Iterable[float]) -> float: 228 | val = start 229 | for l in ls: 230 | val = fn(val, l) 231 | return val 232 | 233 | return _reduce 234 | # END ASSIGN0.3 235 | 236 | 237 | def sum(ls: Iterable[float]) -> float: 238 | "Sum up a list using `reduce` and `add`." 239 | # ASSIGN0.3 240 | return reduce(add, 0.0)(ls) 241 | # END ASSIGN0.3 242 | 243 | 244 | def prod(ls: Iterable[float]) -> float: 245 | "Product of a list using `reduce` and `mul`." 246 | # ASSIGN0.3 247 | return reduce(mul, 1.0)(ls) 248 | # END ASSIGN0.3 249 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/optim.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from .module import Parameter 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, parameters: Sequence[Parameter]): 8 | self.parameters = parameters 9 | 10 | 11 | class SGD(Optimizer): 12 | def __init__(self, parameters: Sequence[Parameter], lr: float = 1.0): 13 | super().__init__(parameters) 14 | self.lr = lr 15 | 16 | def zero_grad(self) -> None: 17 | for p in self.parameters: 18 | if p.value is None: 19 | continue 20 | if hasattr(p.value, "derivative"): 21 | if p.value.derivative is not None: 22 | p.value.derivative = None 23 | if hasattr(p.value, "grad"): 24 | if p.value.grad is not None: 25 | p.value.grad = None 26 | 27 | def step(self) -> None: 28 | for p in self.parameters: 29 | if p.value is None: 30 | continue 31 | if hasattr(p.value, "grad"): 32 | if p.value.grad is not None: 33 | p.update(p.value - self.lr * p.value.grad) 34 | 35 | def _print(self) -> None: 36 | for param in self.parameters: 37 | if param.value is None: 38 | continue 39 | print(param.value.shape) 40 | print(param.value.grad) 41 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the core Tensor object for autodifferentiation. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from dataclasses import dataclass 8 | from typing import TYPE_CHECKING 9 | 10 | import numpy as np 11 | 12 | from . import operators 13 | from .autodiff import Context, Variable, backpropagate 14 | from .tensor_data import TensorData 15 | from .tensor_functions import ( 16 | EQ, 17 | LT, 18 | Add, 19 | All, 20 | Copy, 21 | Exp, 22 | Inv, 23 | IsClose, 24 | Log, 25 | MatMul, 26 | Mul, 27 | Neg, 28 | Permute, 29 | ReLU, 30 | Sigmoid, 31 | Sum, 32 | View, 33 | tensor, 34 | ) 35 | 36 | if TYPE_CHECKING: 37 | from typing import Any, Iterable, List, Optional, Sequence, Tuple, Type, Union 38 | 39 | import numpy.typing as npt 40 | 41 | from .tensor_data import Shape, Storage, Strides, UserIndex, UserShape, UserStrides 42 | from .tensor_functions import Function 43 | from .tensor_ops import TensorBackend 44 | 45 | TensorLike = Union[float, int, "Tensor"] 46 | 47 | 48 | @dataclass 49 | class History: 50 | """ 51 | `History` stores the history of `Function` operations that was 52 | used to construct the current Variable. 
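    `last_fn` is the `Function` that produced this tensor, `ctx` is the `Context` saved
    during its forward pass, and `inputs` are the argument tensors; a tensor created
    directly by the user has an empty `History`.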
53 | """ 54 | 55 | last_fn: Optional[Type[Function]] = None 56 | ctx: Optional[Context] = None 57 | inputs: Sequence[Tensor] = () 58 | 59 | 60 | _tensor_count = 0 61 | 62 | 63 | class Tensor: 64 | """ 65 | Tensor is a generalization of Scalar in that it is a Variable that 66 | handles multidimensional arrays. 67 | """ 68 | 69 | backend: TensorBackend 70 | history: Optional[History] 71 | grad: Optional[Tensor] 72 | _tensor: TensorData 73 | unique_id: int 74 | name: str 75 | 76 | def __init__( 77 | self, 78 | v: TensorData, 79 | back: Optional[History] = None, 80 | name: Optional[str] = None, 81 | backend: Optional[TensorBackend] = None, 82 | ): 83 | global _tensor_count 84 | _tensor_count += 1 85 | self.unique_id = _tensor_count 86 | assert isinstance(v, TensorData) 87 | assert backend is not None 88 | self._tensor = v 89 | self.history = back 90 | self.backend = backend 91 | self.grad = None 92 | if name is not None: 93 | self.name = name 94 | else: 95 | self.name = str(self.unique_id) 96 | 97 | self.f = backend 98 | 99 | def requires_grad_(self, x: bool) -> None: 100 | self.history = History() 101 | 102 | def requires_grad(self) -> bool: 103 | return self.history is not None 104 | 105 | def to_numpy(self) -> npt.NDArray[np.float64]: 106 | """ 107 | Returns: 108 | Converted to numpy array 109 | """ 110 | return self.contiguous()._tensor._storage.reshape(self.shape) 111 | 112 | # Properties 113 | @property 114 | def shape(self) -> UserShape: 115 | """ 116 | Returns: 117 | shape of the tensor 118 | """ 119 | return self._tensor.shape 120 | 121 | @property 122 | def size(self) -> int: 123 | """ 124 | Returns: 125 | int : size of the tensor 126 | """ 127 | return self._tensor.size 128 | 129 | @property 130 | def dims(self) -> int: 131 | """ 132 | Returns: 133 | int : dimensionality of the tensor 134 | """ 135 | return self._tensor.dims 136 | 137 | def _ensure_tensor(self, b: TensorLike) -> Tensor: 138 | "Turns a python number into a tensor with the same backend." 
139 | if isinstance(b, (int, float)): 140 | c = Tensor.make([b], (1,), backend=self.backend) 141 | else: 142 | b._type_(self.backend) 143 | c = b 144 | return c 145 | 146 | # Functions 147 | def __add__(self, b: TensorLike) -> Tensor: 148 | return Add.apply(self, self._ensure_tensor(b)) 149 | 150 | def __sub__(self, b: TensorLike) -> Tensor: 151 | return Add.apply(self, -self._ensure_tensor(b)) 152 | 153 | def __mul__(self, b: TensorLike) -> Tensor: 154 | return Mul.apply(self, self._ensure_tensor(b)) 155 | 156 | def __truediv__(self, b: TensorLike) -> Tensor: 157 | return Mul.apply(self, Inv.apply(self._ensure_tensor(b))) 158 | 159 | def __rtruediv__(self, b: TensorLike) -> Tensor: 160 | return Mul.apply(self._ensure_tensor(b), Inv.apply(self)) 161 | 162 | def __matmul__(self, b: Tensor) -> Tensor: 163 | "Not used until Module 3" 164 | return MatMul.apply(self, b) 165 | 166 | def __lt__(self, b: TensorLike) -> Tensor: 167 | return LT.apply(self, self._ensure_tensor(b)) 168 | 169 | def __eq__(self, b: TensorLike) -> Tensor: # type: ignore[override] 170 | return EQ.apply(self, self._ensure_tensor(b)) 171 | 172 | def __gt__(self, b: TensorLike) -> Tensor: 173 | return LT.apply(self._ensure_tensor(b), self) 174 | 175 | def __neg__(self) -> Tensor: 176 | return Neg.apply(self) 177 | 178 | def __radd__(self, b: TensorLike) -> Tensor: 179 | return self + b 180 | 181 | def __rmul__(self, b: TensorLike) -> Tensor: 182 | return self * b 183 | 184 | def all(self, dim: Optional[int] = None) -> Tensor: 185 | if dim is None: 186 | return All.apply(self.view(self.size), self._ensure_tensor(0)) 187 | else: 188 | return All.apply(self, self._ensure_tensor(dim)) 189 | 190 | def is_close(self, y: Tensor) -> Tensor: 191 | return IsClose.apply(self, y) 192 | 193 | def sigmoid(self) -> Tensor: 194 | return Sigmoid.apply(self) 195 | 196 | def relu(self) -> Tensor: 197 | return ReLU.apply(self) 198 | 199 | def log(self) -> Tensor: 200 | return Log.apply(self) 201 | 202 | def exp(self) -> Tensor: 203 | return Exp.apply(self) 204 | 205 | def item(self) -> float: 206 | assert self.size == 1 207 | x: float = self._tensor._storage[0] 208 | return x 209 | 210 | def sum(self, dim: Optional[int] = None) -> Tensor: 211 | "Compute the sum over dimension `dim`" 212 | if dim is None: 213 | return Sum.apply(self.contiguous().view(self.size), self._ensure_tensor(0)) 214 | else: 215 | return Sum.apply(self, self._ensure_tensor(dim)) 216 | 217 | def mean(self, dim: Optional[int] = None) -> Tensor: 218 | "Compute the mean over dimension `dim`" 219 | if dim is not None: 220 | return self.sum(dim) / self.shape[dim] 221 | else: 222 | return self.sum() / self.size 223 | 224 | def var(self, dim: Optional[int] = None) -> Tensor: 225 | "Compute the variance over dimension `dim`" 226 | if dim is not None: 227 | shape = self.shape 228 | 229 | mean = self.sum(dim) / self.shape[dim] 230 | mean = mean.contiguous().view(shape) 231 | 232 | diff = self.__sub__(mean) ** 2 233 | diff = diff.sum(dim) / self.shape[dim] 234 | 235 | return diff 236 | else: 237 | shape = self.shape 238 | mean = self.sum() / self.size 239 | mean = mean.contiguous().view(shape) 240 | 241 | diff = self.__sub__(mean) ** 2 242 | diff = diff.sum() / self.size 243 | 244 | return diff 245 | 246 | def permute(self, *order: int) -> Tensor: 247 | "Permute tensor dimensions to *order" 248 | return Permute.apply(self, tensor(list(order))) 249 | 250 | def view(self, *shape: int) -> Tensor: 251 | "Change the shape of the tensor to a new shape with the same size" 252 | return 
View.apply(self, tensor(list(shape))) 253 | 254 | def contiguous(self) -> Tensor: 255 | "Return a contiguous tensor with the same data" 256 | return Copy.apply(self) 257 | 258 | def __repr__(self) -> str: 259 | return self._tensor.to_string() 260 | 261 | def __getitem__(self, key: Union[int, UserIndex]) -> float: 262 | key2 = (key,) if isinstance(key, int) else key 263 | return self._tensor.get(key2) 264 | 265 | def __setitem__(self, key: Union[int, UserIndex], val: float) -> None: 266 | key2 = (key,) if isinstance(key, int) else key 267 | self._tensor.set(key2, val) 268 | 269 | # Internal methods used for autodiff. 270 | def _type_(self, backend: TensorBackend) -> None: 271 | self.backend = backend 272 | if backend.cuda: # pragma: no cover 273 | self._tensor.to_cuda_() 274 | 275 | def _new(self, tensor_data: TensorData) -> Tensor: 276 | return Tensor(tensor_data, backend=self.backend) 277 | 278 | @staticmethod 279 | def make( 280 | storage: Union[Storage, List[float]], 281 | shape: UserShape, 282 | strides: Optional[UserStrides] = None, 283 | backend: Optional[TensorBackend] = None, 284 | ) -> Tensor: 285 | "Create a new tensor from data" 286 | return Tensor(TensorData(storage, shape, strides), backend=backend) 287 | 288 | def expand(self, other: Tensor) -> Tensor: 289 | """ 290 | Method used to allow for backprop over broadcasting. 291 | This method is called when the output of `backward` 292 | is a different size than the input of `forward`. 293 | 294 | 295 | Parameters: 296 | other : backward tensor (must broadcast with self) 297 | 298 | Returns: 299 | Expanded version of `other` with the right derivatives 300 | 301 | """ 302 | 303 | # Case 1: Both the same shape. 304 | if self.shape == other.shape: 305 | return other 306 | 307 | # Case 2: Backward is a smaller than self. Broadcast up. 308 | true_shape = TensorData.shape_broadcast(self.shape, other.shape) 309 | buf = self.zeros(true_shape) 310 | self.backend.id_map(other, buf) 311 | if self.shape == true_shape: 312 | return buf 313 | 314 | # Case 3: Still different, reduce extra dims. 315 | out = buf 316 | orig_shape = [1] * (len(out.shape) - len(self.shape)) + list(self.shape) 317 | for dim, shape in enumerate(out.shape): 318 | if orig_shape[dim] == 1 and shape != 1: 319 | out = self.backend.add_reduce(out, dim) 320 | assert out.size == self.size, f"{out.shape} {self.shape}" 321 | # START CODE CHANGE (2021) 322 | return Tensor.make(out._tensor._storage, self.shape, backend=self.backend) 323 | # END CODE CHANGE (2021) 324 | 325 | def zeros(self, shape: Optional[UserShape] = None) -> Tensor: 326 | def zero(shape: UserShape) -> Tensor: 327 | return Tensor.make( 328 | [0.0] * int(operators.prod(shape)), shape, backend=self.backend 329 | ) 330 | 331 | if shape is None: 332 | out = zero(self.shape) 333 | else: 334 | out = zero(shape) 335 | out._type_(self.backend) 336 | return out 337 | 338 | def tuple(self) -> Tuple[Storage, Shape, Strides]: 339 | return self._tensor.tuple() 340 | 341 | def detach(self) -> Tensor: 342 | return Tensor(self._tensor, backend=self.backend) 343 | 344 | # Variable elements for backprop 345 | 346 | def accumulate_derivative(self, x: Any) -> None: 347 | """ 348 | Add `val` to the the derivative accumulated on this variable. 349 | Should only be called during autodifferentiation on leaf variables. 350 | 351 | Args: 352 | x : value to be accumulated 353 | """ 354 | assert self.is_leaf(), "Only leaf variables can have derivatives." 
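        # Create a zero-filled gradient tensor on first use, then accumulate into it, so a
        # leaf that feeds several downstream nodes sums the contributions from all of them.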
355 | if self.grad is None: 356 | self.grad = Tensor.make( 357 | [0] * int(operators.prod(self.shape)), self.shape, backend=self.backend 358 | ) 359 | self.grad += x 360 | 361 | def is_leaf(self) -> bool: 362 | "True if this variable created by the user (no `last_fn`)" 363 | return self.history is not None and self.history.last_fn is None 364 | 365 | def is_constant(self) -> bool: 366 | return self.history is None 367 | 368 | @property 369 | def parents(self) -> Iterable[Variable]: 370 | assert self.history is not None 371 | return self.history.inputs 372 | 373 | def chain_rule(self, d_output: Any) -> Iterable[Tuple[Variable, Any]]: 374 | h = self.history 375 | assert h is not None 376 | assert h.last_fn is not None 377 | assert h.ctx is not None 378 | 379 | x = h.last_fn._backward(h.ctx, d_output) 380 | assert len(x) == len(h.inputs), f"Bug in function {h.last_fn}" 381 | return [ 382 | (inp, inp.expand(self._ensure_tensor(d_in))) 383 | for inp, d_in in zip(h.inputs, x) 384 | ] 385 | 386 | def backward(self, grad_output: Optional[Tensor] = None) -> None: 387 | if grad_output is None: 388 | assert self.shape == (1,), "Must provide grad_output if non-scalar" 389 | grad_output = Tensor.make([1.0], (1,), backend=self.backend) 390 | backpropagate(self, grad_output) 391 | 392 | def zero_grad_(self) -> None: # pragma: no cover 393 | """ 394 | Reset the derivative on this variable. 395 | """ 396 | self.grad = None 397 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | from typing import Iterable, Optional, Sequence, Tuple, Union 5 | 6 | import numba 7 | import numpy as np 8 | import numpy.typing as npt 9 | from numpy import array, float64 10 | from typing_extensions import TypeAlias 11 | 12 | from .operators import prod 13 | 14 | MAX_DIMS = 32 15 | 16 | 17 | class IndexingError(RuntimeError): 18 | "Exception raised for indexing errors." 19 | pass 20 | 21 | 22 | Storage: TypeAlias = npt.NDArray[np.float64] 23 | OutIndex: TypeAlias = npt.NDArray[np.int32] 24 | Index: TypeAlias = npt.NDArray[np.int32] 25 | Shape: TypeAlias = npt.NDArray[np.int32] 26 | Strides: TypeAlias = npt.NDArray[np.int32] 27 | 28 | UserIndex: TypeAlias = Sequence[int] 29 | UserShape: TypeAlias = Sequence[int] 30 | UserStrides: TypeAlias = Sequence[int] 31 | 32 | 33 | def index_to_position(index: Index, strides: Strides) -> int: 34 | """ 35 | Converts a multidimensional tensor `index` into a single-dimensional position in 36 | storage based on strides. 37 | 38 | Args: 39 | index : index tuple of ints 40 | strides : tensor strides 41 | 42 | Returns: 43 | Position in storage 44 | """ 45 | 46 | # ASSIGN2.1 47 | position = 0 48 | for ind, stride in zip(index, strides): 49 | position += ind * stride 50 | return position 51 | # END ASSIGN2.1 52 | 53 | 54 | def to_index(ordinal: int, shape: Shape, out_index: OutIndex) -> None: 55 | """ 56 | Convert an `ordinal` to an index in the `shape`. 57 | Should ensure that enumerating position 0 ... size of a 58 | tensor produces every index exactly once. It 59 | may not be the inverse of `index_to_position`. 60 | 61 | Args: 62 | ordinal: ordinal position to convert. 63 | shape : tensor shape. 64 | out_index : return index corresponding to position. 
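    For example, with shape (2, 3) the ordinal 5 unravels to out_index (1, 2),
    since 5 % 3 = 2 and 5 // 3 = 1; with contiguous strides (3, 1),
    index_to_position((1, 2), (3, 1)) = 1 * 3 + 2 * 1 = 5 recovers the ordinal.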
65 | 66 | """ 67 | # ASSIGN2.1 68 | cur_ord = ordinal + 0 69 | for i in range(len(shape) - 1, -1, -1): 70 | sh = shape[i] 71 | out_index[i] = int(cur_ord % sh) 72 | cur_ord = cur_ord // sh 73 | # END ASSIGN2.1 74 | 75 | 76 | def broadcast_index( 77 | big_index: Index, big_shape: Shape, shape: Shape, out_index: OutIndex 78 | ) -> None: 79 | """ 80 | Convert a `big_index` into `big_shape` to a smaller `out_index` 81 | into `shape` following broadcasting rules. In this case 82 | it may be larger or with more dimensions than the `shape` 83 | given. Additional dimensions may need to be mapped to 0 or 84 | removed. 85 | 86 | Args: 87 | big_index : multidimensional index of bigger tensor 88 | big_shape : tensor shape of bigger tensor 89 | shape : tensor shape of smaller tensor 90 | out_index : multidimensional index of smaller tensor 91 | 92 | Returns: 93 | None 94 | """ 95 | # ASSIGN2.2 96 | for i, s in enumerate(shape): 97 | if s > 1: 98 | out_index[i] = big_index[i + (len(big_shape) - len(shape))] 99 | else: 100 | out_index[i] = 0 101 | return None 102 | # END ASSIGN2.2 103 | 104 | 105 | def shape_broadcast(shape1: UserShape, shape2: UserShape) -> UserShape: 106 | """ 107 | Broadcast two shapes to create a new union shape. 108 | 109 | Args: 110 | shape1 : first shape 111 | shape2 : second shape 112 | 113 | Returns: 114 | broadcasted shape 115 | 116 | Raises: 117 | IndexingError : if cannot broadcast 118 | """ 119 | # ASSIGN2.2 120 | a, b = shape1, shape2 121 | m = max(len(a), len(b)) 122 | c_rev = [0] * m 123 | a_rev = list(reversed(a)) 124 | b_rev = list(reversed(b)) 125 | for i in range(m): 126 | if i >= len(a): 127 | c_rev[i] = b_rev[i] 128 | elif i >= len(b): 129 | c_rev[i] = a_rev[i] 130 | else: 131 | c_rev[i] = max(a_rev[i], b_rev[i]) 132 | if a_rev[i] != c_rev[i] and a_rev[i] != 1: 133 | raise IndexingError(f"Broadcast failure {a} {b}") 134 | if b_rev[i] != c_rev[i] and b_rev[i] != 1: 135 | raise IndexingError(f"Broadcast failure {a} {b}") 136 | return tuple(reversed(c_rev)) 137 | # END ASSIGN2.2 138 | 139 | 140 | def strides_from_shape(shape: UserShape) -> UserStrides: 141 | layout = [1] 142 | offset = 1 143 | for s in reversed(shape): 144 | layout.append(s * offset) 145 | offset = s * offset 146 | return tuple(reversed(layout[:-1])) 147 | 148 | 149 | class TensorData: 150 | _storage: Storage 151 | _strides: Strides 152 | _shape: Shape 153 | strides: UserStrides 154 | shape: UserShape 155 | dims: int 156 | 157 | def __init__( 158 | self, 159 | storage: Union[Sequence[float], Storage], 160 | shape: UserShape, 161 | strides: Optional[UserStrides] = None, 162 | ): 163 | if isinstance(storage, np.ndarray): 164 | self._storage = storage 165 | else: 166 | self._storage = array(storage, dtype=float64) 167 | 168 | if strides is None: 169 | strides = strides_from_shape(shape) 170 | 171 | assert isinstance(strides, tuple), "Strides must be tuple" 172 | assert isinstance(shape, tuple), "Shape must be tuple" 173 | if len(strides) != len(shape): 174 | raise IndexingError(f"Len of strides {strides} must match {shape}.") 175 | self._strides = array(strides) 176 | self._shape = array(shape) 177 | self.strides = strides 178 | self.dims = len(strides) 179 | self.size = int(prod(shape)) 180 | self.shape = shape 181 | assert len(self._storage) == self.size 182 | 183 | def to_cuda_(self) -> None: # pragma: no cover 184 | if not numba.cuda.is_cuda_array(self._storage): 185 | self._storage = numba.cuda.to_device(self._storage) 186 | 187 | def is_contiguous(self) -> bool: 188 | """ 189 | Check that the layout 
is contiguous, i.e. outer dimensions have bigger strides than inner dimensions. 190 | 191 | Returns: 192 | bool : True if contiguous 193 | """ 194 | last = 1e9 195 | for stride in self._strides: 196 | if stride > last: 197 | return False 198 | last = stride 199 | return True 200 | 201 | @staticmethod 202 | def shape_broadcast(shape_a: UserShape, shape_b: UserShape) -> UserShape: 203 | return shape_broadcast(shape_a, shape_b) 204 | 205 | def index(self, index: Union[int, UserIndex]) -> int: 206 | if isinstance(index, int): 207 | aindex: Index = array([index]) 208 | if isinstance(index, tuple): 209 | aindex = array(index) 210 | 211 | # Pretend 0-dim shape is 1-dim shape of singleton 212 | shape = self.shape 213 | if len(shape) == 0 and len(aindex) != 0: 214 | shape = (1,) 215 | 216 | # Check for errors 217 | if aindex.shape[0] != len(self.shape): 218 | raise IndexingError(f"Index {aindex} must be size of {self.shape}.") 219 | for i, ind in enumerate(aindex): 220 | if ind >= self.shape[i]: 221 | raise IndexingError(f"Index {aindex} out of range {self.shape}.") 222 | if ind < 0: 223 | raise IndexingError(f"Negative indexing for {aindex} not supported.") 224 | 225 | # Call fast indexing. 226 | return index_to_position(array(index), self._strides) 227 | 228 | def indices(self) -> Iterable[UserIndex]: 229 | lshape: Shape = array(self.shape) 230 | out_index: Index = array(self.shape) 231 | for i in range(self.size): 232 | to_index(i, lshape, out_index) 233 | yield tuple(out_index) 234 | 235 | def sample(self) -> UserIndex: 236 | return tuple((random.randint(0, s - 1) for s in self.shape)) 237 | 238 | def get(self, key: UserIndex) -> float: 239 | x: float = self._storage[self.index(key)] 240 | return x 241 | 242 | def set(self, key: UserIndex, val: float) -> None: 243 | self._storage[self.index(key)] = val 244 | 245 | def tuple(self) -> Tuple[Storage, Shape, Strides]: 246 | return (self._storage, self._shape, self._strides) 247 | 248 | def permute(self, *order: int) -> TensorData: 249 | """ 250 | Permute the dimensions of the tensor. 251 | 252 | Args: 253 | *order: a permutation of the dimensions 254 | 255 | Returns: 256 | New `TensorData` with the same storage and a new dimension order. 257 | """ 258 | assert list(sorted(order)) == list( 259 | range(len(self.shape)) 260 | ), f"Must give a position to each dimension. Shape: {self.shape} Order: {order}" 261 | 262 | # ASSIGN2.1 263 | return TensorData( 264 | self._storage, 265 | tuple([self.shape[o] for o in order]), 266 | tuple([self._strides[o] for o in order]), 267 | ) 268 | # END ASSIGN2.1 269 | 270 | def to_string(self) -> str: 271 | s = "" 272 | for index in self.indices(): 273 | l = "" 274 | for i in range(len(index) - 1, -1, -1): 275 | if index[i] == 0: 276 | l = "\n%s[" % ("\t" * i) + l 277 | else: 278 | break 279 | s += l 280 | v = self.get(index) 281 | s += f"{v:f}" 282 | l = "" 283 | for i in range(len(index) - 1, -1, -1): 284 | if index[i] == self.shape[i] - 1: 285 | l += "]" 286 | else: 287 | break 288 | if l: 289 | s += l 290 | else: 291 | s += " " 292 | return s 293 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor_ops.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Callable, Optional, Type 4 | 5 | import numpy as np 6 | from typing_extensions import Protocol 7 | 8 | from . 
import operators 9 | from .tensor_data import ( 10 | MAX_DIMS, 11 | broadcast_index, 12 | index_to_position, 13 | shape_broadcast, 14 | to_index, 15 | ) 16 | 17 | if TYPE_CHECKING: 18 | from .tensor import Tensor 19 | from .tensor_data import Index, Shape, Storage, Strides 20 | 21 | 22 | class MapProto(Protocol): 23 | def __call__(self, x: Tensor, out: Optional[Tensor] = ..., /) -> Tensor: 24 | ... 25 | 26 | 27 | class TensorOps: 28 | @staticmethod 29 | def map(fn: Callable[[float], float]) -> MapProto: 30 | pass 31 | 32 | @staticmethod 33 | def cmap(fn: Callable[[float], float]) -> Callable[[Tensor, Tensor], Tensor]: 34 | pass 35 | 36 | @staticmethod 37 | def zip(fn: Callable[[float, float], float]) -> Callable[[Tensor, Tensor], Tensor]: 38 | pass 39 | 40 | @staticmethod 41 | def reduce( 42 | fn: Callable[[float, float], float], start: float = 0.0 43 | ) -> Callable[[Tensor, int], Tensor]: 44 | pass 45 | 46 | @staticmethod 47 | def matrix_multiply(a: Tensor, b: Tensor) -> Tensor: 48 | raise NotImplementedError("Not implemented in this assignment") 49 | 50 | cuda = False 51 | 52 | 53 | class TensorBackend: 54 | def __init__(self, ops: Type[TensorOps]): 55 | """ 56 | Dynamically construct a tensor backend based on a `tensor_ops` object 57 | that implements map, zip, and reduce higher-order functions. 58 | 59 | Args: 60 | ops : tensor operations object see `tensor_ops.py` 61 | 62 | 63 | Returns : 64 | A collection of tensor functions 65 | 66 | """ 67 | 68 | # Maps 69 | self.neg_map = ops.map(operators.neg) 70 | self.sigmoid_map = ops.map(operators.sigmoid) 71 | self.relu_map = ops.map(operators.relu) 72 | self.log_map = ops.map(operators.log) 73 | self.exp_map = ops.map(operators.exp) 74 | self.id_map = ops.map(operators.id) 75 | self.id_cmap = ops.cmap(operators.id) 76 | self.inv_map = ops.map(operators.inv) 77 | 78 | # Zips 79 | self.add_zip = ops.zip(operators.add) 80 | self.mul_zip = ops.zip(operators.mul) 81 | self.lt_zip = ops.zip(operators.lt) 82 | self.eq_zip = ops.zip(operators.eq) 83 | self.is_close_zip = ops.zip(operators.is_close) 84 | self.relu_back_zip = ops.zip(operators.relu_back) 85 | self.log_back_zip = ops.zip(operators.log_back) 86 | self.inv_back_zip = ops.zip(operators.inv_back) 87 | 88 | # Reduce 89 | self.add_reduce = ops.reduce(operators.add, 0.0) 90 | self.mul_reduce = ops.reduce(operators.mul, 1.0) 91 | self.matrix_multiply = ops.matrix_multiply 92 | self.cuda = ops.cuda 93 | 94 | 95 | class SimpleOps(TensorOps): 96 | @staticmethod 97 | def map(fn: Callable[[float], float]) -> MapProto: 98 | """ 99 | Higher-order tensor map function :: 100 | 101 | fn_map = map(fn) 102 | fn_map(a, out) 103 | out 104 | 105 | Simple version:: 106 | 107 | for i: 108 | for j: 109 | out[i, j] = fn(a[i, j]) 110 | 111 | Broadcasted version (`a` might be smaller than `out`) :: 112 | 113 | for i: 114 | for j: 115 | out[i, j] = fn(a[i, 0]) 116 | 117 | Args: 118 | fn: function from float-to-float to apply. 
119 | a (:class:`TensorData`): tensor to map over 120 | out (:class:`TensorData`): optional, tensor data to fill in, 121 | should broadcast with `a` 122 | 123 | Returns: 124 | new tensor data 125 | """ 126 | 127 | f = tensor_map(fn) 128 | 129 | def ret(a: Tensor, out: Optional[Tensor] = None) -> Tensor: 130 | if out is None: 131 | out = a.zeros(a.shape) 132 | f(*out.tuple(), *a.tuple()) 133 | return out 134 | 135 | return ret 136 | 137 | @staticmethod 138 | def zip( 139 | fn: Callable[[float, float], float] 140 | ) -> Callable[["Tensor", "Tensor"], "Tensor"]: 141 | """ 142 | Higher-order tensor zip function :: 143 | 144 | fn_zip = zip(fn) 145 | out = fn_zip(a, b) 146 | 147 | Simple version :: 148 | 149 | for i: 150 | for j: 151 | out[i, j] = fn(a[i, j], b[i, j]) 152 | 153 | Broadcasted version (`a` and `b` might be smaller than `out`) :: 154 | 155 | for i: 156 | for j: 157 | out[i, j] = fn(a[i, 0], b[0, j]) 158 | 159 | 160 | Args: 161 | fn: function from two floats-to-float to apply 162 | a (:class:`TensorData`): tensor to zip over 163 | b (:class:`TensorData`): tensor to zip over 164 | 165 | Returns: 166 | :class:`TensorData` : new tensor data 167 | """ 168 | 169 | f = tensor_zip(fn) 170 | 171 | def ret(a: "Tensor", b: "Tensor") -> "Tensor": 172 | if a.shape != b.shape: 173 | c_shape = shape_broadcast(a.shape, b.shape) 174 | else: 175 | c_shape = a.shape 176 | out = a.zeros(c_shape) 177 | f(*out.tuple(), *a.tuple(), *b.tuple()) 178 | return out 179 | 180 | return ret 181 | 182 | @staticmethod 183 | def reduce( 184 | fn: Callable[[float, float], float], start: float = 0.0 185 | ) -> Callable[["Tensor", int], "Tensor"]: 186 | """ 187 | Higher-order tensor reduce function. :: 188 | 189 | fn_reduce = reduce(fn) 190 | out = fn_reduce(a, dim) 191 | 192 | Simple version :: 193 | 194 | for j: 195 | out[1, j] = start 196 | for i: 197 | out[1, j] = fn(out[1, j], a[i, j]) 198 | 199 | 200 | Args: 201 | fn: function from two floats-to-float to apply 202 | a (:class:`TensorData`): tensor to reduce over 203 | dim (int): int of dim to reduce 204 | 205 | Returns: 206 | :class:`TensorData` : new tensor 207 | """ 208 | f = tensor_reduce(fn) 209 | 210 | def ret(a: "Tensor", dim: int) -> "Tensor": 211 | out_shape = list(a.shape) 212 | out_shape[dim] = 1 213 | 214 | # Other values when not sum. 215 | out = a.zeros(tuple(out_shape)) 216 | out._tensor._storage[:] = start 217 | 218 | f(*out.tuple(), *a.tuple(), dim) 219 | return out 220 | 221 | return ret 222 | 223 | @staticmethod 224 | def matrix_multiply(a: "Tensor", b: "Tensor") -> "Tensor": 225 | raise NotImplementedError("Not implemented in this assignment") 226 | 227 | is_cuda = False 228 | 229 | 230 | # Implementations. 231 | 232 | 233 | def tensor_map( 234 | fn: Callable[[float], float] 235 | ) -> Callable[[Storage, Shape, Strides, Storage, Shape, Strides], None]: 236 | """ 237 | Low-level implementation of tensor map between 238 | tensors with *possibly different strides*. 239 | 240 | Simple version: 241 | 242 | * Fill in the `out` array by applying `fn` to each 243 | value of `in_storage` assuming `out_shape` and `in_shape` 244 | are the same size. 245 | 246 | Broadcasted version: 247 | 248 | * Fill in the `out` array by applying `fn` to each 249 | value of `in_storage` assuming `out_shape` and `in_shape` 250 | broadcast. (`in_shape` must be smaller than `out_shape`). 251 | 252 | Args: 253 | fn: function from float-to-float to apply 254 | 255 | Returns: 256 | Tensor map function. 
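    For instance, mapping fn over an input of shape (1, 3) into an `out` of shape
    (2, 3) applies fn to the same three input values for both output rows: the
    implementation below converts each out ordinal to an out_index, broadcasts it
    back to an in_index, and reads in_storage through in_strides.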
257 | """ 258 | 259 | def _map( 260 | out: Storage, 261 | out_shape: Shape, 262 | out_strides: Strides, 263 | in_storage: Storage, 264 | in_shape: Shape, 265 | in_strides: Strides, 266 | ) -> None: 267 | # ASSIGN2.3 268 | out_index: Index = np.zeros(MAX_DIMS, np.int16) 269 | in_index: Index = np.zeros(MAX_DIMS, np.int16) 270 | for i in range(len(out)): 271 | to_index(i, out_shape, out_index) 272 | broadcast_index(out_index, out_shape, in_shape, in_index) 273 | o = index_to_position(out_index, out_strides) 274 | j = index_to_position(in_index, in_strides) 275 | out[o] = fn(in_storage[j]) 276 | # END ASSIGN2.3 277 | 278 | return _map 279 | 280 | 281 | def tensor_zip( 282 | fn: Callable[[float, float], float] 283 | ) -> Callable[ 284 | [Storage, Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None 285 | ]: 286 | """ 287 | Low-level implementation of tensor zip between 288 | tensors with *possibly different strides*. 289 | 290 | Simple version: 291 | 292 | * Fill in the `out` array by applying `fn` to each 293 | value of `a_storage` and `b_storage` assuming `out_shape` 294 | and `a_shape` are the same size. 295 | 296 | Broadcasted version: 297 | 298 | * Fill in the `out` array by applying `fn` to each 299 | value of `a_storage` and `b_storage` assuming `a_shape` 300 | and `b_shape` broadcast to `out_shape`. 301 | 302 | Args: 303 | fn: function mapping two floats to float to apply 304 | 305 | Returns: 306 | Tensor zip function. 307 | """ 308 | 309 | def _zip( 310 | out: Storage, 311 | out_shape: Shape, 312 | out_strides: Strides, 313 | a_storage: Storage, 314 | a_shape: Shape, 315 | a_strides: Strides, 316 | b_storage: Storage, 317 | b_shape: Shape, 318 | b_strides: Strides, 319 | ) -> None: 320 | # ASSIGN2.3 321 | out_index: Index = np.zeros(MAX_DIMS, np.int32) 322 | a_index: Index = np.zeros(MAX_DIMS, np.int32) 323 | b_index: Index = np.zeros(MAX_DIMS, np.int32) 324 | for i in range(len(out)): 325 | to_index(i, out_shape, out_index) 326 | o = index_to_position(out_index, out_strides) 327 | broadcast_index(out_index, out_shape, a_shape, a_index) 328 | j = index_to_position(a_index, a_strides) 329 | broadcast_index(out_index, out_shape, b_shape, b_index) 330 | k = index_to_position(b_index, b_strides) 331 | out[o] = fn(a_storage[j], b_storage[k]) 332 | # END ASSIGN2.3 333 | 334 | return _zip 335 | 336 | 337 | def tensor_reduce( 338 | fn: Callable[[float, float], float] 339 | ) -> Callable[[Storage, Shape, Strides, Storage, Shape, Strides, int], None]: 340 | """ 341 | Low-level implementation of tensor reduce. 342 | 343 | * `out_shape` will be the same as `a_shape` 344 | except with `reduce_dim` turned to size `1` 345 | 346 | Args: 347 | fn: reduction function mapping two floats to float 348 | 349 | Returns: 350 | Tensor reduce function. 
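    For instance, reducing an `a_shape` of (2, 3) over reduce_dim 1 uses an
    out_shape of (2, 1); for each of the two output positions the inner loop in the
    implementation below visits the three positions along dim 1 of `a_storage` and
    folds them into out[o] with fn.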
351 | """ 352 | 353 | def _reduce( 354 | out: Storage, 355 | out_shape: Shape, 356 | out_strides: Strides, 357 | a_storage: Storage, 358 | a_shape: Shape, 359 | a_strides: Strides, 360 | reduce_dim: int, 361 | ) -> None: 362 | # ASSIGN2.3 363 | out_index: Index = np.zeros(MAX_DIMS, np.int32) 364 | reduce_size = a_shape[reduce_dim] 365 | for i in range(len(out)): 366 | to_index(i, out_shape, out_index) 367 | o = index_to_position(out_index, out_strides) 368 | for s in range(reduce_size): 369 | out_index[reduce_dim] = s 370 | j = index_to_position(out_index, a_strides) 371 | out[o] = fn(out[o], a_storage[j]) 372 | # END ASSIGN2.3 373 | 374 | return _reduce 375 | 1 376 | 377 | SimpleBackend = TensorBackend(SimpleOps) 378 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/requirements.extra.txt: -------------------------------------------------------------------------------- 1 | datasets==2.4.0 2 | embeddings==0.0.8 3 | networkx==2.4 4 | plotly==4.14.3 5 | pydot==1.4.1 6 | python-mnist 7 | streamlit==1.12.0 8 | streamlit-ace 9 | torch 10 | watchdog==1.0.2 11 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.4.3 2 | hypothesis == 6.54 3 | mypy == 0.971 4 | numba == 0.58.1 5 | numpy == 1.23.5 6 | pre-commit == 2.20.0 7 | pytest == 7.1.2 8 | pytest-env 9 | pytest-runner == 5.2 10 | typing_extensions -------------------------------------------------------------------------------- /tensor_demo/miniTorch/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name=minitorch 3 | version=0.4 4 | 5 | [files] 6 | packages = 7 | minitorch 8 | [darglint] 9 | ignore_regex=((^_(.*))|(.*map)|(.*zip)|(.*reduce)|(test.*)|(tensor_.*)) 10 | docstring_style=google 11 | strictness=long 12 | 13 | [flake8] 14 | ignore = N801, E203, E266, E501, W503, F812, E741, N803, N802, N806 15 | exclude = .git,__pycache__,docs/slides/*,old,build,dist 16 | 17 | [isort] 18 | profile=black 19 | src_paths=minitorch,test 20 | 21 | [mypy] 22 | strict = True 23 | ignore_missing_imports = True 24 | exclude=^(docs/)|(project/)|(assignments/) 25 | implicit_reexport = True 26 | 27 | [mypy-tests.*] 28 | disallow_untyped_decorators = False 29 | implicit_reexport = True 30 | 31 | [black] 32 | exclude=^(docs/)|(project/)|(assignments/) 33 | 34 | [tool:pytest] 35 | markers = 36 | task0_0 37 | task0_1 38 | task0_2 39 | task0_3 40 | task0_4 41 | task1_0 42 | task1_1 43 | task1_2 44 | task1_3 45 | task1_4 46 | task2_0 47 | task2_1 48 | task2_2 49 | task2_3 50 | task2_4 51 | task3_0 52 | task3_1 53 | task3_2 54 | task3_3 55 | task3_4 56 | task4_0 57 | task4_1 58 | task4_2 59 | task4_3 60 | task4_4 61 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(py_modules=[]) 4 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/style.sh: -------------------------------------------------------------------------------- 1 | flake8 --ignore "N801, E203, E266, E501, W503, F812, E741, N803, N802, N806" minitorch/ tests/ project/ 2 | -------------------------------------------------------------------------------- /tokenization/tokenization.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re \n", 10 | "from collections import defaultdict \n", 11 | "import string\n", 12 | "\n", 13 | "def get_init_vocab(data): \n", 14 | " \"\"\" \n", 15 | " Given a list of strings, returns a dictionary of words mapping to their frequency \n", 16 | " count in the data. \n", 17 | " Args: \n", 18 | " data: raw text with line breaks\n", 19 | " \n", 20 | " Returns: \n", 21 | " (vocab, tokens) tuple, \n", 22 | " vocab is a dictionary mapping space delimited characters to count (e.g. {'a b c ': 5})\n", 23 | " tokens is a set of basic characters. \n", 24 | " \"\"\"\n", 25 | " vocab = defaultdict(int)\n", 26 | " tokens = set()\n", 27 | " tokens.add('')\n", 28 | " for line in data: \n", 29 | " for word in line.split(): \n", 30 | " vocab[' '.join(list(word)) + ' '] += 1\n", 31 | " tokens.update(list(word))\n", 32 | " return vocab, tokens \n", 33 | " \n", 34 | "def count_cooccurance(vocab): \n", 35 | " \"\"\" \n", 36 | " Given a vocabulary (dictionary mapping words to frequency counts), returns a \n", 37 | " dictionary of tuples representing the frequency count of pairs of characters \n", 38 | " in the vocabulary. \n", 39 | " Args:\n", 40 | " vocab: a dictionary mapping space-delimited tokens to count (e.g. {'a b c ': 5})\n", 41 | " \n", 42 | " Returns: \n", 43 | " a dictionary mapping a tuple of tokens to count\n", 44 | " \"\"\"\n", 45 | " pairs = defaultdict(int) \n", 46 | " for word, freq in vocab.items(): \n", 47 | " chars = word.split() # split the word by any white space\n", 48 | " for i in range(len(chars)-1): \n", 49 | " pairs[chars[i], chars[i+1]] += freq \n", 50 | " return pairs\n", 51 | " \n", 52 | "def merge_vocab(token_pair, vocab_in): \n", 53 | " \"\"\" \n", 54 | " Given a pair of tokens and a vocabulary, returns a new vocabulary with the \n", 55 | " pair of tokens merged together wherever they appear. \n", 56 | " \n", 57 | " e.g. merge_vocab(('a', 'b'), {'a b c ': 5})\n", 58 | " returns {'ab c ': 5}\n", 59 | " \n", 60 | " Args: \n", 61 | " token_pair: a tuple of two tokens\n", 62 | " vocab_in: a dictionary mapping space-delimited tokens to count (e.g. {'a b c ': 5})\n", 63 | " \n", 64 | " Returns: \n", 65 | " a dictionary mapping space-delimited tokens to count (e.g. 
{'a b c ': 5})\n", 66 | " \"\"\"\n", 67 | " vocab_out = defaultdict(int) \n", 68 | " bigram = re.escape(' '.join(token_pair)) \n", 69 | " new_token = ''.join(token_pair)\n", 70 | " # search for every occurance of bigram (token pairs with a space), \n", 71 | " p = re.compile(r'(?'\n", 126 | " last_idx = 0\n", 127 | " idx = len(word)\n", 128 | " while idx > last_idx:\n", 129 | " whole_word = word[last_idx:idx]\n", 130 | " if whole_word in token_dict:\n", 131 | " encoded_ids.append(token_dict[whole_word])\n", 132 | " last_idx = idx\n", 133 | " idx = len(word)\n", 134 | " else:\n", 135 | " idx = idx - 1\n", 136 | " return encoded_ids\n", 137 | " \n", 138 | "\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "step 1: merging \"e\" and \"r\"\n", 151 | "step 2: merging \"s\" and \"\"\n", 152 | "step 3: merging \"e\" and \"\"\n", 153 | "step 4: merging \"e\" and \"n\"\n", 154 | "step 5: merging \"d\" and \"\"\n", 155 | "step 6: merging \"h\" and \"er\"\n", 156 | "step 7: merging \"en\" and \"t\"\n", 157 | "step 8: merging \"e\" and \"d\"\n", 158 | "step 9: merging \",\" and \"\"\n", 159 | "step 10: merging \"her\" and \"\"\n", 160 | "step 11: merging \"n\" and \"\"\n", 161 | "step 12: merging \"p\" and \"a\"\n", 162 | "step 13: merging \"pa\" and \"r\"\n", 163 | "step 14: merging \"par\" and \"ent\"\n", 164 | "step 15: merging \"en\" and \"\"\n", 165 | "step 16: merging \"h\" and \"e\"\n", 166 | "step 17: merging \"a\" and \"s\"\n", 167 | "step 18: merging \"s\" and \"e\"\n", 168 | "step 19: merging \"e\" and \"a\"\n", 169 | "step 20: merging \"i\" and \"t\"\n", 170 | "The bpe tokens are: \n", 171 | "W: 0\n", 172 | "c: 1\n", 173 | "[: 2\n", 174 | "b: 3\n", 175 | "w: 4\n", 176 | "a: 5\n", 177 | "s: 6\n", 178 | "d: 7\n", 179 | "m: 8\n", 180 | "T: 9\n", 181 | "f: 10\n", 182 | "y: 11\n", 183 | ": 12\n", 184 | "k: 13\n", 185 | "t: 14\n", 186 | "H: 15\n", 187 | "o: 16\n", 188 | "': 17\n", 189 | "O: 18\n", 190 | "p: 19\n", 191 | "D: 20\n", 192 | "e: 21\n", 193 | "B: 22\n", 194 | "n: 23\n", 195 | "i: 24\n", 196 | "]: 25\n", 197 | "h: 26\n", 198 | ",: 27\n", 199 | "u: 28\n", 200 | "6: 29\n", 201 | "l: 30\n", 202 | "g: 31\n", 203 | "v: 32\n", 204 | "r: 33\n", 205 | "er: 34\n", 206 | "s: 35\n", 207 | "e: 36\n", 208 | "en: 37\n", 209 | "d: 38\n", 210 | "her: 39\n", 211 | "ent: 40\n", 212 | "ed: 41\n", 213 | ",: 42\n", 214 | "her: 43\n", 215 | "n: 44\n", 216 | "pa: 45\n", 217 | "par: 46\n", 218 | "parent: 47\n", 219 | "en: 48\n", 220 | "he: 49\n", 221 | "as: 50\n", 222 | "se: 51\n", 223 | "ea: 52\n", 224 | "it: 53\n", 225 | "The ids of the tokenized sequence are: \n", 226 | "[22, 34, 8, 5, 23, 17, 35, 47, 35, 7, 24, 32, 16, 33, 1, 41, 4, 26, 48, 49, 4, 50, 51, 32, 48, 9, 39, 52, 10, 14, 34, 42, 49, 6, 19, 30, 53, 12, 14, 24, 8, 36, 3, 21, 14, 4, 21, 48, 52, 1, 26, 12, 47, 17, 35, 26, 16, 28, 51, 26, 16, 30, 38, 28, 23, 14, 24, 30, 12, 49, 40, 34, 41, 1, 16, 30, 30, 21, 31, 36, 2, 29, 25, 12, 15, 24, 35, 10, 5, 14, 43, 33, 21, 30, 16, 1, 5, 14, 41, 14, 16, 12, 20, 5, 30, 30, 50, 10, 16, 33, 12, 5, 12, 19, 16, 6, 53, 24, 16, 44, 50, 5, 12, 30, 16, 3, 3, 11, 24, 6, 14, 12, 16, 44, 3, 21, 26, 5, 30, 10, 12, 16, 10, 12, 10, 16, 16, 7, 51, 33, 32, 24, 1, 36, 3, 28, 6, 24, 23, 21, 6, 51, 6, 42, 4, 26, 24, 30, 36, 26, 24, 35, 8, 16, 14, 43, 8, 16, 32, 41, 3, 5, 1, 13, 12, 24, 44, 4, 53, 26, 12, 43, 47, 35, 24, 44, 0, 16, 16, 6, 14, 34, 42, 18, 26, 24, 16, 42, 5, 
23, 38, 3, 21, 1, 5, 8, 36, 5, 12, 14, 52, 1, 43, 14, 39, 36]\n", 227 | "\n", 228 | "The sequence corresponding to ids is: \n", 229 | "B er m a n ' s parent s d i v o r c ed w h en he w as se v en T her ea f t er , he s p l it t i m e b e t w e en ea c h parent ' s h o u se h o l d u n t i l he ent er ed c o l l e g e [ 6 ] H i s f a t her r e l o c a t ed t o D a l l as f o r a p o s it i o n as a l o b b y i s t o n b e h a l f o f f o o d se r v i c e b u s i n e s se s , w h i l e h i s m o t her m o v ed b a c k i n w it h her parent s i n W o o s t er , O h i o , a n d b e c a m e a t ea c her t her e\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "# Example usage: \n", 235 | "corpus = '''Berman's parents divorced when he was seven. \n", 236 | "Thereafter, he split time between each parent's household until he entered college.[6] \n", 237 | "His father relocated to Dallas for a position as a lobbyist on behalf of foodservice businesses, \n", 238 | "while his mother moved back in with her parents in Wooster, Ohio, and became a teacher there'''\n", 239 | "data = corpus.split('.') \n", 240 | " \n", 241 | "n = 20 # number of merge operations\n", 242 | "id_to_tokens, token_to_ids = byte_pair_encoding(data, n)\n", 243 | "\n", 244 | "token_ids = tokenize(data, token_to_ids)\n", 245 | "\n", 246 | "print(\"The bpe tokens are: \")\n", 247 | "for tk, tid in token_to_ids.items():\n", 248 | " print(\"{}: {}\".format(tk, tid))\n", 249 | "\n", 250 | "print(\"The ids of the tokenized sequence are: \")\n", 251 | "print(token_ids)\n", 252 | "print()\n", 253 | "print(\"The sequence corresponding to ids is: \")\n", 254 | "print(' '.join(id_to_tokens[tid] for tid in token_ids))" 255 | ] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "base", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.9.18" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | --------------------------------------------------------------------------------
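The tokenization notebook above builds BPE over space-delimited strings and merges pairs with a regular expression. As a complement, here is a minimal, self-contained sketch of the same merge loop written over tuples of symbols. It is illustrative only: the name simple_bpe, the toy corpus, and the "</w>" end-of-word marker are choices made for this sketch, not taken from the notebook.

# Minimal BPE merge-loop sketch (illustrative; independent of the notebook's helpers).
from collections import defaultdict

def simple_bpe(words, num_merges):
    """Learn `num_merges` BPE merges from a list of words."""
    # Represent each word as a tuple of symbols, with an end-of-word marker.
    vocab = defaultdict(int)
    for w in words:
        vocab[tuple(w) + ("</w>",)] += 1
    merges = []
    for _ in range(num_merges):
        # Count adjacent symbol pairs, weighted by word frequency.
        pairs = defaultdict(int)
        for symbols, freq in vocab.items():
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)
        # Rewrite every word, replacing each occurrence of the best pair with one merged symbol.
        new_vocab = defaultdict(int)
        for symbols, freq in vocab.items():
            merged = []
            i = 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    merged.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            new_vocab[tuple(merged)] += freq
        vocab = new_vocab
    return merges, vocab

# Toy usage: two merges over a four-word corpus.
merges, vocab = simple_bpe(["low", "low", "lower", "lowest"], num_merges=2)
print(merges)  # [('l', 'o'), ('lo', 'w')]
print(vocab)   # e.g. contains ('low', 'e', 'r', '</w>'): 1

Working on symbol tuples avoids the regex escaping that a space-delimited vocabulary needs, at the cost of a slightly longer rewrite step per merge.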